From 4003af42927f56801e9dae988e6dfed739d88025 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 1 May 2024 15:23:22 +1200 Subject: [PATCH 001/102] Initial tile script --- warp/tests/test_tile.py | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 warp/tests/test_tile.py diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py new file mode 100644 index 00000000..bc36d1fd --- /dev/null +++ b/warp/tests/test_tile.py @@ -0,0 +1,79 @@ +import numpy as np +import warp as wp + +wp.init() + +@wp.kernel +def gemm(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float), + C: wp.array2d(dtype=float)): + + # output index + i, j = wp.tid() + + sum = float(0.0) + + for k in range(0, A.shape[1]): + sum += A[i, k]*B[k, j] + + C[i, j] = sum + +TILE_M = wp.constant(16) +TILE_N = wp.constant(16) +TILE_K = wp.constant(8) + +@wp.kernel +def gemm_tiled(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float), + C: wp.array2d(dtype=float)): + + # output tile index + i, j = wp.tid() + + sum = wp.tile_zeros((TILE_M, TILE_N), dtype=wp.float32) + + M = A.shape[0] + N = A.shape[1] + K = B.shape[1] + + for k in range(0, K, TILE_K): + + a = wp.tile_load(A, i, j+k, TILE_M, TILE_K) + b = wp.tile_load(B, i+k, j, TILE_K, TILE_N) + + sum += wp.tile_matmul(a, b) + + wp.tile_store(C, i, j, TILE_M, TILE_N) + + +M = 240 +K = 80 +N = 350 + +rng = np.random.default_rng(42) +A = rng.random((M, K), dtype=np.float32) +B = rng.random((K, N), dtype=np.float32) +C = np.zeros((M, N), dtype=np.float32) + +A_wp = wp.array(A) +B_wp = wp.array(B) +C_wp = wp.array(C) + +iters = 100 + +with wp.ScopedTimer("NumPy"): + + for i in range(iters): + C = A@B + +wp.force_load() + +with wp.ScopedTimer("Warp", cuda_flags=wp.CUDA_TIMING_KERNEL): + + for i in range(iters): + wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) + + +print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) + + From fadd083dae036a91aa6eee126b7ae9931d56a30b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 3 May 2024 12:24:53 +1200 Subject: [PATCH 002/102] Working on tile tests + API --- warp/builtins.py | 164 ++++++++++++++++++++++++++++++++++++++++ warp/native/builtin.h | 1 + warp/native/tile.h | 92 ++++++++++++++++++++++ warp/tests/test_tile.py | 21 ++--- 4 files changed, 269 insertions(+), 9 deletions(-) create mode 100644 warp/native/tile.h diff --git a/warp/builtins.py b/warp/builtins.py index ea11e634..50d750e4 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1359,6 +1359,170 @@ def spatial_vector_constructor_value_func(arg_types, kwds, templates): group="Spatial Math", ) +# ------------------ +# Tile-based primitives +shared_memory_id = 0 + +def tile_zeros_value_func(arg_types, kwds, templates): + + # return generic type (for doc builds) + if arg_types is None: + return array_t(shape=(Any, Any), dtype=Scalar) + + if len(arg_types) > 0: + raise RuntimeError("tile_zero() args must be passed by keyword") + + if "m" not in kwds: + raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") + + if "n" not in kwds: + raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") + + if "dtype" not in kwds: + raise RuntimeError("'dtype' keyword argument must be specified when calling tile_zeros() function") + + m, n, dtype = kwds["m"], kwds["n"], kwds["dtype"] + + templates.append(m) + templates.append(n) + templates.append(dtype) + + global shared_memory_id + templates.append(shared_memory_id) + + shared_memory_id += 1 + + return 
array(dtype=dtype) + + + +add_builtin( + "tile_zeros", + input_types={"m": int, "n": int, "dtype": Scalar}, + value_func=tile_zeros_value_func, + variadic=True, + doc="Allocate a tile local block of zero'd memory", + group="Tile Primitives", + export=False, +) + +def tile_load_value_func(arg_types, kwds, templates): + + # return generic type (for doc builds) + if arg_types is None: + return array_t(shape=(Any, Any), dtype=Scalar) + + if len(arg_types) != 3: + raise RuntimeError("tile_load() requires 3 positional args") + + if not is_array(arg_types[0]): + raise RuntimeError("tile_load() argument 0 must be an array") + + if not type_is_int(arg_types[1]): + raise RuntimeError("tile_load() argument 1 must be an integer") + + if not type_is_int(arg_types[2]): + raise RuntimeError("tile_load() argument 1 must be an integer") + + if "m" not in kwds: + raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") + + if "n" not in kwds: + raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") + + m, n = kwds["m"], kwds["n"] + + templates.append(m) + templates.append(n) + templates.append(arg_types[0].dtype) + + global shared_memory_id + templates.append(shared_memory_id) + + shared_memory_id += 1 + + return array(dtype=arg_types[0].dtype) + + + +add_builtin( + "tile_load", + input_types={"a": array(dtype=Any), "x": int, "y": int, "m": int, "n": int}, + value_func=tile_load_value_func, + variadic=True, + doc="Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)", + group="Tile Primitives", + export=False, +) + +def tile_store_value_func(arg_types, kwds, templates): + + # return generic type (for doc builds) + if arg_types is None: + return None + + if len(arg_types) != 4: + raise RuntimeError("tile_store() requires 4 positional args") + + if not is_array(arg_types[0]): + raise RuntimeError("tile_store() argument 0 must be an array") + + if not type_is_int(arg_types[1]): + raise RuntimeError("tile_store() argument 1 must be an integer") + + if not type_is_int(arg_types[2]): + raise RuntimeError("tile_store() argument 2 must be an integer") + + if not is_array(arg_types[3]): + raise RuntimeError("tile_store() argument 3 must be an array") + + return None + + + +add_builtin( + "tile_store", + input_types={"a": array(dtype=Any), "x": int, "y": int, "m": int, "n": int}, + value_func=tile_store_value_func, + variadic=True, + doc="Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)", + group="Tile Primitives", + export=False, +) + + + +def tile_matmul_value_func(arg_types, kwds, templates): + + # return generic type (for doc builds) + if arg_types is None: + return None + + if len(arg_types) != 3: + raise RuntimeError("tile_matmul() requires 4 positional args") + + if not is_array(arg_types[0]): + raise RuntimeError("tile_matmul() argument 0 must be an array") + + if not is_array(arg_types[1]): + raise RuntimeError("tile_matmul() argument 1 must be an array") + + if not is_array(arg_types[2]): + raise RuntimeError("tile_matmul() argument 2 must be an array") + + return None + + +add_builtin( + "tile_matmul", + input_types={"a": array(dtype=Any), "b": array(dtype=Any), "out": array(dtype=Any)}, + value_func=tile_matmul_value_func, + variadic=True, + doc="Compute matrix product and accumulate out += a*b", + group="Tile Primitives", + export=False, +) + # --------------------------------- # Linear Algebra diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 
b2865788..97737567 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1558,3 +1558,4 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect #include "rand.h" #include "noise.h" #include "matnn.h" +#include "tile.h" \ No newline at end of file diff --git a/warp/native/tile.h b/warp/native/tile.h new file mode 100644 index 00000000..434799ec --- /dev/null +++ b/warp/native/tile.h @@ -0,0 +1,92 @@ +#pragma once + +#include "builtin.h" + +// #define WP_CONCAT(x, y) x ## y +// #define WP_SHARED_MEM(name, id) WP_CONCAT(name, id) + +// #define zero(a) memset(a, 0, sizeof(a)); + +// #define tile_zeros(a, b, dtype) [](){\ +// static dtype WP_SHARED_MEM(data_, __LINE__)[a][b]; \ +// zero(WP_SHARED_MEM(data_, __LINE__)); \ +// return array_tWP_SHARED_MEM(data_, __LINE__; )}() + +#if !defined(__CUDA_ARCH__) +#define __shared__ static +#endif + +namespace wp +{ + +// 2D tile zero +template +inline CUDA_CALLABLE array_t tile_zeros() +{ + __shared__ T data[M*N]; + + return array_t(data, M, N, nullptr); +} + +// 2D tile load +template +inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) +{ + const int length = M*N; + + __shared__ T data[length]; + + // cooperatively load the tile, using a block-stride iterator + // todo: use cub::BlockLoad or cg::memcpy_async()? + for (int t=threadIdx.y; t < length; t += blockDim.y) + { + data[t] = index(src, i*M + t/N, j*N + t%N); + } + + return array_t(data, M, N, nullptr); +} + +// 2D tile store +template +inline CUDA_CALLABLE array_t tile_store(const array_t& dest, const array_t& src, int i, int j) +{ + const int length = src.shape[0]*src.shape[1]; + + // cooperatively store the tile, using a block-stride iterator + // todo: use cub::BlockStore or cg::memcpy_async()? 
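+    // each thread walks the flattened M*N tile with a stride of blockDim.y;
+    // t/N and t%N recover the row and column inside the tile, which are then
+    // offset by the tile origin (i*M, j*N) in the destination array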
+ for (int t=threadIdx.y; t < length; t += blockDim.y) + { + index(dest, i*M + t/N, j*N + t%N, i) = src.data[t]; + } + + return array_t(data, M, N, nullptr); +} + + +// 2D gemm accumulate out += A*B +template +inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) +{ + const int length = out.shape[0]*out.shape[1]; + + for (int t=threadIdx.y; t < length; t += blockDim.y) + { + // compute output index + const int i = t%out.shape[0]; + const int j = t/out.shape[1]; + + T sum = T(0.0); + + for (int k=0; k < A.shape[1]; ++k) + { + sum += index(A, i, k)*index(B, k, j); + } + + index(out, i, j) += sum; + } +} + + + + +} // namespace wp \ No newline at end of file diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index bc36d1fd..137662f3 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -2,6 +2,8 @@ import warp as wp wp.init() +wp.set_module_options({"enable_backwards": False}) +wp.set_device("cuda:0") @wp.kernel def gemm(A: wp.array2d(dtype=float), @@ -30,20 +32,21 @@ def gemm_tiled(A: wp.array2d(dtype=float), # output tile index i, j = wp.tid() - sum = wp.tile_zeros((TILE_M, TILE_N), dtype=wp.float32) + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) M = A.shape[0] - N = A.shape[1] - K = B.shape[1] + N = B.shape[1] + K = A.shape[1] for k in range(0, K, TILE_K): - a = wp.tile_load(A, i, j+k, TILE_M, TILE_K) - b = wp.tile_load(B, i+k, j, TILE_K, TILE_N) + a = wp.tile_load(A, i, j+k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, i+k, j, m=TILE_K, n=TILE_N) - sum += wp.tile_matmul(a, b) + # sum += a*b + wp.tile_matmul(a, b, sum) - wp.tile_store(C, i, j, TILE_M, TILE_N) + wp.tile_store(C, i, j, sum) M = 240 @@ -66,9 +69,9 @@ def gemm_tiled(A: wp.array2d(dtype=float), for i in range(iters): C = A@B -wp.force_load() +#wp.force_load() -with wp.ScopedTimer("Warp", cuda_flags=wp.CUDA_TIMING_KERNEL): +with wp.ScopedTimer("Warp", cuda_flags=wp.TIMING_KERNEL): for i in range(iters): wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) From af9bae58a72f8874e3a0e86f18bd0daf5bac66d3 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 3 May 2024 16:15:49 +1200 Subject: [PATCH 003/102] Working tile-based GEMM, issues 1-block per-Warp logical thread, with tile size as an additional wp.launch() param --- warp/builtins.py | 4 +-- warp/codegen.py | 33 +++++++++++++++--- warp/context.py | 6 ++-- warp/native/array.h | 6 ++++ warp/native/tile.h | 50 ++++++++++++++++++--------- warp/native/warp.cu | 6 ++-- warp/native/warp.h | 2 +- warp/tests/test_tile.py | 76 ++++++++++++++++++++++++++++++++++------- 8 files changed, 142 insertions(+), 41 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 50d750e4..e1860363 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1383,9 +1383,9 @@ def tile_zeros_value_func(arg_types, kwds, templates): m, n, dtype = kwds["m"], kwds["n"], kwds["dtype"] + templates.append(dtype) templates.append(m) templates.append(n) - templates.append(dtype) global shared_memory_id templates.append(shared_memory_id) @@ -1432,9 +1432,9 @@ def tile_load_value_func(arg_types, kwds, templates): m, n = kwds["m"], kwds["n"] + templates.append(arg_types[0].dtype) templates.append(m) templates.append(n) - templates.append(arg_types[0].dtype) global shared_memory_id templates.append(shared_memory_id) diff --git a/warp/codegen.py b/warp/codegen.py index 1b9ccedb..a9972769 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -2289,14 +2289,38 @@ def get_node_source(adj, node): """ +# cuda_kernel_template = """ + +# 
extern "C" __global__ void {name}_cuda_kernel_forward( +# {forward_args}) +# {{ +# for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); +# _idx < dim.size; +# _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) +# {{ +# {forward_body} }} +# }} + +# extern "C" __global__ void {name}_cuda_kernel_backward( +# {reverse_args}) +# {{ +# for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); +# _idx < dim.size; +# _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) +# {{ +# {reverse_body} }} +# }} + +# """ + cuda_kernel_template = """ extern "C" __global__ void {name}_cuda_kernel_forward( {forward_args}) {{ - for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); + for (size_t _idx = static_cast(blockIdx.x); _idx < dim.size; - _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) + _idx += static_cast(gridDim.x)) {{ {forward_body} }} }} @@ -2304,15 +2328,16 @@ def get_node_source(adj, node): extern "C" __global__ void {name}_cuda_kernel_backward( {reverse_args}) {{ - for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); + for (size_t _idx = static_cast(blockIdx.x); _idx < dim.size; - _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) + _idx += static_cast(gridDim.x)) {{ {reverse_body} }} }} """ + cpu_kernel_template = """ void {name}_cpu_kernel_forward( diff --git a/warp/context.py b/warp/context.py index d9fca4d1..c590c822 100644 --- a/warp/context.py +++ b/warp/context.py @@ -2842,6 +2842,7 @@ def __init__(self): ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, + ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ] @@ -4232,6 +4233,7 @@ def launch( record_tape=True, record_cmd=False, max_blocks=0, + tile_size=1, ): """Launch a Warp kernel on the target device @@ -4352,7 +4354,7 @@ def pack_args(args, params, adjoint=False): ) runtime.core.cuda_launch_kernel( - device.context, hooks.backward, bounds.size, max_blocks, kernel_params, stream.cuda_stream + device.context, hooks.backward, bounds.size, max_blocks, tile_size, kernel_params, stream.cuda_stream ) else: @@ -4375,7 +4377,7 @@ def pack_args(args, params, adjoint=False): else: # launch runtime.core.cuda_launch_kernel( - device.context, hooks.forward, bounds.size, max_blocks, kernel_params, stream.cuda_stream + device.context, hooks.forward, bounds.size, max_blocks, tile_size, kernel_params, stream.cuda_stream ) try: diff --git a/warp/native/array.h b/warp/native/array.h index b0a43fc5..e9098c87 100644 --- a/warp/native/array.h +++ b/warp/native/array.h @@ -269,6 +269,12 @@ CUDA_CALLABLE inline size_t byte_offset(const array_t& arr, int i) template CUDA_CALLABLE inline size_t byte_offset(const array_t& arr, int i, int j) { + if (i < 0 || i >= arr.shape[0]) + printf("i: %d > arr.shape[0]: %d\n", i, arr.shape[0]); + + if (j < 0 || j >= arr.shape[1]) + printf("j: %d > arr.shape[1]: %d\n", j, arr.shape[1]); + assert(i >= 0 && i < arr.shape[0]); assert(j >= 0 && j < arr.shape[1]); diff --git a/warp/native/tile.h b/warp/native/tile.h index 434799ec..c6eab8f0 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -3,17 +3,21 @@ #include "builtin.h" // #define WP_CONCAT(x, y) x ## y -// #define WP_SHARED_MEM(name, id) WP_CONCAT(name, id) +// #define WP_TILE_SHARED_MEM(name, id) WP_CONCAT(name, id) // #define zero(a) memset(a, 0, sizeof(a)); // #define tile_zeros(a, b, dtype) [](){\ -// static dtype WP_SHARED_MEM(data_, __LINE__)[a][b]; \ -// 
zero(WP_SHARED_MEM(data_, __LINE__)); \ -// return array_tWP_SHARED_MEM(data_, __LINE__; )}() +// static dtype WP_TILE_SHARED_MEM(data_, __LINE__)[a][b]; \ +// zero(WP_TILE_SHARED_MEM(data_, __LINE__)); \ +// return array_tWP_TILE_SHARED_MEM(data_, __LINE__; )}() #if !defined(__CUDA_ARCH__) -#define __shared__ static +#define WP_TILE_SHARED static +#define WP_TILE_SYNC void +#else +#define WP_TILE_SHARED __shared__ +#define WP_TILE_SYNC __syncthreads #endif namespace wp @@ -23,8 +27,15 @@ namespace wp template inline CUDA_CALLABLE array_t tile_zeros() { - __shared__ T data[M*N]; + const int length = M*N; + + WP_TILE_SHARED T data[length]; + for (int t=threadIdx.x; t < length; t += blockDim.x) + { + data[t] = T(0.0); + } + return array_t(data, M, N, nullptr); } @@ -34,11 +45,11 @@ inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) { const int length = M*N; - __shared__ T data[length]; + WP_TILE_SHARED T data[length]; // cooperatively load the tile, using a block-stride iterator // todo: use cub::BlockLoad or cg::memcpy_async()? - for (int t=threadIdx.y; t < length; t += blockDim.y) + for (int t=threadIdx.x; t < length; t += blockDim.x) { data[t] = index(src, i*M + t/N, j*N + t%N); } @@ -48,18 +59,19 @@ inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) // 2D tile store template -inline CUDA_CALLABLE array_t tile_store(const array_t& dest, const array_t& src, int i, int j) +inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array_t& src) { - const int length = src.shape[0]*src.shape[1]; + const int M = src.shape[0]; + const int N = src.shape[1]; + + const int length = M*N; // cooperatively store the tile, using a block-stride iterator // todo: use cub::BlockStore or cg::memcpy_async()? - for (int t=threadIdx.y; t < length; t += blockDim.y) + for (int t=threadIdx.x; t < length; t += blockDim.x) { - index(dest, i*M + t/N, j*N + t%N, i) = src.data[t]; + index(dest, i*M + t/N, j*N + t%N) = src.data[t]; } - - return array_t(data, M, N, nullptr); } @@ -69,11 +81,13 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, { const int length = out.shape[0]*out.shape[1]; - for (int t=threadIdx.y; t < length; t += blockDim.y) + WP_TILE_SYNC(); + + for (int t=threadIdx.x; t < length; t += blockDim.x) { // compute output index - const int i = t%out.shape[0]; - const int j = t/out.shape[1]; + const int i = t/out.shape[1]; + const int j = t%out.shape[1]; T sum = T(0.0); @@ -84,6 +98,8 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, index(out, i, j) += sum; } + + WP_TILE_SYNC(); } diff --git a/warp/native/warp.cu b/warp/native/warp.cu index 07fa91de..f921a303 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -2787,14 +2787,14 @@ void* cuda_get_kernel(void* context, void* module, const char* name) return kernel; } -size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream) +size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int tile_size, void** args, void* stream) { ContextGuard guard(context); - const int block_dim = 256; + const int block_dim = tile_size; // CUDA specs up to compute capability 9.0 says the max x-dim grid is 2**31-1, so // grid_dim is fine as an int for the near future - int grid_dim = (dim + block_dim - 1)/block_dim; + int grid_dim = dim; if (max_blocks <= 0) { max_blocks = 2147483647; diff --git a/warp/native/warp.h b/warp/native/warp.h index 2c072b61..58dc5f9b 100644 --- 
a/warp/native/warp.h +++ b/warp/native/warp.h @@ -294,7 +294,7 @@ extern "C" WP_API void* cuda_load_module(void* context, const char* ptx); WP_API void cuda_unload_module(void* context, void* module); WP_API void* cuda_get_kernel(void* context, void* module, const char* name); - WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream); + WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int tile_size, void** args, void* stream); WP_API void cuda_set_context_restore_policy(bool always_restore); WP_API int cuda_get_context_restore_policy(); diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 137662f3..4383f428 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -1,10 +1,52 @@ import numpy as np import warp as wp +#wp.config.mode = "debug" + wp.init() -wp.set_module_options({"enable_backwards": False}) +wp.set_module_options({"enable_backward": False}) wp.set_device("cuda:0") + +wp.build.clear_kernel_cache() + +TILE_M = 8 +TILE_N = 4 + +@wp.kernel +def copy_tiled(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float)): + + # tile index + i, j = wp.tid() + + a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N) + wp.tile_store(B, i, j, a) + + +def test_copy_tiled(): + + rng = np.random.default_rng(42) + + M = TILE_M*7 + N = TILE_N*5 + + A = rng.random((M, N), dtype=np.float32) + B = rng.random((M, N), dtype=np.float32) + + A_wp = wp.array(A) + B_wp = wp.array(B) + + wp.launch(copy_tiled, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + + assert(np.allclose(A, B_wp.numpy(), rtol=1.e-4)) + + print("Copy passed") + + +#test_copy_tiled() + + @wp.kernel def gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), @@ -20,6 +62,8 @@ def gemm(A: wp.array2d(dtype=float), C[i, j] = sum + + TILE_M = wp.constant(16) TILE_N = wp.constant(16) TILE_K = wp.constant(8) @@ -38,10 +82,12 @@ def gemm_tiled(A: wp.array2d(dtype=float), N = B.shape[1] K = A.shape[1] - for k in range(0, K, TILE_K): + count = int(K / 16) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) - a = wp.tile_load(A, i, j+k, m=TILE_M, n=TILE_K) - b = wp.tile_load(B, i+k, j, m=TILE_K, n=TILE_N) + for k in range(count): + + a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) # sum += a*b wp.tile_matmul(a, b, sum) @@ -49,9 +95,9 @@ def gemm_tiled(A: wp.array2d(dtype=float), wp.tile_store(C, i, j, sum) -M = 240 -K = 80 -N = 350 +M = TILE_M*21 +K = TILE_K*7 +N = TILE_M*12 rng = np.random.default_rng(42) A = rng.random((M, K), dtype=np.float32) @@ -62,21 +108,27 @@ def gemm_tiled(A: wp.array2d(dtype=float), B_wp = wp.array(B) C_wp = wp.array(C) -iters = 100 +iters = 10 with wp.ScopedTimer("NumPy"): for i in range(iters): C = A@B -#wp.force_load() +wp.force_load(device="cuda:0") -with wp.ScopedTimer("Warp", cuda_flags=wp.TIMING_KERNEL): +with wp.ScopedTimer("Warp", cuda_filter=wp.TIMING_KERNEL): for i in range(iters): wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) - -print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) + + print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) + + for i in range(iters): + wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=256) + + + print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) From b98b7069d2c095247a84c0082e785be0be99379d Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 3 May 2024 16:19:49 +1200 Subject: [PATCH 004/102] Fix typo 
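The K-loop trip count has to agree with the TILE_K used by tile_load(); dividing by 16
only covered half of the K dimension. A minimal sketch of the intended relationship
(plain Python for illustration only, since the code-gen issue noted in the kernel still
prevents using the TILE_K constant directly in the loop bound):

    TILE_K = 8            # matches wp.constant(8) in the kernel
    K = TILE_K * 7        # the test sizes K as a multiple of TILE_K
    count = K // TILE_K   # number of (TILE_M x TILE_K) @ (TILE_K x TILE_N) panels accumulated
    assert count * TILE_K == K
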
--- warp/tests/test_tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 4383f428..f5e768fe 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -82,7 +82,7 @@ def gemm_tiled(A: wp.array2d(dtype=float), N = B.shape[1] K = A.shape[1] - count = int(K / 16) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) + count = int(K / 8) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) for k in range(count): From 0c196e4a4828c17374bb7a9e3e9029d2b0ad3a5b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 10 May 2024 11:49:36 +1200 Subject: [PATCH 005/102] Working tile/partition based GEMM, experiments with CUTLASS/CuTe --- warp/build_dll.py | 8 +- warp/config.py | 2 +- warp/examples/benchmarks/benchmark_tile.py | 182 ++++++++++++ warp/native/array.h | 9 +- warp/native/builtin.h | 6 +- warp/native/mat.h | 11 +- warp/native/tile.h | 307 ++++++++++++++++++++- warp/native/warp.cu | 11 +- warp/tests/test_tile.py | 32 ++- 9 files changed, 538 insertions(+), 30 deletions(-) create mode 100644 warp/examples/benchmarks/benchmark_tile.py diff --git a/warp/build_dll.py b/warp/build_dll.py index 25692261..6810d9c7 100644 --- a/warp/build_dll.py +++ b/warp/build_dll.py @@ -275,10 +275,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None cu_out = cu_path + ".o" if mode == "debug": - cuda_cmd = f'"{cuda_home}/bin/nvcc" --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -I"{nanovdb_home}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 --compiler-options=/MT,/Zi,/Od -g -G -O0 -DNDEBUG -D_ITERATOR_DEBUG_LEVEL=0 -I"{native_dir}" -I"{nanovdb_home}" -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' elif mode == "release": - cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 {" ".join(nvcc_opts)} -I"{native_dir}" -I"{nanovdb_home}" -DNDEBUG -DWP_ENABLE_CUDA=1 -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' with ScopedTimer("build_cuda", active=args.verbose): run_cmd(cuda_cmd) @@ -330,10 +330,10 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None cu_out = cu_path + ".o" if mode == "debug": - cuda_cmd = f'"{cuda_home}/bin/nvcc" -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' elif mode == "release": - cuda_cmd = f'"{cuda_home}/bin/nvcc" -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 
--compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' with ScopedTimer("build_cuda", active=args.verbose): run_cmd(cuda_cmd) diff --git a/warp/config.py b/warp/config.py index 221fc252..ef16adf4 100644 --- a/warp/config.py +++ b/warp/config.py @@ -25,7 +25,7 @@ None # preferred CUDA output format for kernels ("ptx" or "cubin"), determined automatically if unspecified ) -ptx_target_arch: int = 70 # target architecture for PTX generation, defaults to the lowest architecture that supports all of Warp's features +ptx_target_arch: int = 80 # target architecture for PTX generation, defaults to the lowest architecture that supports all of Warp's features enable_backward: bool = True # whether to compiler the backward passes of the kernels diff --git a/warp/examples/benchmarks/benchmark_tile.py b/warp/examples/benchmarks/benchmark_tile.py new file mode 100644 index 00000000..1918684a --- /dev/null +++ b/warp/examples/benchmarks/benchmark_tile.py @@ -0,0 +1,182 @@ +import numpy as np +import warp as wp + +import torch + +wp.init() +wp.set_module_options({"enable_backward": False, "fast_math": True}) +wp.set_device("cuda:0") + +wp.build.clear_kernel_cache() + +@wp.kernel +def gemm(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float), + C: wp.array2d(dtype=float)): + + # output index + i, j = wp.tid() + + sum = float(0.0) + + for k in range(0, A.shape[1]): + sum += A[i, k]*B[k, j] + + C[i, j] = sum + + + +TILE_M = wp.constant(64) +TILE_N = wp.constant(64) +TILE_K = wp.constant(8) + +@wp.kernel +def gemm_tiled(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float), + C: wp.array2d(dtype=float)): + + # output tile index + i, j = wp.tid() + + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + + M = A.shape[0] + N = B.shape[1] + K = A.shape[1] + + count = int(K / 8) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) + + for k in range(count): + + a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) + + # sum += a*b + wp.tile_matmul(a, b, sum) + + wp.tile_store(C, i, j, sum) + + +def benchmark_numpy(A, B, C): + + timers = {} + iters = 10 + + # warm up + for i in range(10): + C = A@B + + with wp.ScopedTimer("NumPy", dict=timers): + + for i in range(iters): + C = A@B + + return min(timers["NumPy"]) + + +def benchmark_warp_simt(A, B, C): + + timers = {} + iters = 10 + + A_wp = wp.array(A) + B_wp = wp.array(B) + C_wp = wp.array(C) + + # warm up + for i in range(10): + wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) + + with wp.ScopedTimer("Warp (SIMT)", dict=timers, print=False, synchronize=True): + + for i in range(iters): + wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) + + return min(timers["Warp (SIMT)"]) + + +def benchmark_warp_tiled(A, B, C): + + timers = {} + iters = 10 + + num_threads = 256#TILE_M*TILE_N + + A_wp = wp.array(A) + B_wp = wp.array(B) + C_wp = wp.array(C) + + # warm up + for i in range(10): + wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) + + with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True): + + for i in range(iters): + wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) + + wp.synchronize() + + return min(timers["Warp (Tiled)"]) + + +def benchmark_torch(A, B, C): + + A_tc = 
torch.from_numpy(A).to("cuda:0") + B_tc = torch.from_numpy(B).to("cuda:0") + C_tc = torch.from_numpy(C).to("cuda:0") + + # warm-up + for i in range(10): + torch.matmul(A_tc, B_tc, out=C_tc) + + timers = {} + iters = 10 + + torch.cuda.synchronize() + + with wp.ScopedTimer("Torch", dict=timers, print=False): + + for i in range(iters): + torch.matmul(A_tc, B_tc)#, out=C_tc) + + torch.cuda.synchronize() + + return min(timers["Torch"]) + + + +results_torch = [] +results_warp_simt = [] +results_warp_tiled = [] + +print("{:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}".format("M", "N", "K", "Torch", "Warp (SIMT)", "Warp (Tiled)")) +print("--------------------------------------------------------") + +for i in range(2, 33): + + M = i*128 + N = M + K = N + + # M = TILE_M*21 + # K = TILE_K*7 + # N = TILE_M*12 + + rng = np.random.default_rng(42) + + A = rng.random((M, K), dtype=np.float32) + B = rng.random((K, N), dtype=np.float32) + C = np.zeros((M, N), dtype=np.float32) + + results_torch.append(benchmark_torch(A, B, C)) + results_warp_simt.append(0.0)#benchmark_warp_simt(A, B, C)) + results_warp_tiled.append(benchmark_warp_tiled(A, B, C)) + + print("{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}".format(M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1])) + + + + + + diff --git a/warp/native/array.h b/warp/native/array.h index e9098c87..e1acebcf 100644 --- a/warp/native/array.h +++ b/warp/native/array.h @@ -269,11 +269,12 @@ CUDA_CALLABLE inline size_t byte_offset(const array_t& arr, int i) template CUDA_CALLABLE inline size_t byte_offset(const array_t& arr, int i, int j) { - if (i < 0 || i >= arr.shape[0]) - printf("i: %d > arr.shape[0]: %d\n", i, arr.shape[0]); + // if (i < 0 || i >= arr.shape[0]) + // printf("i: %d > arr.shape[0]: %d\n", i, arr.shape[0]); + + // if (j < 0 || j >= arr.shape[1]) + // printf("j: %d > arr.shape[1]: %d\n", j, arr.shape[1]); - if (j < 0 || j >= arr.shape[1]) - printf("j: %d > arr.shape[1]: %d\n", j, arr.shape[1]); assert(i >= 0 && i < arr.shape[0]); assert(j >= 0 && j < arr.shape[1]); diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 97737567..682230dd 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1558,4 +1558,8 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect #include "rand.h" #include "noise.h" #include "matnn.h" -#include "tile.h" \ No newline at end of file + +// only include in kernels for now +#if defined(__CUDACC_RTC__) +#include "tile.h" +#endif \ No newline at end of file diff --git a/warp/native/mat.h b/warp/native/mat.h index f12733e9..56f86624 100644 --- a/warp/native/mat.h +++ b/warp/native/mat.h @@ -518,13 +518,18 @@ inline CUDA_CALLABLE mat_t mul(const mat_t& a { mat_t t(0); for (unsigned i=0; i < Rows; ++i) - { - for (unsigned j=0; j < ColsOut; ++j) + { + for (unsigned j=0; j < ColsOut; ++j) { + Type sum(0.0); + for (unsigned k=0; k < Cols; ++k) { - t.data[i][j] += a.data[i][k]*b.data[k][j]; + //t.data[i][j] += a.data[i][k]*b.data[k][j]; + sum = fmaf(a.data[i][k], b.data[k][j], sum); } + + t.data[i][j] = sum; } } diff --git a/warp/native/tile.h b/warp/native/tile.h index c6eab8f0..5becab3d 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -2,6 +2,10 @@ #include "builtin.h" +#include "cuda_pipeline_primitives.h" + +//#include "cutlass/include/cute/tensor.hpp" + // #define WP_CONCAT(x, y) x ## y // #define WP_TILE_SHARED_MEM(name, id) WP_CONCAT(name, id) @@ -20,17 +24,118 @@ #define WP_TILE_SYNC __syncthreads #endif + namespace wp { +// 
CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. +#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) + #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) + #define WP_PRAGMA_UNROLL _Pragma("unroll") + #define WP_PRAGMA_NO_UNROLL _Pragma("unroll 1") + #else + #define WP_PRAGMA_UNROLL #pragma unroll + #define WP_PRAGMA_NO_UNROLL #pragma unroll 1 + #endif + +#else + + #define WP_PRAGMA_UNROLL + #define WP_PRAGMA_NO_UNROLL + +#endif + +#if 0 +template + +CUDA_CALLABLE inline void +gemm_device(TA const* smemA, ASmemLayout sA_layout, AThreadLayout tA, + TB const* smemB, BSmemLayout sB_layout, BThreadLayout tB, + TC * smemC, CSmemLayout sC_layout, CThreadLayout tC) +{ + using namespace cute; + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + + static_assert(is_static::value); + static_assert(is_static::value); + static_assert(is_static::value); + + + Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) + Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) + Tensor sC = make_tensor(make_smem_ptr(smemC), sC_layout); // (BLK_M,BLK_K) + + + Tensor tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) + Tensor tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) + + + // Partition sA (M,K) by the rows of tC + Tensor tCsA = local_partition(sA, tC, threadIdx.x, Step{}); // (THR_M,BLK_K) + // Partition sB (K,M) by the rows of tC + Tensor tCsB = local_partition(sB, tC, threadIdx.x, Step<_1, X>{}); // (THR_N,BLK_K) + + // Partition gC (M,N) by the tile of tC + Tensor tCsC = local_partition(sC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) + + // Allocate the accumulators -- same shape/layout as the partitioned data + Tensor tCrC = make_tensor_like(tCsC); // (THR_M,THR_N) + + //******************* + // MM-QUESTION: this is not quite right, we need a 3d shape, but should we use local_partition or local_tile? + auto K_TILE_MAX = 1;//size<2>(tAsA); + + // ensure smem is ready + __syncthreads(); + + if (threadIdx.x == 0 && blockIdx.x == 0) + { + print(sA); printf("\n"); + print(sB); printf("\n"); + print(sC); printf("\n"); + + print(tCsA); printf("\n"); + print(tCsB); printf("\n"); + print(tCsC); printf("\n"); + } + + for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile) + { + // Copy gmem to smem with tA|tB thread-partitioned tensors + // copy(tAgA(_,_,k_tile), tAsA); // A (THR_M,THR_K) -> (THR_M,THR_K) + // copy(tBgB(_,_,k_tile), tBsB); // B (THR_N,THR_K) -> (THR_N,THR_K) + + //******************* + // MM-QUESTION: how to 'advance' tCsA and tCsB to next tile in smem instead of above copy from global? + gemm(tCsA, tCsB, tCrC); + } + + CUTE_UNROLL + for (int i = 0; i < size(tCsA); ++i) { + tCsC(i) += tCrC(i); + } + + // ensure writes to shared are visible + __syncthreads(); +} + +#endif + // 2D tile zero template inline CUDA_CALLABLE array_t tile_zeros() { const int length = M*N; - WP_TILE_SHARED T data[length]; + WP_TILE_SHARED __align__(16) T data[length]; + WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) { data[t] = T(0.0); @@ -45,14 +150,30 @@ inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) { const int length = M*N; - WP_TILE_SHARED T data[length]; + WP_TILE_SHARED __align__(16) T data[length]; // cooperatively load the tile, using a block-stride iterator // todo: use cub::BlockLoad or cg::memcpy_async()? 
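    // the async variant below stages the tile into shared memory with
    // __pipeline_memcpy_async() (4 elements, i.e. 128 bits per request for float)
    // and fences the copies with __pipeline_commit()/__pipeline_wait_prior(0)
    // before the data is read back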
- for (int t=threadIdx.x; t < length; t += blockDim.x) + + // WP_PRAGMA_UNROLL + // for (int t=threadIdx.x; t < length; t += blockDim.x) + // { + // data[t] = index(src, i*M + t/N, j*N + t%N); + // } + + // // async copies + WP_PRAGMA_UNROLL + for (int t=threadIdx.x*4; t < length; t += blockDim.x*4) { - data[t] = index(src, i*M + t/N, j*N + t%N); + //data[t] = index(src, i*M + t/N, j*N + t%N); + __pipeline_memcpy_async(&data[t], + &index(src, i*M + t/N, j*N + t%N), + sizeof(T)*4); } + + __pipeline_commit(); + __pipeline_wait_prior(0); + return array_t(data, M, N, nullptr); } @@ -68,35 +189,202 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array // cooperatively store the tile, using a block-stride iterator // todo: use cub::BlockStore or cg::memcpy_async()? + WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) { index(dest, i*M + t/N, j*N + t%N) = src.data[t]; } } +// template +// inline CUDA_CALLABLE void tile_matmul_cute(const array_t& A, const array_t& B, const array_t& out) +// { +// using namespace cute; + +// // Define CTA matrix size (static) + +// auto bM = Int<64>{}; +// auto bN = Int<64>{}; +// auto bK = Int<8>{}; + +// auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) + +// // Define the smem layouts (static) +// auto sA = make_layout(make_shape(bM,bK), LayoutRight{}); +// auto sB = make_layout(make_shape(bN,bK)); +// auto sC = make_layout(make_shape(bM, bN), LayoutRight{}); + +// // Define the thread layouts (static) +// auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{}), LayoutRight{}); +// auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{}), LayoutRight{}); +// auto tC = make_layout(make_shape(Int<16>{}, Int<16>{}), LayoutRight{}); + +// gemm_device +// (A.data, sA, tA, +// B.data, sB, tB, +// out.data,sC, tC); +// } + + +template +inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride) +{ + return p[i*stride + j]; +} + +template +inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride) +{ + return p[i*stride + j]; +} + +template +struct partition_t +{ + partition_t(array_t A) + { + data = A; + + // todo: do ceil div for non-multiples of M,N + shape[0] = A.shape[0]/M; + shape[1] = A.shape[1]/N; + } + + // underlying data + array_t data; + + // partition dimensions + int shape[2]; +}; + +template +int partition_size(const partition_t& tile) +{ + return tile.shape[0]*tile.shape[1]; +} + +// returns the x, y coordinates of a tile given a linear index +template +void partition_coord(const partition_t& tile, const int t, int& i, int& j) +{ + i = t/tile.shape[1]; + j = t%tile.shape[1]; +} + +template +mat_t partition_load(const partition_t& tile, int i, int j) +{ + mat_t out; + + const int tile_i = i*M; + const int tile_j = j*N; + + // WP_PRAGMA_UNROLL + // for (int i=0; i < M; ++i) + // { + // WP_PRAGMA_UNROLL + // for (int j=0; j < N; ++j) + // { + // out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); + // } + // } + + + return out; +} + +template +void partition_store(const partition_t& tile, int i, int j, const mat_t& value) +{ + mat_t out; + + const int tile_i = M*i; + const int tile_j = N*j; + + WP_PRAGMA_UNROLL + for (int i=0; i < M; ++i) + { + WP_PRAGMA_UNROLL + for (int j=0; j < N; ++j) + { + index(tile.data, tile_i + i, tile_j + j) = value.data[i][j]; + } + } +} + -// 2D gemm accumulate out += A*B template inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) +{ + const int TILE_M = 4; + const int TILE_N = 
4; + const int TILE_K = 4; + + partition_t A_tile = partition_t(A); + partition_t B_tile = partition_t(B); + partition_t C_tile = partition_t(out); + + const int length = partition_size(C_tile); + + WP_TILE_SYNC(); + + WP_PRAGMA_UNROLL + for (int t=threadIdx.x; t < length; t += blockDim.x) + { + int i, j; + partition_coord(C_tile, t, i, j); + + // accumulator + mat_t sum = partition_load(C_tile, i, j); + + WP_PRAGMA_UNROLL + for (int k=0; k < A_tile.shape[1]; ++k) + { + mat_t a = partition_load(A_tile, i, k); + mat_t b = partition_load(B_tile, k, j); + + sum += mul(a, b); + } + + partition_store(C_tile, i, j, sum); + } + + WP_TILE_SYNC(); +} + + + +// 2D gemm accumulate out += A*B +template +inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t& B, const array_t& out) { const int length = out.shape[0]*out.shape[1]; WP_TILE_SYNC(); + const T* __restrict__ A_ptr = A.data; + const T* __restrict__ B_ptr = B.data; + T* __restrict__ C_ptr = out.data; + + WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) { // compute output index const int i = t/out.shape[1]; const int j = t%out.shape[1]; - T sum = T(0.0); + T sum(0.0); + WP_PRAGMA_UNROLL for (int k=0; k < A.shape[1]; ++k) { - sum += index(A, i, k)*index(B, k, j); - } + T a = index(A_ptr, i, k, A.shape[1]); + T b = index(B_ptr, k, j, B.shape[1]); - index(out, i, j) += sum; + sum = fmaf(a, b, sum); + } + + index(C_ptr, i, j, out.shape[1]) += sum; } WP_TILE_SYNC(); @@ -104,5 +392,4 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, - } // namespace wp \ No newline at end of file diff --git a/warp/native/warp.cu b/warp/native/warp.cu index f921a303..eac06ebc 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -2536,7 +2536,7 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ std::vector opts; opts.push_back(arch_opt); opts.push_back(include_opt); - opts.push_back("--std=c++11"); + opts.push_back("--std=c++17"); if (debug) { @@ -2556,6 +2556,15 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ if (fast_math) opts.push_back("--use_fast_math"); + char include_cutlass[max_path]; + sprintf(include_cutlass, "--include-path=%s/cutlass/include", include_dir); + opts.push_back(include_cutlass); + + //opts.push_back("--include-path=_build/target-deps/cuda/include"); + opts.push_back("--include-path=C:\\packman-repo\\chk\\cuda\\11.8.0_522.06-abe3d9d7-windows-x86_64\\include"); + + opts.push_back("--device-as-default-execution-space"); + nvrtcProgram prog; nvrtcResult res; diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index f5e768fe..921f269e 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -1,6 +1,8 @@ import numpy as np import warp as wp +import torch + #wp.config.mode = "debug" wp.init() @@ -64,8 +66,8 @@ def gemm(A: wp.array2d(dtype=float), -TILE_M = wp.constant(16) -TILE_N = wp.constant(16) +TILE_M = wp.constant(64) +TILE_N = wp.constant(64) TILE_K = wp.constant(8) @wp.kernel @@ -95,9 +97,9 @@ def gemm_tiled(A: wp.array2d(dtype=float), wp.tile_store(C, i, j, sum) -M = TILE_M*21 -K = TILE_K*7 -N = TILE_M*12 +M = TILE_M*7 +K = TILE_K*4 +N = TILE_N*6 rng = np.random.default_rng(42) A = rng.random((M, K), dtype=np.float32) @@ -126,9 +128,27 @@ def gemm_tiled(A: wp.array2d(dtype=float), print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) for i in range(iters): - wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=256) + wp.launch(gemm_tiled, 
dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=128) + wp.synchronize() print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) +A_tc = torch.from_numpy(A).to("cuda:0") +B_tc = torch.from_numpy(B).to("cuda:0") +C_tc = torch.from_numpy(C).to("cuda:0") + +for i in range(10): + torch.matmul(A_tc, B_tc, out=C_tc) + +with wp.ScopedTimer("Torch"): + + for i in range(iters): + torch.matmul(A_tc, B_tc, out=C_tc) + + torch.cuda.synchronize() + + + + From 054155b1c4b9b8b23380120103622195799f529e Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 10 May 2024 11:51:21 +1200 Subject: [PATCH 006/102] Re-enable partition load, currently at 2.8ms for 1024x1024 fp32 GEMM --- warp/native/tile.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index 5becab3d..7c1c45c0 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -279,15 +279,15 @@ mat_t partition_load(const partition_t& tile, int i, int j) const int tile_i = i*M; const int tile_j = j*N; - // WP_PRAGMA_UNROLL - // for (int i=0; i < M; ++i) - // { - // WP_PRAGMA_UNROLL - // for (int j=0; j < N; ++j) - // { - // out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); - // } - // } + WP_PRAGMA_UNROLL + for (int i=0; i < M; ++i) + { + WP_PRAGMA_UNROLL + for (int j=0; j < N; ++j) + { + out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); + } + } return out; From 560d19408e1fff47a4e791f4802bebc1dddc8654 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 13 May 2024 15:50:37 +1200 Subject: [PATCH 007/102] Some more experiments with coalesced loads from shared memory --- build_lib.py | 1 + warp/native/tile.h | 47 ++++++++++++++++++++++++++++++--------------- warp/native/warp.cu | 2 ++ 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/build_lib.py b/build_lib.py index 7a33bd8b..e781f025 100644 --- a/build_lib.py +++ b/build_lib.py @@ -52,6 +52,7 @@ parser.set_defaults(fast_math=False) parser.add_argument("--quick", action="store_true", help="Only generate PTX code, disable CUTLASS ops") +parser.set_defaults(quick=True) parser.add_argument("--build_llvm", action="store_true", help="Build Clang/LLVM compiler from source, default disabled") parser.add_argument("--no_build_llvm", dest="build_llvm", action="store_false") diff --git a/warp/native/tile.h b/warp/native/tile.h index 7c1c45c0..d2c55ff7 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -161,20 +161,21 @@ inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) // data[t] = index(src, i*M + t/N, j*N + t%N); // } - // // async copies + // // async copies (assumes row-major i.e.: stride 1 on y axis) + const int s = 4; + WP_PRAGMA_UNROLL - for (int t=threadIdx.x*4; t < length; t += blockDim.x*4) + for (int t=threadIdx.x*s; t < length; t += blockDim.x*s) { //data[t] = index(src, i*M + t/N, j*N + t%N); __pipeline_memcpy_async(&data[t], &index(src, i*M + t/N, j*N + t%N), - sizeof(T)*4); + sizeof(T)*s); } __pipeline_commit(); - __pipeline_wait_prior(0); - + return array_t(data, M, N, nullptr); } @@ -187,7 +188,7 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array const int length = M*N; - // cooperatively store the tile, using a block-stride iterator + // cooperatively store the tile, using a block-stride iterator // todo: use cub::BlockStore or cg::memcpy_async()? 
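    // consecutive threads write consecutive columns (t%N), so for a row-major
    // destination the global stores coalesce; WP_PRAGMA_UNROLL expands to
    // _Pragma("unroll") under NVRTC and to nothing on host builds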
WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) @@ -241,7 +242,7 @@ inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride) template struct partition_t { - partition_t(array_t A) + inline partition_t(array_t A) { data = A; @@ -258,21 +259,21 @@ struct partition_t }; template -int partition_size(const partition_t& tile) +inline int partition_size(const partition_t& tile) { return tile.shape[0]*tile.shape[1]; } // returns the x, y coordinates of a tile given a linear index template -void partition_coord(const partition_t& tile, const int t, int& i, int& j) +inline void partition_coord(const partition_t& tile, const int t, int& i, int& j) { i = t/tile.shape[1]; j = t%tile.shape[1]; } template -mat_t partition_load(const partition_t& tile, int i, int j) +inline mat_t partition_load(const partition_t& tile, int i, int j) { mat_t out; @@ -288,13 +289,28 @@ mat_t partition_load(const partition_t& tile, int i, int j) out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); } } + + // Specialization for when N = 4 and assumes data was swizzled into 4x4 blocks + // during tile_load(), this results in zero bank conflicts + 128 bit loads + + // const int tile_index = i*N + j; + // const int tile_count = partition_size(tile); + + // float4* out4 = (float4*)(&out.data[0][0]); + + // WP_PRAGMA_UNROLL + // for (int t=0; t < M; t += 4) + // { + // out4[t] = ((float4*)(tile.data.data))[t*tile_count + tile_index]; + // } + return out; } template -void partition_store(const partition_t& tile, int i, int j, const mat_t& value) +inline void partition_store(const partition_t& tile, int i, int j, const mat_t& value) { mat_t out; @@ -326,9 +342,10 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const int length = partition_size(C_tile); + __pipeline_wait_prior(0); + WP_TILE_SYNC(); - WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) { int i, j; @@ -338,10 +355,10 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, mat_t sum = partition_load(C_tile, i, j); WP_PRAGMA_UNROLL - for (int k=0; k < A_tile.shape[1]; ++k) + for (int k=0; k < A_tile.shape[1]; k++) { - mat_t a = partition_load(A_tile, i, k); - mat_t b = partition_load(B_tile, k, j); + const mat_t a = partition_load(A_tile, i, k); + const mat_t b = partition_load(B_tile, k, j); sum += mul(a, b); } diff --git a/warp/native/warp.cu b/warp/native/warp.cu index eac06ebc..90468965 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -2564,6 +2564,8 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ opts.push_back("--include-path=C:\\packman-repo\\chk\\cuda\\11.8.0_522.06-abe3d9d7-windows-x86_64\\include"); opts.push_back("--device-as-default-execution-space"); + opts.push_back("--extra-device-vectorization"); + opts.push_back("--restrict"); nvrtcProgram prog; From 8f1aed1b537dc4c267492c1c2b2f22d2de16c4d4 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 13 May 2024 15:51:06 +1200 Subject: [PATCH 008/102] Use CUDA graphs in benchmark_tile.py --- warp/examples/benchmarks/benchmark_tile.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/warp/examples/benchmarks/benchmark_tile.py b/warp/examples/benchmarks/benchmark_tile.py index 1918684a..fc5900fe 100644 --- a/warp/examples/benchmarks/benchmark_tile.py +++ b/warp/examples/benchmarks/benchmark_tile.py @@ -100,22 +100,31 @@ def benchmark_warp_tiled(A, B, C): timers = {} iters = 10 - num_threads = 256#TILE_M*TILE_N + # 
must match with the tile_matmul() partition size + SUB_TILE_M = 4 + SUB_TILE_N = 4 + + num_threads = int(TILE_M/SUB_TILE_M)*int(TILE_N/SUB_TILE_N); A_wp = wp.array(A) B_wp = wp.array(B) C_wp = wp.array(C) # warm up + wp.capture_begin() + for i in range(10): wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) + graph = wp.capture_end() + + with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True): - for i in range(iters): - wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) + #for i in range(iters): + # wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) + wp.capture_launch(graph) - wp.synchronize() return min(timers["Warp (Tiled)"]) @@ -155,6 +164,8 @@ def benchmark_torch(A, B, C): for i in range(2, 33): +#for i in range(8,9): + M = i*128 N = M K = N From 58a834ce7ba6486b9e73aa1dafc71dc51dabf8da Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 22 May 2024 15:39:47 +1200 Subject: [PATCH 009/102] Remove some experiments with CuTe and clean up some dead code --- warp/native/tile.h | 279 ++++++++++------------------------------ warp/tests/test_tile.py | 2 +- 2 files changed, 71 insertions(+), 210 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index d2c55ff7..df1f8ff1 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -2,20 +2,9 @@ #include "builtin.h" +// todo: requires CTK, replace with inline ptx #include "cuda_pipeline_primitives.h" -//#include "cutlass/include/cute/tensor.hpp" - -// #define WP_CONCAT(x, y) x ## y -// #define WP_TILE_SHARED_MEM(name, id) WP_CONCAT(name, id) - -// #define zero(a) memset(a, 0, sizeof(a)); - -// #define tile_zeros(a, b, dtype) [](){\ -// static dtype WP_TILE_SHARED_MEM(data_, __LINE__)[a][b]; \ -// zero(WP_TILE_SHARED_MEM(data_, __LINE__)); \ -// return array_tWP_TILE_SHARED_MEM(data_, __LINE__; )}() - #if !defined(__CUDA_ARCH__) #define WP_TILE_SHARED static #define WP_TILE_SYNC void @@ -45,87 +34,6 @@ namespace wp #endif -#if 0 -template - -CUDA_CALLABLE inline void -gemm_device(TA const* smemA, ASmemLayout sA_layout, AThreadLayout tA, - TB const* smemB, BSmemLayout sB_layout, BThreadLayout tB, - TC * smemC, CSmemLayout sC_layout, CThreadLayout tC) -{ - using namespace cute; - - static_assert(is_static::value); - static_assert(is_static::value); - static_assert(is_static::value); - - - static_assert(is_static::value); - static_assert(is_static::value); - static_assert(is_static::value); - - - Tensor sA = make_tensor(make_smem_ptr(smemA), sA_layout); // (BLK_M,BLK_K) - Tensor sB = make_tensor(make_smem_ptr(smemB), sB_layout); // (BLK_N,BLK_K) - Tensor sC = make_tensor(make_smem_ptr(smemC), sC_layout); // (BLK_M,BLK_K) - - - Tensor tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) - Tensor tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) - - - // Partition sA (M,K) by the rows of tC - Tensor tCsA = local_partition(sA, tC, threadIdx.x, Step{}); // (THR_M,BLK_K) - // Partition sB (K,M) by the rows of tC - Tensor tCsB = local_partition(sB, tC, threadIdx.x, Step<_1, X>{}); // (THR_N,BLK_K) - - // Partition gC (M,N) by the tile of tC - Tensor tCsC = local_partition(sC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) - - // Allocate the accumulators -- same shape/layout as the partitioned data - Tensor tCrC = make_tensor_like(tCsC); // (THR_M,THR_N) - - //******************* - // MM-QUESTION: this is not quite right, 
we need a 3d shape, but should we use local_partition or local_tile? - auto K_TILE_MAX = 1;//size<2>(tAsA); - - // ensure smem is ready - __syncthreads(); - - if (threadIdx.x == 0 && blockIdx.x == 0) - { - print(sA); printf("\n"); - print(sB); printf("\n"); - print(sC); printf("\n"); - - print(tCsA); printf("\n"); - print(tCsB); printf("\n"); - print(tCsC); printf("\n"); - } - - for (int k_tile = 0; k_tile < K_TILE_MAX; ++k_tile) - { - // Copy gmem to smem with tA|tB thread-partitioned tensors - // copy(tAgA(_,_,k_tile), tAsA); // A (THR_M,THR_K) -> (THR_M,THR_K) - // copy(tBgB(_,_,k_tile), tBsB); // B (THR_N,THR_K) -> (THR_N,THR_K) - - //******************* - // MM-QUESTION: how to 'advance' tCsA and tCsB to next tile in smem instead of above copy from global? - gemm(tCsA, tCsB, tCrC); - } - - CUTE_UNROLL - for (int i = 0; i < size(tCsA); ++i) { - tCsC(i) += tCrC(i); - } - - // ensure writes to shared are visible - __syncthreads(); -} - -#endif // 2D tile zero template @@ -152,22 +60,22 @@ inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) WP_TILE_SHARED __align__(16) T data[length]; - // cooperatively load the tile, using a block-stride iterator - // todo: use cub::BlockLoad or cg::memcpy_async()? - + //--------------- + // naive-synchronous load + // // WP_PRAGMA_UNROLL // for (int t=threadIdx.x; t < length; t += blockDim.x) // { // data[t] = index(src, i*M + t/N, j*N + t%N); // } - // // async copies (assumes row-major i.e.: stride 1 on y axis) - const int s = 4; + //--------------- + // async 128 bit loads (assumes row-major i.e.: stride 1 on y axis and 4-element alignment on dimension) + const int s = 4; WP_PRAGMA_UNROLL for (int t=threadIdx.x*s; t < length; t += blockDim.x*s) { - //data[t] = index(src, i*M + t/N, j*N + t%N); __pipeline_memcpy_async(&data[t], &index(src, i*M + t/N, j*N + t%N), sizeof(T)*s); @@ -188,8 +96,7 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array const int length = M*N; - // cooperatively store the tile, using a block-stride iterator - // todo: use cub::BlockStore or cg::memcpy_async()? 
+ // cooperatively store the tile, using a block-stride iterator WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) { @@ -197,36 +104,6 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array } } -// template -// inline CUDA_CALLABLE void tile_matmul_cute(const array_t& A, const array_t& B, const array_t& out) -// { -// using namespace cute; - -// // Define CTA matrix size (static) - -// auto bM = Int<64>{}; -// auto bN = Int<64>{}; -// auto bK = Int<8>{}; - -// auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M, BLK_N, BLK_K) - -// // Define the smem layouts (static) -// auto sA = make_layout(make_shape(bM,bK), LayoutRight{}); -// auto sB = make_layout(make_shape(bN,bK)); -// auto sC = make_layout(make_shape(bM, bN), LayoutRight{}); - -// // Define the thread layouts (static) -// auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{}), LayoutRight{}); -// auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{}), LayoutRight{}); -// auto tC = make_layout(make_shape(Int<16>{}, Int<16>{}), LayoutRight{}); - -// gemm_device -// (A.data, sA, tA, -// B.data, sB, tB, -// out.data,sC, tC); -// } - - template inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride) { @@ -242,103 +119,87 @@ inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride) template struct partition_t { - inline partition_t(array_t A) - { - data = A; - - // todo: do ceil div for non-multiples of M,N - shape[0] = A.shape[0]/M; - shape[1] = A.shape[1]/N; - } - - // underlying data - array_t data; - - // partition dimensions - int shape[2]; + inline partition_t(array_t A) + { + data = A; + + // todo: do ceil div for non-multiples of M,N + shape[0] = A.shape[0]/M; + shape[1] = A.shape[1]/N; + } + + // underlying data + array_t data; + + // partition dimensions + int shape[2]; }; template inline int partition_size(const partition_t& tile) { - return tile.shape[0]*tile.shape[1]; + return tile.shape[0]*tile.shape[1]; } // returns the x, y coordinates of a tile given a linear index template inline void partition_coord(const partition_t& tile, const int t, int& i, int& j) { - i = t/tile.shape[1]; - j = t%tile.shape[1]; + i = t/tile.shape[1]; + j = t%tile.shape[1]; } template inline mat_t partition_load(const partition_t& tile, int i, int j) { - mat_t out; - - const int tile_i = i*M; - const int tile_j = j*N; - - WP_PRAGMA_UNROLL - for (int i=0; i < M; ++i) - { - WP_PRAGMA_UNROLL - for (int j=0; j < N; ++j) - { - out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); - } - } - - // Specialization for when N = 4 and assumes data was swizzled into 4x4 blocks - // during tile_load(), this results in zero bank conflicts + 128 bit loads - - // const int tile_index = i*N + j; - // const int tile_count = partition_size(tile); - - // float4* out4 = (float4*)(&out.data[0][0]); - - // WP_PRAGMA_UNROLL - // for (int t=0; t < M; t += 4) - // { - // out4[t] = ((float4*)(tile.data.data))[t*tile_count + tile_index]; - // } - - - - return out; + mat_t out; + + const int tile_i = i*M; + const int tile_j = j*N; + + WP_PRAGMA_UNROLL + for (int i=0; i < M; ++i) + { + WP_PRAGMA_UNROLL + for (int j=0; j < N; ++j) + { + out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); + } + } + + return out; } template inline void partition_store(const partition_t& tile, int i, int j, const mat_t& value) { - mat_t out; - - const int tile_i = M*i; - const int tile_j = N*j; - - WP_PRAGMA_UNROLL - for (int i=0; i < M; ++i) - { - WP_PRAGMA_UNROLL - for (int j=0; j < N; ++j) - { 
- index(tile.data, tile_i + i, tile_j + j) = value.data[i][j]; - } - } + mat_t out; + + const int tile_i = M*i; + const int tile_j = N*j; + + WP_PRAGMA_UNROLL + for (int i=0; i < M; ++i) + { + WP_PRAGMA_UNROLL + for (int j=0; j < N; ++j) + { + index(tile.data, tile_i + i, tile_j + j) = value.data[i][j]; + } + } } template inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) { - const int TILE_M = 4; - const int TILE_N = 4; - const int TILE_K = 4; + const int TILE_M = 4; + const int TILE_N = 4; + const int TILE_K = 4; - partition_t A_tile = partition_t(A); - partition_t B_tile = partition_t(B); - partition_t C_tile = partition_t(out); + partition_t A_tile = partition_t(A); + partition_t B_tile = partition_t(B); + partition_t C_tile = partition_t(out); const int length = partition_size(C_tile); @@ -348,22 +209,22 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, for (int t=threadIdx.x; t < length; t += blockDim.x) { - int i, j; - partition_coord(C_tile, t, i, j); + int i, j; + partition_coord(C_tile, t, i, j); - // accumulator - mat_t sum = partition_load(C_tile, i, j); + // accumulator + mat_t sum = partition_load(C_tile, i, j); WP_PRAGMA_UNROLL for (int k=0; k < A_tile.shape[1]; k++) { - const mat_t a = partition_load(A_tile, i, k); - const mat_t b = partition_load(B_tile, k, j); + const mat_t a = partition_load(A_tile, i, k); + const mat_t b = partition_load(B_tile, k, j); - sum += mul(a, b); + sum += mul(a, b); } - partition_store(C_tile, i, j, sum); + partition_store(C_tile, i, j, sum); } WP_TILE_SYNC(); @@ -390,7 +251,7 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t< const int i = t/out.shape[1]; const int j = t%out.shape[1]; - T sum(0.0); + T sum(0.0); WP_PRAGMA_UNROLL for (int k=0; k < A.shape[1]; ++k) diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 921f269e..78fb7fc9 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -84,7 +84,7 @@ def gemm_tiled(A: wp.array2d(dtype=float), N = B.shape[1] K = A.shape[1] - count = int(K / 8) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) + count = int(K / 8) # todo: must be the same as TILE_K for k in range(count): From 3d92decbdbe907f5f966db1e6460adf710ced7b5 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 24 May 2024 13:30:51 +1200 Subject: [PATCH 010/102] Add CuTe implementation using `cute::cooperative_gem()` primitive --- warp/native/tile.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/warp/native/tile.h b/warp/native/tile.h index df1f8ff1..fa04f958 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -5,6 +5,13 @@ // todo: requires CTK, replace with inline ptx #include "cuda_pipeline_primitives.h" +#define USE_CUTE 0 + +#if USE_CUTE +#include "cutlass/include/cute/tensor.hpp" +#include "cutlass/include/cute/algorithm/cooperative_gemm.hpp" +#endif // USE_CUTE + #if !defined(__CUDA_ARCH__) #define WP_TILE_SHARED static #define WP_TILE_SYNC void @@ -190,6 +197,8 @@ inline void partition_store(const partition_t& tile, int i, int j, cons } +#if !USE_CUTE + template inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) { @@ -268,6 +277,51 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t< WP_TILE_SYNC(); } +#else + + +template +inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) +{ + 
using namespace cute; + + __pipeline_wait_prior(0); + + // ensure smem tile is ready + WP_TILE_SYNC(); + + // Define CTA matrix size (static) + auto bM = Int<64>{}; + auto bN = Int<64>{}; + auto bK = Int<8>{}; + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM, bK), LayoutRight{}); + auto sB = make_layout(make_shape(bN, bK)); + auto sC = make_layout(make_shape(bM, bN), LayoutRight{}); + + Tensor s_a_tensor = make_tensor(make_smem_ptr(A.data), sA); + Tensor s_b_tensor = make_tensor(make_smem_ptr(B.data), sB); + Tensor s_c_tensor = make_tensor(make_smem_ptr(out.data), sC); + + + TiledMMA tiled_mma = make_tiled_mma(UniversalFMA{}, + Layout>{}); // 16x8x1 UniversalFMA, assumes blockDim=128 + + + cooperative_gemm< AutoVectorizingCopyWithAssumedAlignment>, + AutoVectorizingCopyWithAssumedAlignment>, + AutoVectorizingCopyWithAssumedAlignment> + >( + threadIdx.x, tiled_mma, + 1.0f, s_a_tensor, s_b_tensor, 1.0f, s_c_tensor, + cute::identity(), cute::identity(), cute::identity(), cute::identity() + ); + + WP_TILE_SYNC(); + +} +#endif // USE_CUTE } // namespace wp \ No newline at end of file From f47d059d00e55714670861d477897bb77833c4ba Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 7 Jun 2024 08:02:19 +1200 Subject: [PATCH 011/102] First pass of Tile expressions working --- warp/builtins.py | 12 +- warp/codegen.py | 35 ++- warp/native/builtin.h | 1 + warp/native/tile.h | 474 +++++++++++++++++++++------------------- warp/native/tile_gemm.h | 310 ++++++++++++++++++++++++++ warp/tests/test_tile.py | 156 +++++++------ warp/types.py | 13 ++ 7 files changed, 700 insertions(+), 301 deletions(-) create mode 100644 warp/native/tile_gemm.h diff --git a/warp/builtins.py b/warp/builtins.py index e1860363..96dc4282 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1395,7 +1395,6 @@ def tile_zeros_value_func(arg_types, kwds, templates): return array(dtype=dtype) - add_builtin( "tile_zeros", input_types={"m": int, "n": int, "dtype": Scalar}, @@ -1431,17 +1430,18 @@ def tile_load_value_func(arg_types, kwds, templates): raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") m, n = kwds["m"], kwds["n"] + dtype = arg_types[0].dtype - templates.append(arg_types[0].dtype) + templates.append(dtype) templates.append(m) templates.append(n) global shared_memory_id - templates.append(shared_memory_id) + #templates.append(shared_memory_id) shared_memory_id += 1 - return array(dtype=arg_types[0].dtype) + return Tile(dtype, m, n, "load")#array(dtype=arg_types[0].dtype) @@ -1473,8 +1473,8 @@ def tile_store_value_func(arg_types, kwds, templates): if not type_is_int(arg_types[2]): raise RuntimeError("tile_store() argument 2 must be an integer") - if not is_array(arg_types[3]): - raise RuntimeError("tile_store() argument 3 must be an array") + if not is_tile(arg_types[3]): + raise RuntimeError("tile_store() argument 3 must be a tile") return None diff --git a/warp/codegen.py b/warp/codegen.py index a9972769..ae72c3ce 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -507,6 +507,8 @@ def type_to_ctype(t, value_type=False): dtypestr = f"wp::{t.dtype.__name__}" classstr = f"wp::{type(t).__name__}" return f"{classstr}_t<{dtypestr}>" + elif is_tile(t): + return "auto" elif isinstance(t, Struct): return make_full_qualified_name(t.cls) elif is_reference(t): @@ -1002,6 +1004,9 @@ def add_call(adj, func, args, min_outputs=None, templates=None, kwds=None): for i, a in enumerate(args) ] + # used to create an alias of the adjoint var to the primal var for 
tile ops + alias_call = None + if return_type is None: # handles expression (zero output) functions, e.g.: void do_something(); @@ -1024,10 +1029,16 @@ def add_call(adj, func, args, min_outputs=None, templates=None, kwds=None): output_list = [output] forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});" + + # prepend auto if it is an anonymously typed var (e.g.: a tile op) + if output.ctype() == "auto": + forward_call = "auto " + forward_call + alias_call = f"auto& adj_{output} = var_{output};" + replay_call = forward_call if func.custom_replay_func is not None: replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});" - + else: # handle multiple value functions @@ -1039,15 +1050,19 @@ def add_call(adj, func, args, min_outputs=None, templates=None, kwds=None): ) replay_call = forward_call + if func.skip_replay: adj.add_forward(forward_call, replay="// " + replay_call) else: adj.add_forward(forward_call, replay=replay_call) + if alias_call: + adj.add_forward(alias_call) + if not func.missing_grad and len(args): reverse_has_output_args = ( func.require_original_output_arg or len(output_list) > 1 - ) and func.custom_grad_func is None + ) and func.custom_grad_func is None arg_str = adj.format_reverse_call_args( args_var, args, @@ -2562,6 +2577,11 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"): lines += ["// primal vars\n"] for var in adj.variables: + + # do not predeclare vars with auto type + if var.ctype() == "auto": + continue + if var.constant is None: lines += [f"{var.ctype()} {var.emit()};\n"] else: @@ -2597,6 +2617,11 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): lines += ["// primal vars\n"] for var in adj.variables: + + # do not predeclare vars with auto type + if var.ctype() == "auto": + continue + if var.constant is None: lines += [f"{var.ctype()} {var.emit()};\n"] else: @@ -2607,7 +2632,11 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): lines += ["// dual vars\n"] for var in adj.variables: - lines += [f"{var.ctype(value_type=True)} {var.emit_adj()} = {{}};\n"] + name = var.emit_adj() + ctype = var.ctype(value_type=True) + + if ctype != "auto": + lines += [f"{ctype} {name} = {{}};\n"] # forward pass lines += ["//---------\n"] diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 682230dd..a5788224 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1562,4 +1562,5 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect // only include in kernels for now #if defined(__CUDACC_RTC__) #include "tile.h" +//#include "tile_gemm.h" #endif \ No newline at end of file diff --git a/warp/native/tile.h b/warp/native/tile.h index fa04f958..618611f8 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -2,16 +2,6 @@ #include "builtin.h" -// todo: requires CTK, replace with inline ptx -#include "cuda_pipeline_primitives.h" - -#define USE_CUTE 0 - -#if USE_CUTE -#include "cutlass/include/cute/tensor.hpp" -#include "cutlass/include/cute/algorithm/cooperative_gemm.hpp" -#endif // USE_CUTE - #if !defined(__CUDA_ARCH__) #define WP_TILE_SHARED static #define WP_TILE_SYNC void @@ -20,10 +10,6 @@ #define WP_TILE_SYNC __syncthreads #endif - -namespace wp -{ - // CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. 
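// Note: the expression-template types introduced below (tile_load_t,
// tile_store_t, tile_constant_t, tile_unary_map_t, tile_binary_map_t) all
// share the same lazy per-element contract: fwd(e) evaluates element e of the
// tile on demand, and bwd(e, adj) pushes the adjoint of element e back down
// the expression tree. The disabled block below is a minimal sketch of that
// pattern; the names example_leaf_t/example_scale_t are hypothetical and the
// element type is simplified to float, so it is illustration only, not part of
// the Warp source.
#if 0
struct example_leaf_t
{
    float value;    // forward value held by the leaf
    float grad;     // adjoint accumulated during the backward sweep

    float fwd(int) const { return value; }      // evaluate an element on demand
    void bwd(int, float adj) { grad += adj; }   // accumulate the incoming adjoint
};

template <typename Tile>
struct example_scale_t
{
    Tile& tile;     // child expression
    float s;        // uniform scale factor

    float fwd(int e) const { return s*tile.fwd(e); }     // y = s*x
    void bwd(int e, float adj) { tile.bwd(e, s*adj); }   // dx += s*dy, recurse into the child
};
#endif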
#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) @@ -42,286 +28,328 @@ namespace wp #endif -// 2D tile zero -template -inline CUDA_CALLABLE array_t tile_zeros() +namespace wp { - const int length = M*N; - WP_TILE_SHARED __align__(16) T data[length]; - - WP_PRAGMA_UNROLL - for (int t=threadIdx.x; t < length; t += blockDim.x) - { - data[t] = T(0.0); - } +template +void print_tile(T& t) +{ + t.print(); - return array_t(data, M, N, nullptr); + printf("["); + for (int i=0; i < T::M; ++i) + { + printf("%*s[", i>0, ""); + for (int j=0; j < T::N; ++j) + { + printf("%5.2f ", t.fwd(i*T::N + j)); + } + + if (i == T::M-1) + printf("]]\n"); + else + printf("]\n"); + } } -// 2D tile load -template -inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) + +template +int size(Tile& t) { return Tile::M*Tile::N; } + + +template +struct tile_load_t { - const int length = M*N; + using Type = T; + static constexpr int M = M_; + static constexpr int N = N_; - WP_TILE_SHARED __align__(16) T data[length]; - - //--------------- - // naive-synchronous load - // - // WP_PRAGMA_UNROLL - // for (int t=threadIdx.x; t < length; t += blockDim.x) - // { - // data[t] = index(src, i*M + t/N, j*N + t%N); - // } - - //--------------- - // async 128 bit loads (assumes row-major i.e.: stride 1 on y axis and 4-element alignment on dimension) - const int s = 4; - - WP_PRAGMA_UNROLL - for (int t=threadIdx.x*s; t < length; t += blockDim.x*s) - { - __pipeline_memcpy_async(&data[t], - &index(src, i*M + t/N, j*N + t%N), - sizeof(T)*s); + array_t slice; + + tile_load_t(array_t& src, int x, int y) + { + assert(src.ndim == 2); + + // compute offsets into original array and store a view + const int i = x*M; + const int j = y*N; + + // slice into src + if (src.data) + slice.data = data_at_byte_offset(src, byte_offset(src, i, j)); + if (src.grad) + slice.grad = grad_at_byte_offset(src, byte_offset(src, i, j)); + + slice.shape[0] = M; + slice.shape[1] = N; + slice.strides[0] = src.strides[0]; + slice.strides[1] = src.strides[1]; + slice.ndim = 2; } - __pipeline_commit(); + Type fwd(int e) + { + int i = e/N; + int j = e%N; + return index(slice, i, j); + } - return array_t(data, M, N, nullptr); -} + void bwd(int e, const T& adj_ret) + { + int i = e/N; + int j = e%N; -// 2D tile store -template -inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array_t& src) -{ - const int M = src.shape[0]; - const int N = src.shape[1]; - - const int length = M*N; + if (slice.grad) + atomic_add(&index_grad(slice, i, j), adj_ret); + } - // cooperatively store the tile, using a block-stride iterator - WP_PRAGMA_UNROLL - for (int t=threadIdx.x; t < length; t += blockDim.x) - { - index(dest, i*M + t/N, j*N + t%N) = src.data[t]; + void print() + { + printf("tile_load_t<%d, %d>\n", M, N); } -} -template -inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride) -{ - return p[i*stride + j]; -} +}; -template -inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride) +template +struct tile_store_t { - return p[i*stride + j]; -} + using Tile = Tile_; + using Type = typename Tile_::Type; + static constexpr int M = Tile_::M; + static constexpr int N = Tile_::N; -template -struct partition_t -{ - inline partition_t(array_t A) + array_t slice; + + Tile& tile; + + tile_store_t(array_t& dest, int x, int y, Tile& t) : tile(t) { - data = A; - - // todo: do ceil div for non-multiples of M,N - shape[0] = 
A.shape[0]/M; - shape[1] = A.shape[1]/N; + assert(dest.ndim == 2); + + // compute offsets into original array and store a view + const int i = x*M; + const int j = y*N; + + // slice into dest + if (dest.data) + slice.data = data_at_byte_offset(dest, byte_offset(dest, i, j)); + if (dest.grad) + slice.grad = grad_at_byte_offset(dest, byte_offset(dest, i, j)); + + slice.shape[0] = M; + slice.shape[1] = N; + slice.strides[0] = dest.strides[0]; + slice.strides[1] = dest.strides[1]; + slice.ndim = 2; } - // underlying data - array_t data; - - // partition dimensions - int shape[2]; -}; + void fwd(int e) + { + int i = e/N; + int j = e%N; -template -inline int partition_size(const partition_t& tile) -{ - return tile.shape[0]*tile.shape[1]; -} + index(slice, i, j) = tile.fwd(e); + } -// returns the x, y coordinates of a tile given a linear index -template -inline void partition_coord(const partition_t& tile, const int t, int& i, int& j) -{ - i = t/tile.shape[1]; - j = t%tile.shape[1]; -} + void bwd(int e) + { + int i = e/N; + int j = e%N; -template -inline mat_t partition_load(const partition_t& tile, int i, int j) -{ - mat_t out; - - const int tile_i = i*M; - const int tile_j = j*N; + // materialize gradient (runs entire graph backward), reading incoming grads from the dest + if (slice.grad) + tile.bwd(e, index_grad(slice, i, j)); + } - WP_PRAGMA_UNROLL - for (int i=0; i < M; ++i) + void print() { - WP_PRAGMA_UNROLL - for (int j=0; j < N; ++j) - { - out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); - } + printf("tile_load_t<%d, %d>-+", M, N); + print(tile); } +}; - return out; -} -template -inline void partition_store(const partition_t& tile, int i, int j, const mat_t& value) +template +struct tile_constant_t { - mat_t out; + using Type = T; + static constexpr int M = M_; + static constexpr int N = N_; - const int tile_i = M*i; - const int tile_j = N*j; + T c; + T& adj_c; - WP_PRAGMA_UNROLL - for (int i=0; i < M; ++i) - { - WP_PRAGMA_UNROLL - for (int j=0; j < N; ++j) - { - index(tile.data, tile_i + i, tile_j + j) = value.data[i][j]; - } - } -} + tile_constant_t(const T& c, T& adj_c) : c(c), adj_c(adj_c) {} + Type fwd(int e) + { + return c; + } -#if !USE_CUTE + void bwd(int e, const T& adj_ret) + { + adj_c += adj_ret; + } -template -inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) -{ - const int TILE_M = 4; - const int TILE_N = 4; - const int TILE_K = 4; + void print() + { + printf("tile_constant_t<%d, %d>-+", M, N); + print(c); + printf("\n"); + } +}; - partition_t A_tile = partition_t(A); - partition_t B_tile = partition_t(B); - partition_t C_tile = partition_t(out); - const int length = partition_size(C_tile); - __pipeline_wait_prior(0); +template +struct tile_unary_map_t +{ + using Type = typename Tile::Type; + static constexpr int M = Tile::M; + static constexpr int N = Tile::N; - WP_TILE_SYNC(); + Tile& tile; + + FwdOp fwd_fn; + AdjOp adj_fn; - for (int t=threadIdx.x; t < length; t += blockDim.x) - { - int i, j; - partition_coord(C_tile, t, i, j); + tile_unary_map_t(Tile& t, FwdOp f, AdjOp a) : tile(t), fwd_fn(f), adj_fn(a) {} - // accumulator - mat_t sum = partition_load(C_tile, i, j); + Type fwd(int e) + { + return fwd_fn(tile.fwd(e)); + } - WP_PRAGMA_UNROLL - for (int k=0; k < A_tile.shape[1]; k++) - { - const mat_t a = partition_load(A_tile, i, k); - const mat_t b = partition_load(B_tile, k, j); + void bwd(int e, Type adj_ret) + { + Type adj_a = 0.0; - sum += mul(a, b); - } + adj_fn(tile.fwd(e), adj_a, adj_ret); - 
partition_store(C_tile, i, j, sum); + tile.bwd(e, adj_a); } - WP_TILE_SYNC(); -} + void print() + { + printf("tile_unary_map_t<%d, %d>-+", M, N); + tile.print(); + } +}; +template +struct tile_binary_map_t +{ + static_assert(TileA::Type == TileB::Type, "Error"); + static_assert(TileA::M == TileB::M, "Error"); + static_assert(TileA::N == TileB::N, "Error"); + using Type = typename TileA::Type; + static constexpr int M = TileA::M; + static constexpr int N = TileA::N; -// 2D gemm accumulate out += A*B -template -inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t& B, const array_t& out) -{ - const int length = out.shape[0]*out.shape[1]; + const TileA& tile_a; + const TileB& tile_b; - WP_TILE_SYNC(); + FwdOp fwd_fn; + AdjOp adj_fn; - const T* __restrict__ A_ptr = A.data; - const T* __restrict__ B_ptr = B.data; - T* __restrict__ C_ptr = out.data; - WP_PRAGMA_UNROLL - for (int t=threadIdx.x; t < length; t += blockDim.x) - { - // compute output index - const int i = t/out.shape[1]; - const int j = t%out.shape[1]; + tile_binary_map_t(const TileA& a, TileB& b, FwdOp fwd_fn, AdjOp adj_fn) : tile_a(a), tile_b(b), fwd_fn(fwd_fn), adj_fn(adj_fn) {} - T sum(0.0); + Type fwd(int e) + { + Type a = tile_a.fwd(e); + Type b = tile_b.fwd(e); - WP_PRAGMA_UNROLL - for (int k=0; k < A.shape[1]; ++k) - { - T a = index(A_ptr, i, k, A.shape[1]); - T b = index(B_ptr, k, j, B.shape[1]); + return fwd_fn(a, b); + } - sum = fmaf(a, b, sum); - } - - index(C_ptr, i, j, out.shape[1]) += sum; + void bwd(int e, Type adj_ret) + { + Type a = tile_a.fwd(e); + Type b = tile_b.fwd(e); + + Type adj_a = 0.0; + Type adj_b = 0.0; + + adj_fn(a, b, adj_a, adj_b, adj_ret); + + // recurse + tile_a.bwd(e, adj_a); + tile_b.bwd(e, adj_b); } - WP_TILE_SYNC(); -} + void print() + { + printf("tile_binary_map_t<%d, %d>", M, N); + printf("\n -+"); + tile_a.print(); + printf("\n -+"); + tile_b.print(); -#else + } +}; -template -inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) + + +// entry point for load +template +tile_load_t tile_load(array_t& a, int x, int y) { - using namespace cute; + return tile_load_t(a, x, y); +} - __pipeline_wait_prior(0); +template +void adj_tile_load(array_t& a, int x, int y, array_t& adj_a, int adj_x, int adj_y, const tile_load_t& adj_ret) +{ + // nop +} - // ensure smem tile is ready - WP_TILE_SYNC(); - // Define CTA matrix size (static) - auto bM = Int<64>{}; - auto bN = Int<64>{}; - auto bK = Int<8>{}; +// entry point for store +template +void tile_store(array_t& dest, int x, int y, Tile& t) +{ + tile_store_t op(dest, x, y, t); + + // execute op + for (int i=threadIdx.x; i < size(op); i += blockDim.x) + op.fwd(i); +} - // Define the smem layouts (static) - auto sA = make_layout(make_shape(bM, bK), LayoutRight{}); - auto sB = make_layout(make_shape(bN, bK)); - auto sC = make_layout(make_shape(bM, bN), LayoutRight{}); - Tensor s_a_tensor = make_tensor(make_smem_ptr(A.data), sA); - Tensor s_b_tensor = make_tensor(make_smem_ptr(B.data), sB); - Tensor s_c_tensor = make_tensor(make_smem_ptr(out.data), sC); +template +void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, Tile& adj_t) +{ + tile_store_t op(dest, x, y, t); + for (int i=threadIdx.x; i < size(op); i += blockDim.x) + op.bwd(i); +} - TiledMMA tiled_mma = make_tiled_mma(UniversalFMA{}, - Layout>{}); // 16x8x1 UniversalFMA, assumes blockDim=128 - - cooperative_gemm< AutoVectorizingCopyWithAssumedAlignment>, - AutoVectorizingCopyWithAssumedAlignment>, 
- AutoVectorizingCopyWithAssumedAlignment> - >( - threadIdx.x, tiled_mma, - 1.0f, s_a_tensor, s_b_tensor, 1.0f, s_c_tensor, - cute::identity(), cute::identity(), cute::identity(), cute::identity() - ); - WP_TILE_SYNC(); +// unary map +template +tile_unary_map_t tile_map_impl(FwdOp fwd, AdjOp adj, Tile& t) +{ + return tile_unary_map_t(t, fwd, adj); +} +// binary map +template +tile_binary_map_t tile_map_impl(FwdOp op, AdjOp adj, TileA& a, TileB& b) +{ + return tile_binary_map_t(a, b); } -#endif // USE_CUTE +// use macro to capture adjoint operator +#define tile_map(op, ...) tile_map_impl(op, adj_##op, __VA_ARGS__) + +// use a macro to capture the adjoint var in the expression +#define tile_constant(T, M, N, var) tile_constant_t(var, adj_##var) + +} // namespace wp -} // namespace wp \ No newline at end of file diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h new file mode 100644 index 00000000..91ed329d --- /dev/null +++ b/warp/native/tile_gemm.h @@ -0,0 +1,310 @@ +#pragma once + +#include "builtin.h" + +// todo: requires CTK, replace with inline ptx +#include "cuda_pipeline_primitives.h" + +#define USE_CUTE 1 + +#if USE_CUTE +#include "cutlass/include/cute/tensor.hpp" +#include "cutlass/include/cute/algorithm/cooperative_gemm.hpp" +#endif // USE_CUTE + +namespace wp +{ + + +// 2D tile zero +template +inline CUDA_CALLABLE array_t tile_zeros() +{ + const int length = M*N; + + WP_TILE_SHARED __align__(16) T data[length]; + + WP_PRAGMA_UNROLL + for (int t=threadIdx.x; t < length; t += blockDim.x) + { + data[t] = T(0.0); + } + + return array_t(data, M, N, nullptr); +} + +// 2D tile load +template +inline CUDA_CALLABLE array_t tile_load(const array_t& src, int i, int j) +{ + const int length = M*N; + + WP_TILE_SHARED __align__(16) T data[length]; + + //--------------- + // naive-synchronous load + // + // WP_PRAGMA_UNROLL + // for (int t=threadIdx.x; t < length; t += blockDim.x) + // { + // data[t] = index(src, i*M + t/N, j*N + t%N); + // } + + //--------------- + // async 128 bit loads (assumes row-major i.e.: stride 1 on y axis and 4-element alignment on dimension) + const int s = 4; + + WP_PRAGMA_UNROLL + for (int t=threadIdx.x*s; t < length; t += blockDim.x*s) + { + __pipeline_memcpy_async(&data[t], + &index(src, i*M + t/N, j*N + t%N), + sizeof(T)*s); + } + + __pipeline_commit(); + + + return array_t(data, M, N, nullptr); +} + +// 2D tile store +template +inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array_t& src) +{ + const int M = src.shape[0]; + const int N = src.shape[1]; + + const int length = M*N; + + // cooperatively store the tile, using a block-stride iterator + WP_PRAGMA_UNROLL + for (int t=threadIdx.x; t < length; t += blockDim.x) + { + index(dest, i*M + t/N, j*N + t%N) = src.data[t]; + } +} + +template +inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride) +{ + return p[i*stride + j]; +} + +template +inline CUDA_CALLABLE T& index(T* __restrict__ p, int i, int j, int stride) +{ + return p[i*stride + j]; +} + +template +struct partition_t +{ + inline partition_t(array_t A) + { + data = A; + + // todo: do ceil div for non-multiples of M,N + shape[0] = A.shape[0]/M; + shape[1] = A.shape[1]/N; + } + + // underlying data + array_t data; + + // partition dimensions + int shape[2]; +}; + +template +inline int partition_size(const partition_t& tile) +{ + return tile.shape[0]*tile.shape[1]; +} + +// returns the x, y coordinates of a tile given a linear index +template +inline void partition_coord(const partition_t& 
tile, const int t, int& i, int& j) +{ + i = t/tile.shape[1]; + j = t%tile.shape[1]; +} + +template +inline mat_t partition_load(const partition_t& tile, int i, int j) +{ + mat_t out; + + const int tile_i = i*M; + const int tile_j = j*N; + + WP_PRAGMA_UNROLL + for (int i=0; i < M; ++i) + { + WP_PRAGMA_UNROLL + for (int j=0; j < N; ++j) + { + out.data[i][j] = index(tile.data, tile_i + i, tile_j + j); + } + } + + return out; +} + +template +inline void partition_store(const partition_t& tile, int i, int j, const mat_t& value) +{ + mat_t out; + + const int tile_i = M*i; + const int tile_j = N*j; + + WP_PRAGMA_UNROLL + for (int i=0; i < M; ++i) + { + WP_PRAGMA_UNROLL + for (int j=0; j < N; ++j) + { + index(tile.data, tile_i + i, tile_j + j) = value.data[i][j]; + } + } +} + + +#if !USE_CUTE + +template +inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) +{ + const int TILE_M = 4; + const int TILE_N = 4; + const int TILE_K = 4; + + partition_t A_tile = partition_t(A); + partition_t B_tile = partition_t(B); + partition_t C_tile = partition_t(out); + + const int length = partition_size(C_tile); + + __pipeline_wait_prior(0); + + WP_TILE_SYNC(); + + for (int t=threadIdx.x; t < length; t += blockDim.x) + { + int i, j; + partition_coord(C_tile, t, i, j); + + // accumulator + mat_t sum = partition_load(C_tile, i, j); + + WP_PRAGMA_UNROLL + for (int k=0; k < A_tile.shape[1]; k++) + { + const mat_t a = partition_load(A_tile, i, k); + const mat_t b = partition_load(B_tile, k, j); + + sum += mul(a, b); + } + + partition_store(C_tile, i, j, sum); + } + + WP_TILE_SYNC(); +} + + + +// 2D gemm accumulate out += A*B +template +inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t& B, const array_t& out) +{ + const int length = out.shape[0]*out.shape[1]; + + WP_TILE_SYNC(); + + const T* __restrict__ A_ptr = A.data; + const T* __restrict__ B_ptr = B.data; + T* __restrict__ C_ptr = out.data; + + WP_PRAGMA_UNROLL + for (int t=threadIdx.x; t < length; t += blockDim.x) + { + // compute output index + const int i = t/out.shape[1]; + const int j = t%out.shape[1]; + + T sum(0.0); + + WP_PRAGMA_UNROLL + for (int k=0; k < A.shape[1]; ++k) + { + T a = index(A_ptr, i, k, A.shape[1]); + T b = index(B_ptr, k, j, B.shape[1]); + + sum = fmaf(a, b, sum); + } + + index(C_ptr, i, j, out.shape[1]) += sum; + } + + WP_TILE_SYNC(); +} + +#else + + +template +inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) +{ + using namespace cute; + + __pipeline_wait_prior(0); + + // ensure smem tile is ready + WP_TILE_SYNC(); + + // Define CTA matrix size (static) + auto bM = Int<64>{}; + auto bN = Int<64>{}; + auto bK = Int<8>{}; + + // Define the smem layouts (static) + auto sA = make_layout(make_shape(bM, bK), LayoutRight{}); + auto sB = make_layout(make_shape(bN, bK)); + auto sC = make_layout(make_shape(bM, bN), LayoutRight{}); + + Tensor s_a_tensor = make_tensor(make_smem_ptr(A.data), sA); + Tensor s_b_tensor = make_tensor(make_smem_ptr(B.data), sB); + Tensor s_c_tensor = make_tensor(make_smem_ptr(out.data), sC); + + + // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA{}, + // Layout>{}); // 16x8x1 UniversalFMA, assumes blockDim=128 + + + // TiledMMA tiled_mma = make_tiled_mma(UniversalFMA{}, + // Layout,Stride<_16,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128 + + + + TiledMMA tiled_mma = make_tiled_mma(UniversalFMA{}, + Layout,Stride<_64,_1>>{}); // 8x16x1 UniversalFMA, assumes blockDim=128 + + + cooperative_gemm< 
AutoVectorizingCopyWithAssumedAlignment>, + AutoVectorizingCopyWithAssumedAlignment>, + AutoVectorizingCopyWithAssumedAlignment> + >( + threadIdx.x, tiled_mma, + 1.0f, s_a_tensor, s_b_tensor, 1.0f, s_c_tensor, + cute::identity(), cute::identity(), cute::identity(), cute::identity() + ); + + WP_TILE_SYNC(); + +} + +#endif // USE_CUTE + +} // namespace wp \ No newline at end of file diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 78fb7fc9..4781d9ad 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -6,7 +6,7 @@ #wp.config.mode = "debug" wp.init() -wp.set_module_options({"enable_backward": False}) +wp.set_module_options({"enable_backward": True}) wp.set_device("cuda:0") @@ -36,118 +36,136 @@ def test_copy_tiled(): A = rng.random((M, N), dtype=np.float32) B = rng.random((M, N), dtype=np.float32) - A_wp = wp.array(A) - B_wp = wp.array(B) + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.array(B, requires_grad=True) - wp.launch(copy_tiled, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + with wp.Tape() as tape: + wp.launch(copy_tiled, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + # verify forward pass assert(np.allclose(A, B_wp.numpy(), rtol=1.e-4)) - - print("Copy passed") + print("Copy forward passed") + # verify backward pass + B_wp.grad = wp.ones_like(B_wp) + tape.backward() -#test_copy_tiled() + assert(np.allclose(A_wp.grad.numpy(), B_wp.grad.numpy())) + print("Copy backward passed") -@wp.kernel -def gemm(A: wp.array2d(dtype=float), - B: wp.array2d(dtype=float), - C: wp.array2d(dtype=float)): +test_copy_tiled() - # output index - i, j = wp.tid() - sum = float(0.0) +# @wp.kernel +# def gemm(A: wp.array2d(dtype=float), +# B: wp.array2d(dtype=float), +# C: wp.array2d(dtype=float)): - for k in range(0, A.shape[1]): - sum += A[i, k]*B[k, j] +# # output index +# i, j = wp.tid() - C[i, j] = sum +# sum = float(0.0) +# for k in range(0, A.shape[1]): +# sum += A[i, k]*B[k, j] +# C[i, j] = sum -TILE_M = wp.constant(64) -TILE_N = wp.constant(64) -TILE_K = wp.constant(8) -@wp.kernel -def gemm_tiled(A: wp.array2d(dtype=float), - B: wp.array2d(dtype=float), - C: wp.array2d(dtype=float)): - # output tile index - i, j = wp.tid() +# TILE_M = wp.constant(64) +# TILE_N = wp.constant(64) +# TILE_K = wp.constant(8) + +# @wp.kernel +# def gemm_tiled(A: wp.array2d(dtype=float), +# B: wp.array2d(dtype=float), +# C: wp.array2d(dtype=float)): + +# # output tile index +# i, j = wp.tid() + +# sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + +# M = A.shape[0] +# N = B.shape[1] +# K = A.shape[1] + +# count = int(K / TILE_K) # todo: must be the same as TILE_K + +# for k in range(count): + +# a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) +# b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) - sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) +# # sum += a*b +# wp.tile_matmul(a, b, sum) - M = A.shape[0] - N = B.shape[1] - K = A.shape[1] +# wp.tile_store(C, i, j, sum) - count = int(K / 8) # todo: must be the same as TILE_K - for k in range(count): +# s = 0.0 - a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) - b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) +# for i, j in tile.shape: - # sum += a*b - wp.tile_matmul(a, b, sum) +# s += tile[i-1, i-1] +# s += tile[i, i-1] +# s += tile[i,] - wp.tile_store(C, i, j, sum) -M = TILE_M*7 -K = TILE_K*4 -N = TILE_N*6 +# M = TILE_M*7 +# K = TILE_K*4 +# N = TILE_N*6 -rng = np.random.default_rng(42) -A = rng.random((M, K), dtype=np.float32) -B = rng.random((K, N), dtype=np.float32) -C = 
np.zeros((M, N), dtype=np.float32) +# rng = np.random.default_rng(42) +# A = rng.random((M, K), dtype=np.float32) +# B = rng.random((K, N), dtype=np.float32) +# C = np.zeros((M, N), dtype=np.float32) -A_wp = wp.array(A) -B_wp = wp.array(B) -C_wp = wp.array(C) +# A_wp = wp.array(A) +# B_wp = wp.array(B) +# C_wp = wp.array(C) -iters = 10 +# iters = 10 -with wp.ScopedTimer("NumPy"): +# with wp.ScopedTimer("NumPy"): - for i in range(iters): - C = A@B +# for i in range(iters): +# C = A@B -wp.force_load(device="cuda:0") +# wp.force_load(device="cuda:0") -with wp.ScopedTimer("Warp", cuda_filter=wp.TIMING_KERNEL): +# with wp.ScopedTimer("Warp", cuda_filter=wp.TIMING_KERNEL): - for i in range(iters): - wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) +# for i in range(iters): +# wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) - print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) +# print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) - for i in range(iters): - wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=128) - wp.synchronize() +# for i in range(iters): +# wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=128) +# wp.synchronize() - print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) +# print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) -A_tc = torch.from_numpy(A).to("cuda:0") -B_tc = torch.from_numpy(B).to("cuda:0") -C_tc = torch.from_numpy(C).to("cuda:0") +# A_tc = torch.from_numpy(A).to("cuda:0") +# B_tc = torch.from_numpy(B).to("cuda:0") +# C_tc = torch.from_numpy(C).to("cuda:0") -for i in range(10): - torch.matmul(A_tc, B_tc, out=C_tc) +# for i in range(10): +# torch.matmul(A_tc, B_tc, out=C_tc) -with wp.ScopedTimer("Torch"): +# with wp.ScopedTimer("Torch"): - for i in range(iters): - torch.matmul(A_tc, B_tc, out=C_tc) +# for i in range(iters): +# torch.matmul(A_tc, B_tc, out=C_tc) - torch.cuda.synchronize() +# torch.cuda.synchronize() diff --git a/warp/types.py b/warp/types.py index 28c20fcd..11416bfd 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2861,6 +2861,19 @@ def array_type_id(a): raise ValueError("Invalid array type") +# tile expression objects +class Tile: + + def __init__(self, dtype, M, N, op): + self.dtype = dtype + self.M = M + self.N = N + self.op = op + +def is_tile(t): + return isinstance(t, Tile) + + class Bvh: def __init__(self, lowers, uppers): """Class representing a bounding volume hierarchy. From 9f1c428aacd2bdd68f906188547ea799eff8f20a Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 7 Jun 2024 08:20:34 +1200 Subject: [PATCH 012/102] Formatting fixes for CHANGELOG.md --- CHANGELOG.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5927f6ed..9ccb477c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,20 +9,20 @@ - Revised module compilation process to allow multiple processes to use the same kernel cache directory. Cached kernels will now be stored in hash-specific subdirectory. 
- Add runtime checks for `wp.MarchingCubes` on field dimensions and size -- Fix memory leak in mesh BVH ([GH-225](https://github.com/NVIDIA/warp/issues/225)) +- Fix memory leak in `wp.Mesh` BVH ([GH-225](https://github.com/NVIDIA/warp/issues/225)) - Use C++17 with NVCC when building the Warp library and user kernels - Increase PTX target architecture up to `sm_75` (from `sm_70`), enabling Turing ISA features - Extended NanoVDB support (see `warp.Volume`): - Add support for data-agnostic index grids, allocation at voxel granularity - - New `volume_lookup_index`, `volume_sample_index` and generic `volume_sample`/`volume_lookup`/`volume_store` kernel-level functions + - New `wp.volume_lookup_index()`, `wp.volume_sample_index()` and generic `wp.volume_sample()`/`wp.volume_lookup()`/`wp.volume_store()` kernel-level functions - Zero-copy aliasing of in-memory grids, support for multi-grid buffers - Grid introspection and blind data access capabilities - warp.fem can now work directly on NanoVDB grids using `warp.fem.Nanogrid` - - Fixed `volume_sample_v` and `volume_store_*` adjoints - - Prevent `volume_store` from overwriting grid background values + - Fixed `wp.volume_sample_v()` and `wp.volume_store_*()` adjoints + - Prevent `wp.volume_store()` from overwriting grid background values - Improve validation of user-provided fields and values in warp.fem -- Support headless rendering of `OpenGLRenderer` via `pyglet.options["headless"] = True` -- `RegisteredGLBuffer` can fall back to CPU-bound copying if CUDA/OpenGL interop is not available +- Support headless rendering of `wp.render.OpenGLRenderer` via `pyglet.options["headless"] = True` +- `wp.render.RegisteredGLBuffer` can fall back to CPU-bound copying if CUDA/OpenGL interop is not available - Fix to forward `wp.copy()` params to gradient and adjoint copy function calls. - Fix slicing of arrays with gradients in kernels From 209e3cae891bf98705bfdd8c0064ae57b96eb8f4 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 7 Jun 2024 10:42:31 +1200 Subject: [PATCH 013/102] Update CHANGELOG.md --- CHANGELOG.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a12e527f..8916a63d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,16 +17,17 @@ - New `wp.volume_lookup_index()`, `wp.volume_sample_index()` and generic `wp.volume_sample()`/`wp.volume_lookup()`/`wp.volume_store()` kernel-level functions - Zero-copy aliasing of in-memory grids, support for multi-grid buffers - Grid introspection and blind data access capabilities - - warp.fem can now work directly on NanoVDB grids using `warp.fem.Nanogrid` + - `warp.fem` can now work directly on NanoVDB grids using `warp.fem.Nanogrid` - Fixed `wp.volume_sample_v()` and `wp.volume_store_*()` adjoints - Prevent `wp.volume_store()` from overwriting grid background values -- Improve validation of user-provided fields and values in warp.fem +- Improve validation of user-provided fields and values in `warp.fem` - Support headless rendering of `wp.render.OpenGLRenderer` via `pyglet.options["headless"] = True` - `wp.render.RegisteredGLBuffer` can fall back to CPU-bound copying if CUDA/OpenGL interop is not available - Fix to forward `wp.copy()` params to gradient and adjoint copy function calls. - Fix so that `wp.randn()` doesn't return inf - Fix slicing of arrays with gradients in kernels -- Fix function overload caching: ensure module is rebuilt if any function overloads are modified. 
+- Fix function overload caching: ensure module is rebuilt if any function overloads are modified +- Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details ## [1.1.1] - 2024-05-24 From 15d76e0e994c215d26c8362045a05a18af92bbd7 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 8 Jul 2024 15:54:32 +1200 Subject: [PATCH 014/102] Working unary and binary tile_map() builtin --- warp/builtins.py | 30 +++++++++++ warp/native/tile.h | 47 +++++++++++------ warp/tests/test_tile.py | 113 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 170 insertions(+), 20 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 96dc4282..1f01df4d 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1523,6 +1523,36 @@ def tile_matmul_value_func(arg_types, kwds, templates): export=False, ) +# does type propagation for load() +def tile_map_value_func(arg_types, kwds, _): + + if arg_types is None: + return None + + dtype = arg_types[0] + for i in arg_types: + if arg_types[i].dtype != dtype: + raise RuntimeError("tile_map() arguments must all have the same type") + + input = arg_types[0] + + return Tile(dtype=input.dtype, + M=input.M, + N=input.N, + op="map") + + + +add_builtin( + "tile_map", + input_types={"op": Callable}, + value_func=tile_map_value_func, + variadic=True, + doc="Map the operation onto each element of the tile", + group="Tile Primitives", + export=False, +) + # --------------------------------- # Linear Algebra diff --git a/warp/native/tile.h b/warp/native/tile.h index 618611f8..b3be6d81 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -31,6 +31,18 @@ namespace wp { +// Primary template +template +struct is_same { + static constexpr bool value = false; +}; + +// Specialization for the case when T and U are the same type +template +struct is_same { + static constexpr bool value = true; +}; + template void print_tile(T& t) { @@ -87,7 +99,7 @@ struct tile_load_t slice.ndim = 2; } - Type fwd(int e) + Type fwd(int e) const { int i = e/N; int j = e%N; @@ -95,7 +107,7 @@ struct tile_load_t return index(slice, i, j); } - void bwd(int e, const T& adj_ret) + void bwd(int e, const T& adj_ret) const { int i = e/N; int j = e%N; @@ -144,7 +156,7 @@ struct tile_store_t slice.ndim = 2; } - void fwd(int e) + void fwd(int e) const { int i = e/N; int j = e%N; @@ -152,7 +164,7 @@ struct tile_store_t index(slice, i, j) = tile.fwd(e); } - void bwd(int e) + void bwd(int e) const { int i = e/N; int j = e%N; @@ -216,17 +228,17 @@ struct tile_unary_map_t tile_unary_map_t(Tile& t, FwdOp f, AdjOp a) : tile(t), fwd_fn(f), adj_fn(a) {} - Type fwd(int e) + Type fwd(int e) const { return fwd_fn(tile.fwd(e)); } - void bwd(int e, Type adj_ret) + void bwd(int e, Type adj_ret) const { Type adj_a = 0.0; adj_fn(tile.fwd(e), adj_a, adj_ret); - + tile.bwd(e, adj_a); } @@ -240,7 +252,7 @@ struct tile_unary_map_t template struct tile_binary_map_t { - static_assert(TileA::Type == TileB::Type, "Error"); + static_assert(wp::is_same::value, "Error"); static_assert(TileA::M == TileB::M, "Error"); static_assert(TileA::N == TileB::N, "Error"); @@ -257,7 +269,7 @@ struct tile_binary_map_t tile_binary_map_t(const TileA& a, TileB& b, FwdOp fwd_fn, AdjOp adj_fn) : tile_a(a), tile_b(b), fwd_fn(fwd_fn), adj_fn(adj_fn) {} - Type fwd(int e) + Type fwd(int e) const { Type a = tile_a.fwd(e); Type b = tile_b.fwd(e); @@ -265,7 +277,7 @@ struct tile_binary_map_t return fwd_fn(a, b); } - void bwd(int e, Type adj_ret) + void bwd(int e, Type adj_ret) const { 
Type a = tile_a.fwd(e); Type b = tile_b.fwd(e); @@ -287,7 +299,6 @@ struct tile_binary_map_t tile_a.print(); printf("\n -+"); tile_b.print(); - } }; @@ -333,20 +344,26 @@ void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_des // unary map template -tile_unary_map_t tile_map_impl(FwdOp fwd, AdjOp adj, Tile& t) +tile_unary_map_t tile_map_impl(FwdOp fwd, AdjOp adj, Tile& a) { - return tile_unary_map_t(t, fwd, adj); + return tile_unary_map_t(a, fwd, adj); } // binary map template -tile_binary_map_t tile_map_impl(FwdOp op, AdjOp adj, TileA& a, TileB& b) +tile_binary_map_t tile_map_impl(FwdOp fwd, AdjOp adj, TileA& a, TileB& b) { - return tile_binary_map_t(a, b); + return tile_binary_map_t(a, b, fwd, adj); } // use macro to capture adjoint operator #define tile_map(op, ...) tile_map_impl(op, adj_##op, __VA_ARGS__) +//#define tile_map(op, a) tile_map_impl(wp::##op, wp::##op, a) + + +// nop +void adj_tile_map_impl(void) {} +#define adj_tile_map(...) adj_tile_map_impl() // use a macro to capture the adjoint var in the expression #define tile_constant(T, M, N, var) tile_constant_t(var, adj_##var) diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 4781d9ad..beea0746 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -16,8 +16,8 @@ TILE_N = 4 @wp.kernel -def copy_tiled(A: wp.array2d(dtype=float), - B: wp.array2d(dtype=float)): +def tile_copy(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float)): # tile index i, j = wp.tid() @@ -26,7 +26,7 @@ def copy_tiled(A: wp.array2d(dtype=float), wp.tile_store(B, i, j, a) -def test_copy_tiled(): +def test_tile_copy(): rng = np.random.default_rng(42) @@ -40,7 +40,7 @@ def test_copy_tiled(): B_wp = wp.array(B, requires_grad=True) with wp.Tape() as tape: - wp.launch(copy_tiled, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + wp.launch(tile_copy, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) # verify forward pass assert(np.allclose(A, B_wp.numpy(), rtol=1.e-4)) @@ -53,8 +53,111 @@ def test_copy_tiled(): assert(np.allclose(A_wp.grad.numpy(), B_wp.grad.numpy())) print("Copy backward passed") +@wp.func +def unary_func(x: float): + return wp.sin(x) -test_copy_tiled() +@wp.kernel +def tile_unary_map(input: wp.array2d(dtype=float), + output: wp.array2d(dtype=float)): + + # tile index + i, j = wp.tid() + + a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N) + + sa = wp.tile_map(unary_func, a) + + wp.tile_store(output, i, j, sa) + + +def test_tile_unary_map(): + + rng = np.random.default_rng(42) + + M = TILE_M*7 + N = TILE_N*5 + + A = rng.random((M, N), dtype=np.float32) + B = np.sin(A) + + A_grad = np.cos(A) + + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.zeros_like(A_wp, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + + # verify forward pass + assert(np.allclose(B, B_wp.numpy(), rtol=1.e-4)) + print("Unary map forward passed") + + # verify backward pass + B_wp.grad = wp.ones_like(B_wp) + tape.backward() + + assert(np.allclose(A_wp.grad.numpy(), A_grad)) + print("Unary map backward passed") + + +@wp.func +def binary_func(x: float, y: float): + return wp.sin(x) + y + +@wp.kernel +def tile_binary_map(input_a: wp.array2d(dtype=float), + input_b: wp.array2d(dtype=float), + output: wp.array2d(dtype=float)): + + # tile index + i, j = wp.tid() + + a = wp.tile_load(input_a, i, j, m=TILE_M, n=TILE_N) + b = wp.tile_load(input_b, i, j, m=TILE_M, n=TILE_N) + + sa = 
wp.tile_map(binary_func, a, b) + + wp.tile_store(output, i, j, sa) + + +def test_tile_binary_map(): + + rng = np.random.default_rng(42) + + M = TILE_M*7 + N = TILE_N*5 + + A = rng.random((M, N), dtype=np.float32) + B = rng.random((M, N), dtype=np.float32) + C = np.sin(A) + B + + A_grad = np.cos(A) + B_grad = np.ones_like(B) + + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.array(B, requires_grad=True) + C_wp = wp.zeros_like(A_wp, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_binary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp, C_wp], tile_size=8) + + # verify forward pass + assert(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) + print("Binary map forward passed") + + # verify backward pass + C_wp.grad = wp.ones_like(C_wp) + tape.backward() + + assert(np.allclose(A_wp.grad.numpy(), A_grad)) + assert(np.allclose(B_wp.grad.numpy(), B_grad)) + + print("Binary map backward passed") + +test_tile_copy() +test_tile_unary_map() +test_tile_binary_map() # @wp.kernel From 7c2a365164e9203cee82fdf715f31e6b812a2324 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 9 Aug 2024 16:16:50 +1200 Subject: [PATCH 015/102] Update Tile Expressions branch to work with changes to codegen / builtins --- warp/builtins.py | 123 +++++++++++++++++++++++++++++++---------------- warp/codegen.py | 25 +++++----- warp/types.py | 2 + 3 files changed, 96 insertions(+), 54 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 6e40f95b..e81b7f6e 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1697,7 +1697,7 @@ def spatial_vector_dispatch_func(input_types: Mapping[str, type], return_type: A # Tile-based primitives shared_memory_id = 0 -def tile_zeros_value_func(arg_types, kwds, templates): +def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): # return generic type (for doc builds) if arg_types is None: @@ -1706,90 +1706,110 @@ def tile_zeros_value_func(arg_types, kwds, templates): if len(arg_types) > 0: raise RuntimeError("tile_zero() args must be passed by keyword") - if "m" not in kwds: + if "m" not in arg_values: raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") - if "n" not in kwds: + if "n" not in arg_values: raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") - if "dtype" not in kwds: + if "dtype" not in arg_values: raise RuntimeError("'dtype' keyword argument must be specified when calling tile_zeros() function") - m, n, dtype = kwds["m"], kwds["n"], kwds["dtype"] + dtype = arg_values["dtype"] + + return array(dtype=dtype) - templates.append(dtype) - templates.append(m) - templates.append(n) +def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): + + m, n, dtype = arg_values["m"], arg_values["n"], arg_values["dtype"] + + template_args = [] + template_args.append(dtype) + template_args.append(m) + template_args.append(n) global shared_memory_id - templates.append(shared_memory_id) + template_args.append(shared_memory_id) shared_memory_id += 1 - return array(dtype=dtype) + return ([], template_args) + add_builtin( "tile_zeros", input_types={"m": int, "n": int, "dtype": Scalar}, value_func=tile_zeros_value_func, + dispatch_func=tile_zeros_dispatch_func, variadic=True, doc="Allocate a tile local block of zero'd memory", group="Tile Primitives", export=False, ) -def tile_load_value_func(arg_types, kwds, templates): +def tile_load_value_func(arg_types, arg_values): # return generic type 
(for doc builds) if arg_types is None: return array_t(shape=(Any, Any), dtype=Scalar) - if len(arg_types) != 3: - raise RuntimeError("tile_load() requires 3 positional args") + # if len(arg_types) != 3: + # raise RuntimeError("tile_load() requires 3 positional args") - if not is_array(arg_types[0]): + if not is_array(arg_types["a"]): raise RuntimeError("tile_load() argument 0 must be an array") - if not type_is_int(arg_types[1]): + if not type_is_int(arg_types["x"]): raise RuntimeError("tile_load() argument 1 must be an integer") - if not type_is_int(arg_types[2]): + if not type_is_int(arg_types["y"]): raise RuntimeError("tile_load() argument 1 must be an integer") - if "m" not in kwds: + if "m" not in arg_values: raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") - if "n" not in kwds: + if "n" not in arg_values: raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") - m, n = kwds["m"], kwds["n"] - dtype = arg_types[0].dtype + m, n = arg_values["m"], arg_values["n"] + dtype = arg_types["a"].dtype - templates.append(dtype) - templates.append(m) - templates.append(n) + return Tile(dtype, m, n, "load") + + +def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): + + array = arg_values["a"] + x, y = arg_values["x"], arg_values["y"] + m, n = arg_values["m"].constant, arg_values["n"].constant + dtype = arg_values["a"].type.dtype + + template_args = [] + template_args.append(dtype) + template_args.append(m) + template_args.append(n) global shared_memory_id #templates.append(shared_memory_id) shared_memory_id += 1 - return Tile(dtype, m, n, "load")#array(dtype=arg_types[0].dtype) - + return ((array, x, y), template_args) add_builtin( "tile_load", input_types={"a": array(dtype=Any), "x": int, "y": int, "m": int, "n": int}, value_func=tile_load_value_func, + dispatch_func=tile_load_dispatch_func, variadic=True, doc="Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)", group="Tile Primitives", export=False, ) -def tile_store_value_func(arg_types, kwds, templates): +def tile_store_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -1798,16 +1818,16 @@ def tile_store_value_func(arg_types, kwds, templates): if len(arg_types) != 4: raise RuntimeError("tile_store() requires 4 positional args") - if not is_array(arg_types[0]): + if not is_array(arg_types["a"]): raise RuntimeError("tile_store() argument 0 must be an array") - if not type_is_int(arg_types[1]): + if not type_is_int(arg_types["x"]): raise RuntimeError("tile_store() argument 1 must be an integer") - if not type_is_int(arg_types[2]): + if not type_is_int(arg_types["y"]): raise RuntimeError("tile_store() argument 2 must be an integer") - if not is_tile(arg_types[3]): + if not is_tile(arg_types["t"]): raise RuntimeError("tile_store() argument 3 must be a tile") return None @@ -1816,7 +1836,7 @@ def tile_store_value_func(arg_types, kwds, templates): add_builtin( "tile_store", - input_types={"a": array(dtype=Any), "x": int, "y": int, "m": int, "n": int}, + input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, value_func=tile_store_value_func, variadic=True, doc="Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)", @@ -1826,7 +1846,7 @@ def tile_store_value_func(arg_types, kwds, templates): -def tile_matmul_value_func(arg_types, kwds, templates): +def tile_matmul_value_func(arg_types, 
arg_values): # return generic type (for doc builds) if arg_types is None: @@ -1858,29 +1878,48 @@ def tile_matmul_value_func(arg_types, kwds, templates): ) # does type propagation for load() -def tile_map_value_func(arg_types, kwds, _): +def tile_map_value_func(arg_types, arg_values): if arg_types is None: return None - dtype = arg_types[0] - for i in arg_types: - if arg_types[i].dtype != dtype: - raise RuntimeError("tile_map() arguments must all have the same type") + # check all args are tiles + for a in arg_types["args"]: + if not is_tile(a): + raise RuntimeError(f"tile_map() arguments must be tiles, got type {a}") + + # use first argument to define output type + first = arg_types["args"][0] + + # check all args have the same type and dimension + for a in arg_types["args"]: + if a.dtype != first.dtype: + raise RuntimeError(f"tile_map() arguments must all have the same type {first.dtype} != {a.dtype}") + + if a.M != first.M: + raise RuntimeError(f"tile_map() arguments must all have the same m dimension {first.M} != {a.M}") + + if a.N != first.N: + raise RuntimeError(f"tile_map() arguments must all have the same n dimension {first.N} != {a.N}") - input = arg_types[0] - return Tile(dtype=input.dtype, - M=input.M, - N=input.N, + return Tile(dtype=first.dtype, + M=first.M, + N=first.N, op="map") +def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]): + func_args = (args["op"], *args["args"]) + template_args = () + return (func_args, template_args) + add_builtin( "tile_map", - input_types={"op": Callable}, + input_types={"op": Callable, "*args": Any}, value_func=tile_map_value_func, + dispatch_func=tile_map_dispatch_func, variadic=True, doc="Map the operation onto each element of the tile", group="Tile Primitives", diff --git a/warp/codegen.py b/warp/codegen.py index 8870a196..5d017e54 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -1275,7 +1275,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): output = adj.add_var(return_type) output_list = [output] - forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});" + forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" # prepend auto if it is an anonymously typed var (e.g.: a tile op) if output.ctype() == "auto": @@ -1284,7 +1284,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): replay_call = forward_call if func.custom_replay_func is not None: - replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(args_var, use_initializer_list)});" + replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" else: # handle multiple value functions @@ -1307,6 +1307,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): adj.add_forward(alias_call) if not func.missing_grad and len(args): + adj_args = tuple(strip_reference(x) for x in func_args) reverse_has_output_args = ( func.require_original_output_arg or len(output_list) > 1 ) and func.custom_grad_func is None @@ -2611,10 +2612,10 @@ def get_constant_references(adj) -> Dict[str, Any]: #define int(x) cast_int(x) #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret) -#define builtin_tid1d() wp::tid(task_index) -#define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim) -#define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, 
dim) -#define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim) +#define builtin_tid1d() wp::tid(_idx) +#define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim) +#define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim) +#define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim) """ @@ -2629,10 +2630,10 @@ def get_constant_references(adj) -> Dict[str, Any]: #define int(x) cast_int(x) #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret) -#define builtin_tid1d() wp::tid(task_index) -#define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim) -#define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, dim) -#define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim) +#define builtin_tid1d() wp::tid(_idx) +#define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim) +#define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim) +#define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim) """ @@ -2770,7 +2771,7 @@ def get_constant_references(adj) -> Dict[str, Any]: WP_API void {name}_cpu_forward( {forward_args}) {{ - for (size_t task_index = 0; task_index < dim.size; ++task_index) + for (size_t _idx = 0; _idx < dim.size; ++_idx) {{ {name}_cpu_kernel_forward( {forward_params}); @@ -2780,7 +2781,7 @@ def get_constant_references(adj) -> Dict[str, Any]: WP_API void {name}_cpu_backward( {reverse_args}) {{ - for (size_t task_index = 0; task_index < dim.size; ++task_index) + for (size_t _idx = 0; _idx < dim.size; ++_idx) {{ {name}_cpu_kernel_backward( {reverse_params}); diff --git a/warp/types.py b/warp/types.py index 32d3ed90..03065d6d 100644 --- a/warp/types.py +++ b/warp/types.py @@ -1270,6 +1270,8 @@ def type_typestr(dtype): def type_repr(t): if is_array(t): return str(f"array(ndim={t.ndim}, dtype={t.dtype})") + if is_tile(t): + return str(f"tile(dtype={t.dtype}, m={t.M}, n={t.N})") if type_is_vector(t): return str(f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})") if type_is_matrix(t): From ceba991844ad2badec237b40aad854330a83892b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 26 Aug 2024 15:56:42 +1200 Subject: [PATCH 016/102] Working tile expressions with pre-declared tile operation objects --- warp/builtins.py | 86 ++++++++++------ warp/codegen.py | 2 +- warp/native/tile.h | 223 ++++++++++++++++++++++++++++++++++++---- warp/native/tile_gemm.h | 75 +++++++++++++- warp/tests/test_tile.py | 109 ++++++-------------- warp/types.py | 42 ++++++++ 6 files changed, 407 insertions(+), 130 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index e81b7f6e..b721dea1 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1703,8 +1703,8 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str if arg_types is None: return array_t(shape=(Any, Any), dtype=Scalar) - if len(arg_types) > 0: - raise RuntimeError("tile_zero() args must be passed by keyword") + # if len(arg_types) > 0: + # raise RuntimeError("tile_zero() args must be passed by keyword") if "m" not in arg_values: raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") @@ -1715,9 +1715,10 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str if "dtype" not in arg_values: raise RuntimeError("'dtype' keyword argument must be specified when calling tile_zeros() function") + m, n = arg_values["m"], arg_values["n"] dtype = arg_values["dtype"] - return array(dtype=dtype) + return TileZeros(dtype=dtype, M=m, N=n) def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, 
arg_values: Mapping[str, Var]): @@ -1725,13 +1726,13 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar template_args = [] template_args.append(dtype) - template_args.append(m) - template_args.append(n) + template_args.append(m.constant) + template_args.append(n.constant) - global shared_memory_id - template_args.append(shared_memory_id) + # global shared_memory_id + # template_args.append(shared_memory_id) - shared_memory_id += 1 + # shared_memory_id += 1 return ([], template_args) @@ -1772,10 +1773,10 @@ def tile_load_value_func(arg_types, arg_values): if "n" not in arg_values: raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") + a = arg_types["a"] m, n = arg_values["m"], arg_values["n"] - dtype = arg_types["a"].dtype - return Tile(dtype, m, n, "load") + return TileLoad(a, m, n) def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): @@ -1790,10 +1791,9 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg template_args.append(m) template_args.append(n) - global shared_memory_id + #global shared_memory_id #templates.append(shared_memory_id) - - shared_memory_id += 1 + #shared_memory_id += 1 return ((array, x, y), template_args) @@ -1845,6 +1845,31 @@ def tile_store_value_func(arg_types, arg_values): ) +def tile_realize_value_func(arg_types, arg_values): + + # return generic type (for doc builds) + if arg_types is None: + return None + + m, n = arg_values["t"].m, arg_values["n"].n + dtype = arg_values["t"].dtype + + return Tile(dtype, m, n, "realize") + + + +add_builtin( + "tile_realize", + input_types={"t": Tile}, + value_func=tile_realize_value_func, + variadic=True, + doc="Force evaluation of a tile expression tree into local memory", + group="Tile Primitives", + export=False, +) + + + def tile_matmul_value_func(arg_types, arg_values): @@ -1855,24 +1880,24 @@ def tile_matmul_value_func(arg_types, arg_values): if len(arg_types) != 3: raise RuntimeError("tile_matmul() requires 4 positional args") - if not is_array(arg_types[0]): - raise RuntimeError("tile_matmul() argument 0 must be an array") + if not is_tile(arg_types["a"]): + raise RuntimeError("tile_matmul() argument 0 must be a tile") - if not is_array(arg_types[1]): - raise RuntimeError("tile_matmul() argument 1 must be an array") + if not is_tile(arg_types["b"]): + raise RuntimeError("tile_matmul() argument 1 must be an tile") - if not is_array(arg_types[2]): - raise RuntimeError("tile_matmul() argument 2 must be an array") + if not is_tile(arg_types["out"]): + raise RuntimeError("tile_matmul() argument 2 must be an tile") return None add_builtin( "tile_matmul", - input_types={"a": array(dtype=Any), "b": array(dtype=Any), "out": array(dtype=Any)}, + input_types={"a": Tile, "b": Tile, "out": Tile}, value_func=tile_matmul_value_func, variadic=True, - doc="Compute matrix product and accumulate out += a*b", + doc="Compute matrix product and accumulate out += a*b, a and b will be realized before evaluation, and output must already be realized.", group="Tile Primitives", export=False, ) @@ -1883,16 +1908,18 @@ def tile_map_value_func(arg_types, arg_values): if arg_types is None: return None + tiles = arg_types["args"] + # check all args are tiles - for a in arg_types["args"]: + for a in tiles: if not is_tile(a): raise RuntimeError(f"tile_map() arguments must be tiles, got type {a}") # use first argument to define output type - first = arg_types["args"][0] + first = tiles[0] # 
check all args have the same type and dimension - for a in arg_types["args"]: + for a in tiles: if a.dtype != first.dtype: raise RuntimeError(f"tile_map() arguments must all have the same type {first.dtype} != {a.dtype}") @@ -1902,11 +1929,12 @@ def tile_map_value_func(arg_types, arg_values): if a.N != first.N: raise RuntimeError(f"tile_map() arguments must all have the same n dimension {first.N} != {a.N}") - - return Tile(dtype=first.dtype, - M=first.M, - N=first.N, - op="map") + if len(tiles) == 1: + return TileUnaryMap(tiles[0]) + elif len(tiles) == 2: + return TileBinaryMap(tiles[0], tiles[1]) + else: + raise RuntimeError(f"tile_map() must have or two tile arguments") def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]): diff --git a/warp/codegen.py b/warp/codegen.py index 5d017e54..6a9991af 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -607,7 +607,7 @@ def type_to_ctype(t, value_type=False): classstr = f"wp::{type(t).__name__}" return f"{classstr}_t<{dtypestr}>" elif is_tile(t): - return "auto" + return t.ctype() elif isinstance(t, Struct): return make_full_qualified_name(t.cls) elif is_reference(t): diff --git a/warp/native/tile.h b/warp/native/tile.h index b3be6d81..5174e140 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -27,6 +27,37 @@ #endif +/* Tile Expressions + +[x] Forward / Backward code-gen +[ ] wp.tile_map() + [ ] Support user functions + [ ] Support built-in functions + [ ] Support for lambda functions +[ ] wp.tile_matmul() + [ ] Forward + [ ] Reverse +[ ] Support for n-d shape tiles / broadcasting / slicing? +[ ] Compile-time block dimensions +[ ] Support for CUB reductions +[ ] Support for CUB sorts +[ ] Examples + [ ] GEMM + [ ] Batched MLP + [ ] Point cloud alignment + [ ] Layer norm + +*/ + +// wp.tile_load(A, offset, shape) +// wp.tile_load(A, (x, y), (16, 16)) +// wp.tile_load(A, (x, y, z), (3, 3, 3)) + +// wp.tile_load(A, index, shape) +// wp.tile_load(A, x, m) +// wp.tile_load(A, x, y, m, n) +// wp.tile_load(A, x, y, z, m, n, o) +// wp.tile_load(A, x, y, z, m, n, o, p) namespace wp { @@ -78,6 +109,7 @@ struct tile_load_t array_t slice; + tile_load_t() {} tile_load_t(array_t& src, int x, int y) { assert(src.ndim == 2); @@ -132,9 +164,9 @@ struct tile_store_t static constexpr int N = Tile_::N; array_t slice; + Tile tile; - Tile& tile; - + tile_store_t() {} tile_store_t(array_t& dest, int x, int y, Tile& t) : tile(t) { assert(dest.ndim == 2); @@ -190,9 +222,10 @@ struct tile_constant_t static constexpr int N = N_; T c; - T& adj_c; + T* adj_c; - tile_constant_t(const T& c, T& adj_c) : c(c), adj_c(adj_c) {} + tile_constant_t() {} + tile_constant_t(const T& c, T& adj_c) : c(c), adj_c(&adj_c) {} Type fwd(int e) { @@ -201,7 +234,7 @@ struct tile_constant_t void bwd(int e, const T& adj_ret) { - adj_c += adj_ret; + *adj_c += adj_ret; } void print() @@ -212,21 +245,71 @@ struct tile_constant_t } }; +template +struct tile_zeros_t +{ + using Type = T; + static constexpr int M = M_; + static constexpr int N = N_; + + tile_zeros_t() {} + + Type fwd(int e) + { + return Type(0.0); + } + + void bwd(int e, const T& adj_ret) {} + + void print() + { + printf("tile_zeros_t<%d, %d>-+", M, N); + print(c); + printf("\n"); + } +}; + +template +struct tile_ones_t +{ + using Type = T; + static constexpr int M = M_; + static constexpr int N = N_; + + tile_ones_t() {} + + Type fwd(int e) + { + return Type(1.0); + } + void bwd(int e, const T& adj_ret) {} -template + void print() + { + printf("tile_ones_t<%d, %d>-+", M, N); + 
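
For context on how the tile_map() builtin being reworked above is meant to be called from kernel code, the following is a minimal sketch written against the in-progress API in this patch series (wp.tile_load / wp.tile_map / wp.tile_store with explicit m and n keyword sizes). The kernel name, the square() helper, and the 16x16 tile shape are illustrative and are not part of the patches, and since the API is still evolving here the exact signatures may differ from any released version of Warp.

    import warp as wp

    TILE_M = wp.constant(16)
    TILE_N = wp.constant(16)

    @wp.func
    def square(x: float):
        return x * x

    @wp.kernel
    def tile_square_kernel(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
        # one program instance (CUDA block) per (TILE_M, TILE_N) output tile
        i, j = wp.tid()

        a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)   # lazy tile expression
        b = wp.tile_map(square, a)                      # unary map: square applied per element
        wp.tile_store(y, i, j, b)                       # the expression is evaluated when stored

A two-argument form, wp.tile_map(op, a, b), is registered for binary maps with matching dtype and shape, per the value functions in these patches.
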
print(c); + printf("\n"); + } +}; + +template struct tile_unary_map_t { using Type = typename Tile::Type; static constexpr int M = Tile::M; static constexpr int N = Tile::N; - Tile& tile; + using FwdOp = Type(*)(Type); + using AdjOp = void(*)(Type, Type&, Type&); + + Tile tile; FwdOp fwd_fn; AdjOp adj_fn; - tile_unary_map_t(Tile& t, FwdOp f, AdjOp a) : tile(t), fwd_fn(f), adj_fn(a) {} + tile_unary_map_t() {} + tile_unary_map_t(Tile& t, FwdOp fwd, AdjOp adj) : tile(t), fwd_fn(fwd), adj_fn(adj) {} Type fwd(int e) const { @@ -249,7 +332,7 @@ struct tile_unary_map_t } }; -template +template struct tile_binary_map_t { static_assert(wp::is_same::value, "Error"); @@ -260,14 +343,17 @@ struct tile_binary_map_t static constexpr int M = TileA::M; static constexpr int N = TileA::N; - const TileA& tile_a; - const TileB& tile_b; + using FwdOp = Type(*)(Type, Type); + using AdjOp = void(*)(Type, Type, Type&, Type&, Type&); + + TileA tile_a; + TileB tile_b; FwdOp fwd_fn; AdjOp adj_fn; - - tile_binary_map_t(const TileA& a, TileB& b, FwdOp fwd_fn, AdjOp adj_fn) : tile_a(a), tile_b(b), fwd_fn(fwd_fn), adj_fn(adj_fn) {} + tile_binary_map_t() {} + tile_binary_map_t(const TileA& a, TileB& b, FwdOp fwd, AdjOp adj) : tile_a(a), tile_b(b), fwd_fn(fwd), adj_fn(adj) {} Type fwd(int e) const { @@ -300,11 +386,20 @@ struct tile_binary_map_t printf("\n -+"); tile_b.print(); } - }; + +//----------------------------------------------------------------------------------------------------- +// High level entry points for each op (correspond to one Warp builtin) + +template +tile_zeros_t tile_zeros() { return tile_zeros_t(); } + +template +tile_ones_t tile_ones() { return tile_ones_t(); } + // entry point for load template tile_load_t tile_load(array_t& a, int x, int y) @@ -341,19 +436,18 @@ void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_des } - // unary map -template -tile_unary_map_t tile_map_impl(FwdOp fwd, AdjOp adj, Tile& a) +template +tile_unary_map_t tile_map_impl(typename tile_unary_map_t::FwdOp fwd, typename tile_unary_map_t::AdjOp adj, Tile& a) { - return tile_unary_map_t(a, fwd, adj); + return tile_unary_map_t(a, fwd, adj); } // binary map -template -tile_binary_map_t tile_map_impl(FwdOp fwd, AdjOp adj, TileA& a, TileB& b) +template +tile_binary_map_t tile_map_impl(typename tile_binary_map_t::FwdOp fwd, typename tile_binary_map_t::AdjOp adj, TileA& a, TileB& b) { - return tile_binary_map_t(a, b, fwd, adj); + return tile_binary_map_t(a, b, fwd, adj); } // use macro to capture adjoint operator @@ -370,3 +464,90 @@ void adj_tile_map_impl(void) {} } // namespace wp +#if 0 + +//----------------------------------------------------- +// c = a + b + +// forward +auto var_0 = wp::tile_load(var_A, x, y); +auto var_1 = wp::tile_load(var_B, x, y); +auto var_2 = wp::tile_add(var_0, var_1); +wp::tile_store(var_C, x, y, var_2) + +// reverse +wp::adj_store(var_C, x, y, var_2, adj_C, _, _, adj_2) +wp::adj_tile_add(var_0, var_1, adj_0, adj_1, adj_2) +wp::adj_tile_load(var_B, x, y, adj_B, _, _, adj_1); +wp::adj_tile_load(var_B, x, y, adj_B, _, _, adj_0); + + +//----------------------------------------------------- +// x = a[0] +// c = x*2.0 + x + +// forward +auto var_0 = wp::tile_load(var_A, x, y); +auto var_1 = wp::tile_mul(var_0, 2.0); +auto var_2 = wp::tile_add(var_0, var_1); +wp::tile_store(var_C, x, y, var_2) + +struct adj_store_t +{ + adj_store_t() + { + + } + + float bwd(int i, float adj_ret) + { + return array.grad[i]; + } +}; + +template +struct adj_add_t +{ + adj_add_t(P& parent) + { + + } + + 
float bwd(int i, float& adj_a, float& adj_b) + { + // evaluate parent + float adj_ret = parent.bwd(i); + + adj_a += adj_ret; + adj_b += adj_ret; + } +}; + +template +struct adj_tile +{ + adj_tile(T& parent) + { + + } + + + +}; + +void adj_tile_load(A, x, y, adj_A, adj_x, adj_y, adj_ret) +{ + for i in A(x,y): + adj_A[i] += adj_ret(i); +} + + + +// reverse +wp::adj_store(var_C, x, y, var_2, adj_C, _, _, adj_2) // adj_2->adj_C +wp::adj_tile_add(var_0, var_1, adj_0, adj_1, adj_2) // adj_0->adj_2->adj_C, adj_1->adj_2->adj_C +wp::adj_tile_mul(var_0, 2.0, adj_0, _, adj_1); // adj_0->adj_1->adj_2->adj_C +wp::adj_tile_load(var_A, x, y, adj_A, _, _, adj_0); // adj_A->adj_0->adj_1->adj_2->adj_C + + +#endif \ No newline at end of file diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index 91ed329d..15e22cbd 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -5,7 +5,7 @@ // todo: requires CTK, replace with inline ptx #include "cuda_pipeline_primitives.h" -#define USE_CUTE 1 +#define USE_CUTE 0 #if USE_CUTE #include "cutlass/include/cute/tensor.hpp" @@ -15,7 +15,7 @@ namespace wp { - +/* // 2D tile zero template inline CUDA_CALLABLE array_t tile_zeros() @@ -84,6 +84,7 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int i, int j, const array index(dest, i*M + t/N, j*N + t%N) = src.data[t]; } } +*/ template inline CUDA_CALLABLE const T& index(const T* __restrict__ p, int i, int j, int stride) @@ -174,7 +175,7 @@ inline void partition_store(const partition_t& tile, int i, int j, cons #if !USE_CUTE template -inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, const array_t& out) +inline CUDA_CALLABLE void gemm(const array_t& A, const array_t& B, const array_t& out) { const int TILE_M = 4; const int TILE_N = 4; @@ -307,4 +308,72 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, #endif // USE_CUTE +template +struct tile_matmul_t +{ + static_assert(wp::is_same::value, "Error"); + static_assert(TileA::N == TileB::M, "Error, inner dimensions must match"); + static_assert(TileC::M == TileA::M, "Error, first output dimension must match"); + static_assert(TileC::N == TileB::N, "Error, second output dimension must match"); + + using Type = typename TileA::Type; + static constexpr int M = TileC::M; + static constexpr int N = TileC::N; + + const TileA& tile_a; + const TileB& tile_b; + + tile_matmul_t(const TileA &a, TileB &b, TileC &b) : tile_a(a), + tile_b(b), + tile_c(c) {} + + Type fwd(int e) const + { + // load + + + } + + void bwd(int e, Type adj_ret) const + { + Type a = tile_a.fwd(e); + Type b = tile_b.fwd(e); + + Type adj_a = 0.0; + Type adj_b = 0.0; + + adj_fn(a, b, adj_a, adj_b, adj_ret); + + // recurse + tile_a.bwd(e, adj_a); + tile_b.bwd(e, adj_b); + } + + void print() + { + printf("tile_binary_map_t<%d, %d>", M, N); + printf("\n -+"); + tile_a.print(); + printf("\n -+"); + tile_b.print(); + } +}; + +template +void tile_matmul(TileA& a, TileB& b, TileC& c) +{ + // load a to shared + // load b to shared + +} + + +template +void adj_tile_matmul(TileA& a, TileB& b, TileC& c, + TileA& adj_a, TileB& adj_b, TileC& adj_c) +{ +} + + + } // namespace wp \ No newline at end of file diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index beea0746..cc1f4a3a 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -155,36 +155,31 @@ def test_tile_binary_map(): print("Binary map backward passed") -test_tile_copy() -test_tile_unary_map() -test_tile_binary_map() -# @wp.kernel -# def gemm(A: 
wp.array2d(dtype=float), -# B: wp.array2d(dtype=float), -# C: wp.array2d(dtype=float)): +TILE_M = wp.constant(64) +TILE_N = wp.constant(64) +TILE_K = wp.constant(8) -# # output index -# i, j = wp.tid() +# sum = wp.tile_zeros(M,N) -# sum = float(0.0) +# for i in range(5): -# for k in range(0, A.shape[1]): -# sum += A[i, k]*B[k, j] +# a = wp.tile_load(A) +# b = wp.tile_load(B) -# C[i, j] = sum +# a2 = a*2.0 + +# wp.tile_matmul(a2, b, sum) +# wp.tile_store(sum) -# TILE_M = wp.constant(64) -# TILE_N = wp.constant(64) -# TILE_K = wp.constant(8) # @wp.kernel -# def gemm_tiled(A: wp.array2d(dtype=float), -# B: wp.array2d(dtype=float), -# C: wp.array2d(dtype=float)): +# def tile_gemm(A: wp.array2d(dtype=float), +# B: wp.array2d(dtype=float), +# C: wp.array2d(dtype=float)): # # output tile index # i, j = wp.tid() @@ -197,7 +192,7 @@ def test_tile_binary_map(): # count = int(K / TILE_K) # todo: must be the same as TILE_K -# for k in range(count): +# for k in range(0, K, TILE_K): # a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) # b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) @@ -208,68 +203,30 @@ def test_tile_binary_map(): # wp.tile_store(C, i, j, sum) -# s = 0.0 - -# for i, j in tile.shape: - -# s += tile[i-1, i-1] -# s += tile[i, i-1] -# s += tile[i,] - - - -# M = TILE_M*7 -# K = TILE_K*4 -# N = TILE_N*6 - -# rng = np.random.default_rng(42) -# A = rng.random((M, K), dtype=np.float32) -# B = rng.random((K, N), dtype=np.float32) -# C = np.zeros((M, N), dtype=np.float32) - -# A_wp = wp.array(A) -# B_wp = wp.array(B) -# C_wp = wp.array(C) - -# iters = 10 +# def test_tile_gemm(): -# with wp.ScopedTimer("NumPy"): +# M = TILE_M*7 +# K = TILE_K*4 +# N = TILE_N*6 -# for i in range(iters): -# C = A@B +# rng = np.random.default_rng(42) +# A = rng.random((M, K), dtype=np.float32) +# B = rng.random((K, N), dtype=np.float32) +# C = np.zeros((M, N), dtype=np.float32) -# wp.force_load(device="cuda:0") +# A_wp = wp.array(A) +# B_wp = wp.array(B) +# C_wp = wp.array(C) -# with wp.ScopedTimer("Warp", cuda_filter=wp.TIMING_KERNEL): +# iters = 10 -# for i in range(iters): -# wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) +# wp.launch(tile_gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) +# assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) -# print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) - -# for i in range(iters): -# wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=128) -# wp.synchronize() - - -# print(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) - - -# A_tc = torch.from_numpy(A).to("cuda:0") -# B_tc = torch.from_numpy(B).to("cuda:0") -# C_tc = torch.from_numpy(C).to("cuda:0") - -# for i in range(10): -# torch.matmul(A_tc, B_tc, out=C_tc) - -# with wp.ScopedTimer("Torch"): - -# for i in range(iters): -# torch.matmul(A_tc, B_tc, out=C_tc) - -# torch.cuda.synchronize() - - +test_tile_copy() +test_tile_unary_map() +test_tile_binary_map() +#test_tile_gemm() \ No newline at end of file diff --git a/warp/types.py b/warp/types.py index 03065d6d..9d993169 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2869,6 +2869,48 @@ def __init__(self, dtype, M, N, op): self.N = N self.op = op +class TileZeros(Tile): + + def __init__(self, dtype, M, N): + Tile.__init__(self, dtype, M, N, "zeros") + + def ctype(self): + from warp.codegen import Var + return f"wp::tile_zeros_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + +class TileLoad(Tile): + + def __init__(self, array, M, N): + Tile.__init__(self, array.dtype, M, N, "load") + + def ctype(self): + from warp.codegen import Var + 
return f"wp::tile_load_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + +class TileUnaryMap(Tile): + + def __init__(self, t): + Tile.__init__(self, t.dtype, t.M, t.N, "unary_map") + + self.t = t + + def ctype(self): + from warp.codegen import Var + return f"wp::tile_unary_map_t<{self.t.ctype()}>" + +class TileBinaryMap(Tile): + + def __init__(self, a, b): + Tile.__init__(self, a.dtype, a.M, a.N, "binary_map") + + self.a = a + self.b = b + + def ctype(self): + from warp.codegen import Var + return f"wp::tile_binary_map_t<{self.a.ctype()}, {self.b.ctype()}>" + + def is_tile(t): return isinstance(t, Tile) From b61be8f0ef452056351204b6b074019c485ed6e7 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 27 Aug 2024 14:55:54 +1200 Subject: [PATCH 017/102] wp.tile_matmul() and wp.tile_eval() expressions forward mode working --- warp/builtins.py | 84 +++++++++++++++++++++++++++++------------ warp/native/builtin.h | 2 +- warp/native/tile.h | 49 ++++++++++++++++++++++-- warp/native/tile_gemm.h | 63 ++++++++++++++++--------------- warp/tests/test_tile.py | 65 +++++++++++++++---------------- warp/types.py | 12 ++++++ 6 files changed, 182 insertions(+), 93 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index b721dea1..c96b9f56 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1845,63 +1845,97 @@ def tile_store_value_func(arg_types, arg_values): ) -def tile_realize_value_func(arg_types, arg_values): + +def tile_matmul_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: return None - m, n = arg_values["t"].m, arg_values["n"].n - dtype = arg_values["t"].dtype + if len(arg_types) != 3: + raise RuntimeError("tile_matmul() requires 4 positional args") + + if not is_tile(arg_types["a"]): + raise RuntimeError("tile_matmul() argument 0 must be a tile") + + if not is_tile(arg_types["b"]): + raise RuntimeError("tile_matmul() argument 1 must be an tile") + + if not isinstance(arg_types["out"], TileShared): + raise RuntimeError("tile_matmul() output must be a fully evaluated tile, e.g.: created using tile_eval()") + + return None + +def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): - return Tile(dtype, m, n, "realize") + a = arg_values["a"] + b = arg_values["b"] + out = arg_values["out"] + + # template_args.append(dtype) + # template_args.append(m) + # template_args.append(n) + + global shared_memory_id + + template_args = [] + template_args.append(shared_memory_id) + # matmul makes two allocations (one for each of its arguments) + shared_memory_id += 1 + shared_memory_id += 1 + + return ((a, b, out), template_args) add_builtin( - "tile_realize", - input_types={"t": Tile}, - value_func=tile_realize_value_func, + "tile_matmul", + input_types={"a": Tile, "b": Tile, "out": Tile}, + value_func=tile_matmul_value_func, + dispatch_func=tile_matmul_dispatch_func, variadic=True, - doc="Force evaluation of a tile expression tree into local memory", + doc="Compute matrix product and accumulate out += a*b, a and b will be realized before evaluation, and output must already be realized.", group="Tile Primitives", export=False, ) - - - -def tile_matmul_value_func(arg_types, arg_values): +def tile_eval_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: return None - if len(arg_types) != 3: - raise RuntimeError("tile_matmul() requires 4 positional args") + if not is_tile(arg_types["t"]): + raise RuntimeError("tile_eval() argument must be a tile") - if 
not is_tile(arg_types["a"]): - raise RuntimeError("tile_matmul() argument 0 must be a tile") + return TileShared(arg_types["t"]) - if not is_tile(arg_types["b"]): - raise RuntimeError("tile_matmul() argument 1 must be an tile") +def tile_eval_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): - if not is_tile(arg_types["out"]): - raise RuntimeError("tile_matmul() argument 2 must be an tile") + t = arg_values["t"] - return None + global shared_memory_id + + template_args = [] + template_args.append(shared_memory_id) + + # matmul makes two allocations (one for each of its arguments) + shared_memory_id += 1 + return ((t,), template_args) add_builtin( - "tile_matmul", - input_types={"a": Tile, "b": Tile, "out": Tile}, - value_func=tile_matmul_value_func, + "tile_eval", + input_types={"t": Tile}, + value_func=tile_eval_value_func, + dispatch_func=tile_eval_dispatch_func, variadic=True, - doc="Compute matrix product and accumulate out += a*b, a and b will be realized before evaluation, and output must already be realized.", + doc="Force evaluation of a tile expression into shared memory", group="Tile Primitives", export=False, ) + # does type propagation for load() def tile_map_value_func(arg_types, arg_values): diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 544d771d..a899d9a7 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1590,5 +1590,5 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect // only include in kernels for now #if defined(__CUDACC_RTC__) #include "tile.h" -//#include "tile_gemm.h" +#include "tile_gemm.h" #endif \ No newline at end of file diff --git a/warp/native/tile.h b/warp/native/tile.h index 5174e140..c7666513 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -227,12 +227,12 @@ struct tile_constant_t tile_constant_t() {} tile_constant_t(const T& c, T& adj_c) : c(c), adj_c(&adj_c) {} - Type fwd(int e) + Type fwd(int e) const { return c; } - void bwd(int e, const T& adj_ret) + void bwd(int e, const T& adj_ret) const { *adj_c += adj_ret; } @@ -254,12 +254,12 @@ struct tile_zeros_t tile_zeros_t() {} - Type fwd(int e) + Type fwd(int e) const { return Type(0.0); } - void bwd(int e, const T& adj_ret) {} + void bwd(int e, const T& adj_ret) const {} void print() { @@ -389,7 +389,30 @@ struct tile_binary_map_t }; +template +struct tile_shared_t +{ + using Type = T; + static constexpr int M = M_; + static constexpr int N = N_; + + T* data = NULL; + + tile_shared_t() {} + tile_shared_t(T* smem) : data(smem) + { + } + + T fwd(int e) const + { + return data[e]; + } + + void bwd(int e, T adj_ret) const + { + } +}; //----------------------------------------------------------------------------------------------------- // High level entry points for each op (correspond to one Warp builtin) @@ -407,6 +430,24 @@ tile_load_t tile_load(array_t& a, int x, int y) return tile_load_t(a, x, y); } +template +tile_shared_t tile_eval(Tile& t) +{ + WP_TILE_SHARED typename Tile::Type data[Tile::M*Tile::N]; + + // evaluate the input tile and store into shared memory + for (int i=threadIdx.x; i < size(t); i += blockDim.x) + data[i] = t.fwd(i); + + return tile_shared_t(data); +} + +template +void adj_tile_eval(Tile& t, Tile& adj_t, tile_shared_t& adj_ret) +{ + // nop +} + template void adj_tile_load(array_t& a, int x, int y, array_t& adj_a, int adj_x, int adj_y, const tile_load_t& adj_ret) { diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index 15e22cbd..27b5b852 100644 --- 
a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -215,15 +215,18 @@ inline CUDA_CALLABLE void gemm(const array_t& A, const array_t& B, const a } - // 2D gemm accumulate out += A*B -template -inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t& B, const array_t& out) +template +inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A, + const TileB& B, + const TileC& out) { - const int length = out.shape[0]*out.shape[1]; + const int length = size(out); WP_TILE_SYNC(); + using T = typename TileA::Type; + const T* __restrict__ A_ptr = A.data; const T* __restrict__ B_ptr = B.data; T* __restrict__ C_ptr = out.data; @@ -232,21 +235,21 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const array_t& A, const array_t< for (int t=threadIdx.x; t < length; t += blockDim.x) { // compute output index - const int i = t/out.shape[1]; - const int j = t%out.shape[1]; + const int i = t/out.N; + const int j = t%out.N; T sum(0.0); WP_PRAGMA_UNROLL - for (int k=0; k < A.shape[1]; ++k) + for (int k=0; k < A.N; ++k) { - T a = index(A_ptr, i, k, A.shape[1]); - T b = index(B_ptr, k, j, B.shape[1]); + T a = index(A_ptr, i, k, A.N); + T b = index(B_ptr, k, j, B.N); sum = fmaf(a, b, sum); } - index(C_ptr, i, j, out.shape[1]) += sum; + index(C_ptr, i, j, out.N) += sum; } WP_TILE_SYNC(); @@ -311,7 +314,7 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, template struct tile_matmul_t { - static_assert(wp::is_same::value, "Error"); + static_assert(wp::is_same::value, "Error, tile datatypes must match"); static_assert(TileA::N == TileB::M, "Error, inner dimensions must match"); static_assert(TileC::M == TileA::M, "Error, first output dimension must match"); static_assert(TileC::N == TileB::N, "Error, second output dimension must match"); @@ -320,12 +323,13 @@ struct tile_matmul_t static constexpr int M = TileC::M; static constexpr int N = TileC::N; - const TileA& tile_a; - const TileB& tile_b; + TileA tile_a; + TileB tile_b; + TileC tile_c; - tile_matmul_t(const TileA &a, TileB &b, TileC &b) : tile_a(a), - tile_b(b), - tile_c(c) {} + tile_matmul_t(TileA &a, TileB &b, TileC &c) : tile_a(a), + tile_b(b), + tile_c(c) {} Type fwd(int e) const { @@ -336,22 +340,11 @@ struct tile_matmul_t void bwd(int e, Type adj_ret) const { - Type a = tile_a.fwd(e); - Type b = tile_b.fwd(e); - - Type adj_a = 0.0; - Type adj_b = 0.0; - - adj_fn(a, b, adj_a, adj_b, adj_ret); - - // recurse - tile_a.bwd(e, adj_a); - tile_b.bwd(e, adj_b); } void print() { - printf("tile_binary_map_t<%d, %d>", M, N); + printf("tile_matmul_t<%d, %d>", M, N); printf("\n -+"); tile_a.print(); printf("\n -+"); @@ -359,12 +352,20 @@ struct tile_matmul_t } }; -template + +template void tile_matmul(TileA& a, TileB& b, TileC& c) { - // load a to shared - // load b to shared + static_assert(wp::is_same::value, "Error, tile datatypes must match"); + static_assert(TileA::N == TileB::M, "Error, inner dimensions must match"); + static_assert(TileC::M == TileA::M, "Error, first output dimension must match"); + static_assert(TileC::N == TileB::N, "Error, second output dimension must match"); + // load inputs to shared + auto a_shared = tile_eval(a); + auto b_shared = tile_eval(b); + + tile_matmul_scalar(a_shared, b_shared, c); } diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index cc1f4a3a..56a621f0 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -176,57 +176,58 @@ def test_tile_binary_map(): -# @wp.kernel -# def tile_gemm(A: wp.array2d(dtype=float), -# B: 
wp.array2d(dtype=float), -# C: wp.array2d(dtype=float)): +@wp.kernel +def tile_gemm(A: wp.array2d(dtype=float), + B: wp.array2d(dtype=float), + C: wp.array2d(dtype=float)): -# # output tile index -# i, j = wp.tid() + # output tile index + i, j = wp.tid() -# sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + sum = wp.tile_eval(wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)) -# M = A.shape[0] -# N = B.shape[1] -# K = A.shape[1] + M = A.shape[0] + N = B.shape[1] + K = A.shape[1] -# count = int(K / TILE_K) # todo: must be the same as TILE_K + count = int(K / TILE_K) # todo: must be the same as TILE_K -# for k in range(0, K, TILE_K): + for k in range(0, count): -# a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) -# b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) + a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) -# # sum += a*b -# wp.tile_matmul(a, b, sum) + # sum += a*b + wp.tile_matmul(a, b, sum) -# wp.tile_store(C, i, j, sum) + wp.tile_store(C, i, j, sum) -# def test_tile_gemm(): +def test_tile_gemm(): -# M = TILE_M*7 -# K = TILE_K*4 -# N = TILE_N*6 + M = TILE_M*7 + K = TILE_K*4 + N = TILE_N*6 -# rng = np.random.default_rng(42) -# A = rng.random((M, K), dtype=np.float32) -# B = rng.random((K, N), dtype=np.float32) -# C = np.zeros((M, N), dtype=np.float32) + rng = np.random.default_rng(42) + A = rng.random((M, K), dtype=np.float32) + B = rng.random((K, N), dtype=np.float32) + C = np.zeros((M, N), dtype=np.float32) -# A_wp = wp.array(A) -# B_wp = wp.array(B) -# C_wp = wp.array(C) + A_wp = wp.array(A) + B_wp = wp.array(B) + C_wp = wp.array(C) -# iters = 10 + wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=8) -# wp.launch(tile_gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) + assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) -# assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) + # GEMM forward passed + print("Binary map backward passed") test_tile_copy() test_tile_unary_map() test_tile_binary_map() -#test_tile_gemm() \ No newline at end of file +test_tile_gemm() \ No newline at end of file diff --git a/warp/types.py b/warp/types.py index 9d993169..b5f02dba 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2911,6 +2911,18 @@ def ctype(self): return f"wp::tile_binary_map_t<{self.a.ctype()}, {self.b.ctype()}>" +class TileShared(Tile): + + def __init__(self, t): + Tile.__init__(self, t.dtype, t.M, t.N, "shared") + + self.t = t + + def ctype(self): + from warp.codegen import Var + return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + + def is_tile(t): return isinstance(t, Tile) From e3dfca85f7386de95a85cbb96460016e4bbc1af5 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 29 Aug 2024 14:50:31 +1200 Subject: [PATCH 018/102] Working on operator support --- warp/builtins.py | 91 ++++++++++++ warp/native/tile.h | 300 +++++++++++++++++++++++++++++++++++++++- warp/native/tile_gemm.h | 7 + warp/tests/test_tile.py | 98 +++++++++++-- warp/types.py | 17 ++- 5 files changed, 495 insertions(+), 18 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index c96b9f56..fc3438cb 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -4442,3 +4442,94 @@ def matmat_mul_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str add_builtin("unot", input_types={"a": array(dtype=Any)}, value_type=builtins.bool, doc="", group="Operators") + + +# Tile operators +def tile_unary_value_func(arg_types, arg_values): + + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) + 
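
To make the constraint in tile_matmul_value_func() concrete (the output must be a fully evaluated, shared-memory tile rather than a lazy expression), here is a small sketch of a single-tile matmul, assuming the wp.tile_zeros / wp.tile_eval / wp.tile_matmul signatures as they stand at this point in the series. The kernel name and tile sizes are illustrative only.

    import warp as wp

    TILE_M = wp.constant(16)
    TILE_N = wp.constant(16)
    TILE_K = wp.constant(8)

    @wp.kernel
    def single_tile_matmul(A: wp.array2d(dtype=float),
                           B: wp.array2d(dtype=float),
                           C: wp.array2d(dtype=float)):
        i, j = wp.tid()

        a = wp.tile_load(A, i, 0, m=TILE_M, n=TILE_K)
        b = wp.tile_load(B, 0, j, m=TILE_K, n=TILE_N)

        # wp.tile_zeros() by itself only builds a lazy expression; at this point
        # in the series tile_matmul() requires its output to be materialized, so
        # wp.tile_eval() is used to obtain a shared-memory accumulator tile
        acc = wp.tile_eval(wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32))

        wp.tile_matmul(a, b, acc)   # acc += a*b
        wp.tile_store(C, i, j, acc)
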
+ t = arg_types["x"] + + if not is_tile(t): + raise RuntimeError("Expected tile for unary expression") + + return TileUnaryMap(t) + +def tile_scalar_mul_value_func(arg_types, arg_values): + + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) + + x = arg_types["x"] + y = arg_types["y"] + + # tile*scalar + if is_tile(x): + if x.dtype != y: + raise RuntimeError("Scalar factor should have the same type as tile for tile*scalar, tile type: {x} scalar type: {y}") + + return TileBinaryMap(x, TileConstant(x.dtype, x.M, x.N)) + + # scalar*tile + if is_tile(y): + if y.dtype != x: + raise RuntimeError("Scalar factor should have the same type as tile for scalar*tile, tile type: {x} scalar type: {y}") + + return TileBinaryMap(TileConstant(x.dtype, x.M, x.N), y) + + + +# def tile_binary_value_func(arg_types, arg_values): + +# if arg_types is None: +# return Tile(dtype=Any, M=Any, N=Any) + +# a = arg_types[0] + + +# if not is_tile(t): +# raise RuntimeError("Expected tile for unary expression") + +# return TileUnaryMap(t.dtype, t.M, t.N) + +add_builtin( + "neg", + input_types={"x": Tile(dtype=Any, M=Any, N=Any)}, + value_func=tile_unary_value_func, + doc="", + export=False, + native_func="tile_neg", + group="Operators", +) + +add_builtin( + "mul", + input_types={"x": Tile(dtype=Any, M=Any, N=Any), "y": Scalar}, + value_func=tile_scalar_mul_value_func, + doc="", + export=False, + native_func="tile_mul", + group="Operators", +) + +add_builtin( + "mul", + input_types={"x": Scalar, "y": Tile(dtype=Any, M=Any, N=Any)}, + value_func=tile_scalar_mul_value_func, + doc="", + export=False, + native_func="tile_mul", + group="Operators", +) + +# add_builtin( +# "mul", +# input_types={"x": Tile, "s": Scalar}, +# value_func=tile_binary_value_func, +# doc="", +# group="Operators", +# ) + + diff --git a/warp/native/tile.h b/warp/native/tile.h index c7666513..009709a2 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -31,13 +31,13 @@ [x] Forward / Backward code-gen [ ] wp.tile_map() - [ ] Support user functions + [x] Support user functions [ ] Support built-in functions [ ] Support for lambda functions [ ] wp.tile_matmul() - [ ] Forward + [x] Forward [ ] Reverse -[ ] Support for n-d shape tiles / broadcasting / slicing? +[ ] Support for n-d shape tiles / broadcasting / slicing / transpose? 
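
The operator overloads registered above (neg, plus mul in both tile*scalar and scalar*tile forms) are intended to let tile expressions be written with ordinary Python operators inside a kernel. A brief sketch of that intent follows; note that scalar multiply is still being brought up at this point (the tile_operators test later in the series keeps it commented out), so this shows the target surface rather than verified behavior, and per tile_scalar_mul_value_func() the scalar must have the same dtype as the tile.

    import warp as wp

    @wp.kernel
    def tile_scale(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
        i, j = wp.tid()

        a = wp.tile_load(x, i, j, m=32, n=8)

        b = -a        # unary map expression (tile_neg)
        c = b * 0.5   # tile * scalar: the scalar is promoted to a constant tile
        d = 0.5 * c   # scalar * tile: same map with the operands swapped

        wp.tile_store(y, i, j, d)
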
[ ] Compile-time block dimensions [ ] Support for CUB reductions [ ] Support for CUB sorts @@ -46,7 +46,7 @@ [ ] Batched MLP [ ] Point cloud alignment [ ] Layer norm - + */ // wp.tile_load(A, offset, shape) @@ -388,7 +388,251 @@ struct tile_binary_map_t } }; +//----------------------------------------------- +// Operators + + +template +CUDA_CALLABLE inline tile_unary_map_t tile_pos(const Tile& t) +{ + return tile_unary_map_t(t, [](typename Tile::Type x) { return pos(x); } ); +} + +template +CUDA_CALLABLE inline tile_unary_map_t tile_neg(Tile& t) +{ + typedef tile_unary_map_t Op; + + typename Op::FwdOp fwd = [](typename Tile::Type x) { return neg(x); }; + typename Op::AdjOp adj = [](typename Tile::Type x, typename Tile::Type& adj_x, typename Tile::Type& adj_ret) { adj_neg(x, adj_x, adj_ret); }; + + return Op(t, fwd, adj); +} + +template +CUDA_CALLABLE inline void adj_tile_neg(const Tile& t, Tile& adj_t, tile_unary_map_t& adj_ret) +{ + // nop +} + + +/* + +template +CUDA_CALLABLE inline vec_t neg(const vec_t& x) +{ + return -x; +} + +template +CUDA_CALLABLE inline vec_t<3, Type> neg(const vec_t<3, Type>& x) +{ + return vec_t<3, Type>(-x.c[0], -x.c[1], -x.c[2]); +} + +template +CUDA_CALLABLE inline vec_t<2, Type> neg(const vec_t<2, Type>& x) +{ + return vec_t<2, Type>(-x.c[0], -x.c[1]); +} + +template +CUDA_CALLABLE inline void adj_neg(const vec_t& x, vec_t& adj_x, const vec_t& adj_ret) +{ + adj_x -= adj_ret; +} + +// equality: +template +inline CUDA_CALLABLE bool operator ==(const vec_t& a, const vec_t& b) +{ + for( unsigned i=0; i < Length; ++i ) + { + if(a[i] != b[i]) + { + return false; + } + } + return true; +} + +// scalar multiplication: +template +inline CUDA_CALLABLE vec_t mul(vec_t a, Type s) +{ + vec_t ret; + for( unsigned i=0; i < Length; ++i ) + { + ret[i] = a[i] * s; + } + return ret; +} + +template +inline CUDA_CALLABLE vec_t<3, Type> mul(vec_t<3, Type> a, Type s) +{ + return vec_t<3, Type>(a.c[0]*s,a.c[1]*s,a.c[2]*s); +} + +template +inline CUDA_CALLABLE vec_t<2, Type> mul(vec_t<2, Type> a, Type s) +{ + return vec_t<2, Type>(a.c[0]*s,a.c[1]*s); +} + +template +inline CUDA_CALLABLE vec_t mul(Type s, vec_t a) +{ + return mul(a, s); +} + +template +inline CUDA_CALLABLE vec_t operator*(Type s, vec_t a) +{ + return mul(a, s); +} + +template +inline CUDA_CALLABLE vec_t operator*(vec_t a, Type s) +{ + return mul(a, s); +} + + +// component wise multiplication: +template +inline CUDA_CALLABLE vec_t cw_mul(vec_t a, vec_t b) +{ + vec_t ret; + for( unsigned i=0; i < Length; ++i ) + { + ret[i] = a[i] * b[i]; + } + return ret; +} + +// division +template +inline CUDA_CALLABLE vec_t div(vec_t a, Type s) +{ + vec_t ret; + for( unsigned i=0; i < Length; ++i ) + { + ret[i] = a[i] / s; + } + return ret; +} + +template +inline CUDA_CALLABLE vec_t<3, Type> div(vec_t<3, Type> a, Type s) +{ + return vec_t<3, Type>(a.c[0]/s,a.c[1]/s,a.c[2]/s); +} + +template +inline CUDA_CALLABLE vec_t<2, Type> div(vec_t<2, Type> a, Type s) +{ + return vec_t<2, Type>(a.c[0]/s,a.c[1]/s); +} + +template +inline CUDA_CALLABLE vec_t div(Type s, vec_t a) +{ + vec_t ret; + for (unsigned i=0; i < Length; ++i) + { + ret[i] = s / a[i]; + } + return ret; +} + +template +inline CUDA_CALLABLE vec_t<3, Type> div(Type s, vec_t<3, Type> a) +{ + return vec_t<3, Type>(s/a.c[0],s/a.c[1],s/a.c[2]); +} + +template +inline CUDA_CALLABLE vec_t<2, Type> div(Type s, vec_t<2, Type> a) +{ + return vec_t<2, Type>(s/a.c[0],s/a.c[1]); +} + +template +inline CUDA_CALLABLE vec_t operator / (vec_t a, Type s) +{ + return div(a,s); +} + +template 
+inline CUDA_CALLABLE vec_t operator / (Type s, vec_t a) +{ + return div(s, a); +} + +// component wise division +template +inline CUDA_CALLABLE vec_t cw_div(vec_t a, vec_t b) +{ + vec_t ret; + for( unsigned i=0; i < Length; ++i ) + { + ret[i] = a[i] / b[i]; + } + return ret; +} + +// addition +template +inline CUDA_CALLABLE vec_t add(vec_t a, vec_t b) +{ + vec_t ret; + for( unsigned i=0; i < Length; ++i ) + { + ret[i] = a[i] + b[i]; + } + return ret; +} + +template +inline CUDA_CALLABLE vec_t<2, Type> add(vec_t<2, Type> a, vec_t<2, Type> b) +{ + return vec_t<2, Type>( a.c[0] + b.c[0], a.c[1] + b.c[1]); +} + +template +inline CUDA_CALLABLE vec_t<3, Type> add(vec_t<3, Type> a, vec_t<3, Type> b) +{ + return vec_t<3, Type>( a.c[0] + b.c[0], a.c[1] + b.c[1], a.c[2] + b.c[2]); +} + +// subtraction +template +inline CUDA_CALLABLE vec_t sub(vec_t a, vec_t b) +{ + vec_t ret; + for( unsigned i=0; i < Length; ++i ) + { + ret[i] = Type(a[i] - b[i]); + } + return ret; +} + +template +inline CUDA_CALLABLE vec_t<2, Type> sub(vec_t<2, Type> a, vec_t<2, Type> b) +{ + return vec_t<2, Type>( a.c[0] - b.c[0], a.c[1] - b.c[1]); +} + +template +inline CUDA_CALLABLE vec_t<3, Type> sub(vec_t<3, Type> a, vec_t<3, Type> b) +{ + return vec_t<3, Type>( a.c[0] - b.c[0], a.c[1] - b.c[1], a.c[2] - b.c[2]); +} +*/ + +// represents a fully evaluated tile in shared memory template struct tile_shared_t { @@ -495,7 +739,6 @@ tile_binary_map_t tile_map_impl(typename tile_binary_map_t(var, adj_##var) + +/* +// handle tile*scalar +template +CUDA_CALLABLE inline auto tile_mul_impl(Tile& t, typename Tile::Type s, + Tile& adj_t, typename Tile::Type adj_s) +{ + typedef typename Tile::Type T; + typedef tile_constant_t Constant; + + typedef tile_binary_map_t Op; + + typename Op::FwdOp fwd = [](T a, T b) { return mul(a, b); }; + typename Op::AdjOp adj = [](T a, T b, T& adj_a, T& adj_b, T& adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); }; + + // promote scalar to constant tile + Constant c(s, adj_s); + + return Op(t, c, fwd, adj); +} + +// handle scalar*tile +template +CUDA_CALLABLE inline auto tile_mul_impl(typename Tile::Type s, Tile& t, + typename Tile::Type adj_s, Tile& adj_t) +{ + typedef typename Tile::Type T; + typedef tile_constant_t Constant; + + typedef tile_binary_map_t Op; + + typename Op::FwdOp fwd = [](T a, T b) { return mul(a, b); }; + typename Op::AdjOp adj = [](T a, T b, T& adj_a, T& adj_b, T& adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); }; + + // promote scalar to constant tile + Constant c(s, adj_s); + + return Op(c, t, fwd, adj); + +} + + +#define tile_mul(a, b) tile_mul_impl(a, b adj_##a, adj_##b) +#define tile_add(a, b) tile_add_impl(a, b adj_##a, adj_##b) +*/ + + } // namespace wp #if 0 diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index 27b5b852..b1d3435e 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -373,6 +373,13 @@ template void adj_tile_matmul(TileA& a, TileB& b, TileC& c, TileA& adj_a, TileB& adj_b, TileC& adj_c) { + + // auto a_shared = tile_eval(a); + // auto b_shared = tile_eval(b); + // auto adj_c_shared = tile_eval(b); + + // tile_matmul_scalar(adj_c, wp.tile_transpose(b), adj_a); + // tile_matmul_scalar(wp.tile_transpose(a), adj_c, adj_b); } diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 56a621f0..e0c34de2 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -156,24 +156,98 @@ def test_tile_binary_map(): print("Binary map backward passed") +@wp.kernel +def tile_operators(input: wp.array3d(dtype=float), + output: 
wp.array3d(dtype=float)): + + # output tile index + i = wp.tid() + + a = wp.tile_load(input[i], 0, 0, m=32, n=8) + + # neg + b = -a + + # scalar multiply +# c = b*0.5 + + # # add tiles + # c = a + b + + wp.tile_store(output[i], 0, 0, b) + + +def test_tile_operators(): + + batch_count = 56 + + M = 32 + N = 8 + + rng = np.random.default_rng(42) + input = rng.random((batch_count, M, N), dtype=np.float32) + output = -input + + input_wp = wp.array(input) + output_wp = wp.zeros_like(input_wp) + + wp.launch(tile_operators, dim=batch_count, inputs=[input_wp, output_wp], tile_size=8) + + assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) + + print("operators forward passed") + + TILE_M = wp.constant(64) TILE_N = wp.constant(64) TILE_K = wp.constant(8) -# sum = wp.tile_zeros(M,N) +@wp.kernel +def tile_grouped_gemm(A: wp.array3d(dtype=float), + B: wp.array3d(dtype=float), + C: wp.array3d(dtype=float)): -# for i in range(5): + # output tile index + i = wp.tid() -# a = wp.tile_load(A) -# b = wp.tile_load(B) + a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K) + b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N) -# a2 = a*2.0 - -# wp.tile_matmul(a2, b, sum) + sum = wp.tile_eval(wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)) + + wp.tile_matmul(a, b, sum) + + wp.tile_store(C[i], 0, 0, sum) + + +def test_tile_batched_gemm(): + + batch_count = 56 + + M = TILE_M + N = TILE_N + K = TILE_K -# wp.tile_store(sum) + rng = np.random.default_rng(42) + A = rng.random((batch_count, M, K), dtype=np.float32) + B = rng.random((batch_count, K, N), dtype=np.float32) + C = np.zeros((batch_count, M, N), dtype=np.float32) + + A_wp = wp.array(A) + B_wp = wp.array(B) + C_wp = wp.array(C) + wp.launch(tile_grouped_gemm, dim=batch_count, inputs=[A_wp, B_wp, C_wp], tile_size=8) + + # bring back to host + C_wp = C_wp.numpy() + + for i in range(batch_count): + assert(np.allclose(A[i]@B[i], C_wp[i], rtol=1.e-4)) + + # GEMM forward passed + print("batched matmul forward passed") @wp.kernel @@ -190,7 +264,7 @@ def tile_gemm(A: wp.array2d(dtype=float), N = B.shape[1] K = A.shape[1] - count = int(K / TILE_K) # todo: must be the same as TILE_K + count = int(K / TILE_K) for k in range(0, count): @@ -223,11 +297,13 @@ def test_tile_gemm(): assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) # GEMM forward passed - print("Binary map backward passed") + print("matmul forward passed") test_tile_copy() test_tile_unary_map() test_tile_binary_map() -test_tile_gemm() \ No newline at end of file +test_tile_batched_gemm() +test_tile_gemm() +test_tile_operators() \ No newline at end of file diff --git a/warp/types.py b/warp/types.py index b5f02dba..1074b3df 100644 --- a/warp/types.py +++ b/warp/types.py @@ -1405,7 +1405,10 @@ def types_equal(a, b, match_generic=False): if is_array(a) and type(a) is type(b): return True - + + if is_tile(a) and is_tile(b): + return True + return scalars_equal(a, b, match_generic) @@ -2863,7 +2866,7 @@ def array_type_id(a): # tile expression objects class Tile: - def __init__(self, dtype, M, N, op): + def __init__(self, dtype, M, N, op=None): self.dtype = dtype self.M = M self.N = N @@ -2878,6 +2881,16 @@ def ctype(self): from warp.codegen import Var return f"wp::tile_zeros_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" +class TileConstant(Tile): + + def __init__(self, dtype, M, N): + Tile.__init__(self, dtype, M, N, "zeros") + + def ctype(self): + from warp.codegen import Var + return f"wp::tile_constant_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + + class TileLoad(Tile): def __init__(self, 
array, M, N): From 04fd859fd0a8036ad0cb612981bf7fccb101fcae Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 3 Sep 2024 15:23:46 +1200 Subject: [PATCH 019/102] Working implementation of register based tiles for most tests, added support for compile time block dimensions for tile kernels --- warp/builtins.py | 112 +++--- warp/codegen.py | 17 +- warp/context.py | 23 +- warp/native/tile.h | 749 +++++++++++----------------------------- warp/native/tile_gemm.h | 12 +- warp/tape.py | 10 +- warp/tests/test_tile.py | 20 +- warp/types.py | 57 ++- 8 files changed, 335 insertions(+), 665 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index fc3438cb..abe2d7b5 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1729,10 +1729,9 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar template_args.append(m.constant) template_args.append(n.constant) - # global shared_memory_id - # template_args.append(shared_memory_id) - - # shared_memory_id += 1 + global shared_memory_id + template_args.append(shared_memory_id) + shared_memory_id += 1 return ([], template_args) @@ -1791,9 +1790,9 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg template_args.append(m) template_args.append(n) - #global shared_memory_id - #templates.append(shared_memory_id) - #shared_memory_id += 1 + global shared_memory_id + template_args.append(shared_memory_id) + shared_memory_id += 1 return ((array, x, y), template_args) @@ -1861,8 +1860,12 @@ def tile_matmul_value_func(arg_types, arg_values): if not is_tile(arg_types["b"]): raise RuntimeError("tile_matmul() argument 1 must be an tile") - if not isinstance(arg_types["out"], TileShared): - raise RuntimeError("tile_matmul() output must be a fully evaluated tile, e.g.: created using tile_eval()") + if not isinstance(arg_types["out"], Tile): + raise RuntimeError("tile_matmul() output argument must be a tile") + + if arg_types["out"].storage != "shared": + raise RuntimeError("tile_matmul() output argument must have shared memory storage") + return None @@ -1876,14 +1879,14 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a # template_args.append(m) # template_args.append(n) - global shared_memory_id + # global shared_memory_id template_args = [] - template_args.append(shared_memory_id) + # template_args.append(shared_memory_id) - # matmul makes two allocations (one for each of its arguments) - shared_memory_id += 1 - shared_memory_id += 1 + # # matmul makes two allocations (one for each of its arguments) + # shared_memory_id += 1 + # shared_memory_id += 1 return ((a, b, out), template_args) @@ -1937,38 +1940,18 @@ def tile_eval_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg # does type propagation for load() -def tile_map_value_func(arg_types, arg_values): +def tile_unary_map_value_func(arg_types, arg_values): if arg_types is None: return None - tiles = arg_types["args"] + a = arg_types["a"] # check all args are tiles - for a in tiles: - if not is_tile(a): - raise RuntimeError(f"tile_map() arguments must be tiles, got type {a}") - - # use first argument to define output type - first = tiles[0] + if not is_tile(a): + raise RuntimeError(f"tile_map() arguments must be tiles, got type {a}") - # check all args have the same type and dimension - for a in tiles: - if a.dtype != first.dtype: - raise RuntimeError(f"tile_map() arguments must all have the same type {first.dtype} != {a.dtype}") - - if a.M != first.M: - raise RuntimeError(f"tile_map() arguments 
must all have the same m dimension {first.M} != {a.M}") - - if a.N != first.N: - raise RuntimeError(f"tile_map() arguments must all have the same n dimension {first.N} != {a.N}") - - if len(tiles) == 1: - return TileUnaryMap(tiles[0]) - elif len(tiles) == 2: - return TileBinaryMap(tiles[0], tiles[1]) - else: - raise RuntimeError(f"tile_map() must have or two tile arguments") + return TileUnaryMap(a) def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]): @@ -1979,10 +1962,51 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar add_builtin( "tile_map", - input_types={"op": Callable, "*args": Any}, - value_func=tile_map_value_func, - dispatch_func=tile_map_dispatch_func, - variadic=True, + input_types={"op": Callable, "a": Any}, + value_func=tile_unary_map_value_func, + #dispatch_func=tile_map_dispatch_func, + #variadic=True, + native_func="tile_unary_map", + doc="Map the operation onto each element of the tile", + group="Tile Primitives", + export=False, +) + +def tile_binary_map_value_func(arg_types, arg_values): + + if arg_types is None: + return None + + a = arg_types["a"] + b = arg_types["b"] + + # check all args are tiles + if not is_tile(a): + raise RuntimeError(f"tile_map() arguments must be tiles, got type {a}") + + if not is_tile(b): + raise RuntimeError(f"tile_map() arguments must be tiles, got type {b}") + + # use first argument to define output type + if a.dtype != b.dtype: + raise RuntimeError(f"tile_map() arguments must all have the same type {a.dtype} != {b.dtype}") + + if a.M != b.M: + raise RuntimeError(f"tile_map() arguments must all have the same m dimension {a.M} != {b.M}") + + if a.N != b.N: + raise RuntimeError(f"tile_map() arguments must all have the same n dimension {a.N} != {b.N}") + + return TileBinaryMap(a, b) + + +add_builtin( + "tile_map", + input_types={"op": Callable, "a": Any, "b": Any}, + value_func=tile_binary_map_value_func, + #dispatch_func=tile_map_dispatch_func, + #variadic=True, + native_func="tile_binary_map", doc="Map the operation onto each element of the tile", group="Tile Primitives", export=False, @@ -4464,7 +4488,7 @@ def tile_scalar_mul_value_func(arg_types, arg_values): x = arg_types["x"] y = arg_types["y"] - + # tile*scalar if is_tile(x): if x.dtype != y: diff --git a/warp/codegen.py b/warp/codegen.py index 6a9991af..9a38d7c1 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -942,9 +942,10 @@ def format_args(adj, prefix, args): if isinstance(a, warp.context.Function): # functions don't have a var_ prefix so strip it off here if prefix == "var": - arg_strs.append(a.key) + arg_strs.append(f"{a.namespace}{a.key}") else: - arg_strs.append(f"{prefix}_{a.key}") + arg_strs.append(f"{a.namespace}{prefix}_{a.key}") + elif is_reference(a.type): arg_strs.append(f"{prefix}_{a}") elif isinstance(a, Var): @@ -2602,6 +2603,7 @@ def get_constant_references(adj) -> Dict[str, Any]: # code generation cpu_module_header = """ +#define WP_TILE_BLOCK_DIM {tile_size} #define WP_NO_CRT #include "builtin.h" @@ -2620,6 +2622,7 @@ def get_constant_references(adj) -> Dict[str, Any]: """ cuda_module_header = """ +#define WP_TILE_BLOCK_DIM {tile_size} #define WP_NO_CRT #include "builtin.h" @@ -3013,10 +3016,6 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): for var in adj.variables: - # do not predeclare vars with auto type - if var.ctype() == "auto": - continue - if var.constant is None: lines += [f"{var.ctype()} {var.emit()};\n"] else: @@ -3029,8 +3028,10 @@ def 
codegen_func_reverse(adj, func_type="kernel", device="cpu"): for var in adj.variables: name = var.emit_adj() ctype = var.ctype(value_type=True) - - if ctype != "auto": + + if is_tile(var.type) and var.type.storage == "shared": + lines += [f"{ctype} {name} = wp::tile_alloc_shared<{Var.type_to_ctype(var.type.dtype)},{var.type.M},{var.type.N},{var.type.alloc()}>();\n"] + else: lines += [f"{ctype} {name} = {{}};\n"] # forward pass diff --git a/warp/context.py b/warp/context.py index 1d066f66..95f36afb 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1404,9 +1404,9 @@ def codegen(self, device): # add headers if device == "cpu": - source = warp.codegen.cpu_module_header + source + source = warp.codegen.cpu_module_header.format(tile_size=self.options["tile_size"]) + source else: - source = warp.codegen.cuda_module_header + source + source = warp.codegen.cuda_module_header.format(tile_size=self.options["tile_size"]) + source return source @@ -1439,6 +1439,7 @@ def __init__(self, name, loader): "fast_math": False, "cuda_output": None, # supported values: "ptx", "cubin", or None (automatic) "mode": warp.config.mode, + "tile_size": 0 } # kernel hook lookup per device @@ -1682,11 +1683,18 @@ def hash_recursive(module, visited): return hash_recursive(self, visited=set()) - def load(self, device) -> bool: + def load(self, device, tile_size=0) -> bool: from warp.utils import ScopedTimer device = get_device(device) + # re-compile module if tile size (blockdim) changes + # todo: it would be better to have a method such as `module.get_kernel(tile_size=N)` + # that can return a single kernel instance with a given block size + if self.options["tile_size"] != tile_size: + self.unload() + self.options["tile_size"] = tile_size + if device.is_cpu: # check if already loaded if self.cpu_module: @@ -1695,7 +1703,7 @@ def load(self, device) -> bool: if self.cpu_build_failed: return False if not warp.is_cpu_available(): - raise RuntimeError("Failed to build CPU module because no CPU buildchain was found") + raise RuntimeError("Failed to build CPU module because no CPU build chain was found") else: # check if already loaded if device.context in self.cuda_modules: @@ -4630,7 +4638,7 @@ def launch( record_tape=True, record_cmd=False, max_blocks=0, - tile_size=1, + tile_size=0, ): """Launch a Warp kernel on the target device @@ -4650,6 +4658,7 @@ def launch( record_cmd: When True the launch will be returned as a ``Launch`` command object, the launch will not occur until the user calls ``cmd.launch()`` max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches. If negative or zero, the maximum hardware value will be used. 
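
For reference, this is how the tile_size launch argument introduced here is used from the host side, following the pattern in the updated test_tile.py later in this patch: one 2D program instance per output tile, with tile_size giving the threads per instance (it becomes the WP_TILE_BLOCK_DIM compile-time block dimension). The snippet assumes the TILE_M/TILE_N/TILE_K constants and the tile_gemm kernel from the tests; the problem sizes and tile_size=64 are arbitrary illustrative values, and as the module-loading change above shows, switching tile_size currently forces a module rebuild.

    import numpy as np
    import warp as wp

    M = TILE_M * 4   # problem sizes chosen as multiples of the tile shape
    K = TILE_K * 4
    N = TILE_N * 4

    rng = np.random.default_rng(42)
    A_wp = wp.array(rng.random((M, K), dtype=np.float32))
    B_wp = wp.array(rng.random((K, N), dtype=np.float32))
    C_wp = wp.zeros((M, N), dtype=wp.float32)

    # one program instance per (TILE_M, TILE_N) output tile, each executed by
    # tile_size threads (one CUDA block)
    wp.launch(tile_gemm,
              dim=(int(M / TILE_M), int(N / TILE_N)),
              inputs=[A_wp, B_wp, C_wp],
              tile_size=64)
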
+ tile_size: The number of threads per-program instance """ init() @@ -4704,7 +4713,7 @@ def pack_args(args, params, adjoint=False): # delay load modules, including new overload if needed module = kernel.module - if not module.load(device): + if not module.load(device, tile_size): return # late bind @@ -4788,7 +4797,7 @@ def pack_args(args, params, adjoint=False): # record file, lineno, func as metadata frame = inspect.currentframe().f_back caller = {"file": frame.f_code.co_filename, "lineno": frame.f_lineno, "func": frame.f_code.co_name} - runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, metadata={"caller": caller}) + runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, tile_size, metadata={"caller": caller}) # detect illegal inter-kernel read/write access patterns if verification flag is set if warp.config.verify_autograd_array_access: diff --git a/warp/native/tile.h b/warp/native/tile.h index 009709a2..4315eda7 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -27,18 +27,21 @@ #endif + + /* Tile Expressions [x] Forward / Backward code-gen [ ] wp.tile_map() - [x] Support user functions - [ ] Support built-in functions + [x] Support user functions + [x] Support built-in functions [ ] Support for lambda functions + [ ] Infer tile_map() output from operator type (e.g.: dot for each element) [ ] wp.tile_matmul() [x] Forward [ ] Reverse [ ] Support for n-d shape tiles / broadcasting / slicing / transpose? -[ ] Compile-time block dimensions +[x] Compile-time block dimensions [ ] Support for CUB reductions [ ] Support for CUB sorts [ ] Examples @@ -46,7 +49,10 @@ [ ] Batched MLP [ ] Point cloud alignment [ ] Layer norm - +[ ] Error checking + [ ] Ensure functions passed to tile_map() are compatible with tile type + [ ] Ensure that args passed to tile ops are compatible + */ // wp.tile_load(A, offset, shape) @@ -85,7 +91,7 @@ void print_tile(T& t) printf("%*s[", i>0, ""); for (int j=0; j < T::N; ++j) { - printf("%5.2f ", t.fwd(i*T::N + j)); + printf("%5.2f ", t.data[i*T::N + j]); } if (i == T::M-1) @@ -95,656 +101,293 @@ void print_tile(T& t) } } - template -int size(Tile& t) { return Tile::M*Tile::N; } +int tile_size(Tile& t) { return Tile::M*Tile::N; } +constexpr int tile_regcount(int m, int n) { + return (m*n + WP_TILE_BLOCK_DIM - 1) / WP_TILE_BLOCK_DIM; +} -template -struct tile_load_t +struct coord_t { - using Type = T; - static constexpr int M = M_; - static constexpr int N = N_; - - array_t slice; - - tile_load_t() {} - tile_load_t(array_t& src, int x, int y) - { - assert(src.ndim == 2); - - // compute offsets into original array and store a view - const int i = x*M; - const int j = y*N; - - // slice into src - if (src.data) - slice.data = data_at_byte_offset(src, byte_offset(src, i, j)); - if (src.grad) - slice.grad = grad_at_byte_offset(src, byte_offset(src, i, j)); - - slice.shape[0] = M; - slice.shape[1] = N; - slice.strides[0] = src.strides[0]; - slice.strides[1] = src.strides[1]; - slice.ndim = 2; - } - - Type fwd(int e) const - { - int i = e/N; - int j = e%N; - - return index(slice, i, j); - } - - void bwd(int e, const T& adj_ret) const - { - int i = e/N; - int j = e%N; - - if (slice.grad) - atomic_add(&index_grad(slice, i, j), adj_ret); - } - - void print() - { - printf("tile_load_t<%d, %d>\n", M, N); - } - + int i; + int j; }; -template -struct tile_store_t -{ - using Tile = Tile_; - using Type = typename Tile_::Type; - static constexpr int M = Tile_::M; - static constexpr int N = Tile_::N; - - array_t slice; - Tile 
tile; - - tile_store_t() {} - tile_store_t(array_t& dest, int x, int y, Tile& t) : tile(t) - { - assert(dest.ndim == 2); - - // compute offsets into original array and store a view - const int i = x*M; - const int j = y*N; - - // slice into dest - if (dest.data) - slice.data = data_at_byte_offset(dest, byte_offset(dest, i, j)); - if (dest.grad) - slice.grad = grad_at_byte_offset(dest, byte_offset(dest, i, j)); - - slice.shape[0] = M; - slice.shape[1] = N; - slice.strides[0] = dest.strides[0]; - slice.strides[1] = dest.strides[1]; - slice.ndim = 2; - } - - void fwd(int e) const - { - int i = e/N; - int j = e%N; - - index(slice, i, j) = tile.fwd(e); - } - - void bwd(int e) const - { - int i = e/N; - int j = e%N; - // materialize gradient (runs entire graph backward), reading incoming grads from the dest - if (slice.grad) - tile.bwd(e, index_grad(slice, i, j)); - } +template +inline CUDA_CALLABLE T* tile_alloc_shared() +{ + WP_TILE_SHARED __align__(16) T data[M*N]; - void print() - { - printf("tile_load_t<%d, %d>-+", M, N); - print(tile); - } -}; + for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + data[i] = T(0); + return data; +} template -struct tile_constant_t +struct tile_shared_t { using Type = T; static constexpr int M = M_; static constexpr int N = N_; - T c; - T* adj_c; - - tile_constant_t() {} - tile_constant_t(const T& c, T& adj_c) : c(c), adj_c(&adj_c) {} + T* data = NULL; - Type fwd(int e) const + tile_shared_t() {} + tile_shared_t(T* smem) : data(smem) { - return c; } - void bwd(int e, const T& adj_ret) const + struct iterator { - *adj_c += adj_ret; - } + tile_shared_t& tile; + int offset; + + inline CUDA_CALLABLE iterator(tile_shared_t& t, int i) : tile(t), offset(i) {} + inline CUDA_CALLABLE T& operator*() const { return tile.data[offset]; } + inline CUDA_CALLABLE iterator& operator++() { offset += WP_TILE_BLOCK_DIM; return *this; } + inline CUDA_CALLABLE bool valid() const { return index() < tile_size(tile); } + + // linear index into the tile's data (assuming row-major layout) + inline CUDA_CALLABLE int index() const { return offset; } + inline CUDA_CALLABLE coord_t coord() const + { + int i = index(); + return {i/N, i%N}; + } + }; - void print() - { - printf("tile_constant_t<%d, %d>-+", M, N); - print(c); - printf("\n"); - } + iterator iter() { return iterator(*this, threadIdx.x); } }; -template -struct tile_zeros_t -{ - using Type = T; - static constexpr int M = M_; - static constexpr int N = N_; - - tile_zeros_t() {} - - Type fwd(int e) const - { - return Type(0.0); - } - - void bwd(int e, const T& adj_ret) const {} - - void print() - { - printf("tile_zeros_t<%d, %d>-+", M, N); - print(c); - printf("\n"); - } -}; template -struct tile_ones_t +struct tile_register_t { using Type = T; static constexpr int M = M_; static constexpr int N = N_; + static constexpr int NumRegs = tile_regcount(M, N); - tile_ones_t() {} - - Type fwd(int e) - { - return Type(1.0); - } - - void bwd(int e, const T& adj_ret) {} - - void print() - { - printf("tile_ones_t<%d, %d>-+", M, N); - print(c); - printf("\n"); - } -}; - -template -struct tile_unary_map_t -{ - using Type = typename Tile::Type; - static constexpr int M = Tile::M; - static constexpr int N = Tile::N; - - using FwdOp = Type(*)(Type); - using AdjOp = void(*)(Type, Type&, Type&); - - Tile tile; - - FwdOp fwd_fn; - AdjOp adj_fn; - - tile_unary_map_t() {} - tile_unary_map_t(Tile& t, FwdOp fwd, AdjOp adj) : tile(t), fwd_fn(fwd), adj_fn(adj) {} - - Type fwd(int e) const - { - return fwd_fn(tile.fwd(e)); - } - - void bwd(int e, Type 
adj_ret) const - { - Type adj_a = 0.0; - - adj_fn(tile.fwd(e), adj_a, adj_ret); - - tile.bwd(e, adj_a); - } - - void print() - { - printf("tile_unary_map_t<%d, %d>-+", M, N); - tile.print(); - } -}; - -template -struct tile_binary_map_t -{ - static_assert(wp::is_same::value, "Error"); - static_assert(TileA::M == TileB::M, "Error"); - static_assert(TileA::N == TileB::N, "Error"); - - using Type = typename TileA::Type; - static constexpr int M = TileA::M; - static constexpr int N = TileA::N; - - using FwdOp = Type(*)(Type, Type); - using AdjOp = void(*)(Type, Type, Type&, Type&, Type&); - - TileA tile_a; - TileB tile_b; - - FwdOp fwd_fn; - AdjOp adj_fn; - - tile_binary_map_t() {} - tile_binary_map_t(const TileA& a, TileB& b, FwdOp fwd, AdjOp adj) : tile_a(a), tile_b(b), fwd_fn(fwd), adj_fn(adj) {} - - Type fwd(int e) const + T data[NumRegs]; + + tile_register_t() { - Type a = tile_a.fwd(e); - Type b = tile_b.fwd(e); - - return fwd_fn(a, b); + // zero-initialize by default + // necessary for tile adjoints + // need to check if this results in worse codegen + for (int i=0; i < NumRegs; ++i) + data[i] = T(0); } - void bwd(int e, Type adj_ret) const + struct iterator { - Type a = tile_a.fwd(e); - Type b = tile_b.fwd(e); - - Type adj_a = 0.0; - Type adj_b = 0.0; + tile_register_t& tile; + int offset; + + inline CUDA_CALLABLE iterator(tile_register_t& t, int i) : tile(t), offset(i) {} - adj_fn(a, b, adj_a, adj_b, adj_ret); + inline CUDA_CALLABLE T& operator*() const { return tile.data[offset]; } + inline CUDA_CALLABLE iterator& operator++() { ++offset; return *this; } + inline CUDA_CALLABLE bool valid() const { return offset < NumRegs && index() < tile_size(tile); } - // recurse - tile_a.bwd(e, adj_a); - tile_b.bwd(e, adj_b); - } + // linear index into the tile's data (assuming row-major layout) + inline CUDA_CALLABLE int index() const { return threadIdx.x + offset*WP_TILE_BLOCK_DIM; } + inline CUDA_CALLABLE coord_t coord() const + { + int i = index(); + return {i/N, i%N}; + } + }; - void print() - { - printf("tile_binary_map_t<%d, %d>", M, N); - printf("\n -+"); - tile_a.print(); - printf("\n -+"); - tile_b.print(); - } + iterator iter() { return iterator(*this, 0); } }; -//----------------------------------------------- -// Operators -template -CUDA_CALLABLE inline tile_unary_map_t tile_pos(const Tile& t) -{ - return tile_unary_map_t(t, [](typename Tile::Type x) { return pos(x); } ); -} - -template -CUDA_CALLABLE inline tile_unary_map_t tile_neg(Tile& t) -{ - typedef tile_unary_map_t Op; - - typename Op::FwdOp fwd = [](typename Tile::Type x) { return neg(x); }; - typename Op::AdjOp adj = [](typename Tile::Type x, typename Tile::Type& adj_x, typename Tile::Type& adj_ret) { adj_neg(x, adj_x, adj_ret); }; - - return Op(t, fwd, adj); -} +//----------------------------------------------------------------------------------------------------- +// High level entry points for each op (correspond to one Warp builtin) -template -CUDA_CALLABLE inline void adj_tile_neg(const Tile& t, Tile& adj_t, tile_unary_map_t& adj_ret) +template +inline CUDA_CALLABLE auto tile_zeros() { - // nop -} - + const int length = M*N; -/* + WP_TILE_SHARED __align__(16) T data[length]; + + WP_PRAGMA_UNROLL + for (int t=threadIdx.x; t < length; t += WP_TILE_BLOCK_DIM) + { + data[t] = T(0.0); + } -template -CUDA_CALLABLE inline vec_t neg(const vec_t& x) -{ - return -x; + return tile_shared_t(data); } -template -CUDA_CALLABLE inline vec_t<3, Type> neg(const vec_t<3, Type>& x) -{ - return vec_t<3, Type>(-x.c[0], -x.c[1], -x.c[2]); -} 
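Each of the high-level entry points above corresponds to one Warp builtin on the Python side. As a minimal usage sketch only (the tile sizes, kernel name `tile_clear`, and the launch dimensions are assumptions for illustration, not part of this patch), a kernel that lowers to the new tile_zeros()/tile_store() entry points looks roughly like:

TILE_M = wp.constant(16)
TILE_N = wp.constant(16)

@wp.kernel
def tile_clear(out: wp.array2d(dtype=float)):
    # each program instance (one thread block) clears one (TILE_M, TILE_N) block of out
    i, j = wp.tid()
    z = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)   # cooperative shared-memory tile
    wp.tile_store(out, i, j, z)

# launched with one block of threads per tile, e.g. (tile_size choice is illustrative):
# wp.launch(tile_clear, dim=(out.shape[0] // TILE_M, out.shape[1] // TILE_N),
#           inputs=[out_wp], tile_size=64)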
-template -CUDA_CALLABLE inline vec_t<2, Type> neg(const vec_t<2, Type>& x) +// entry point for store +template +inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) { - return vec_t<2, Type>(-x.c[0], -x.c[1]); -} + const int length = M*N; -template -CUDA_CALLABLE inline void adj_neg(const vec_t& x, vec_t& adj_x, const vec_t& adj_ret) -{ - adj_x -= adj_ret; -} + WP_TILE_SHARED __align__(16) T data[length]; -// equality: -template -inline CUDA_CALLABLE bool operator ==(const vec_t& a, const vec_t& b) -{ - for( unsigned i=0; i < Length; ++i ) - { - if(a[i] != b[i]) - { - return false; - } - } - return true; -} + tile_shared_t dest(data); + + WP_PRAGMA_UNROLL + for (auto dst_iter=dest.iter(); dst_iter.valid(); ++dst_iter) + { + coord_t c = dst_iter.coord(); -// scalar multiplication: -template -inline CUDA_CALLABLE vec_t mul(vec_t a, Type s) -{ - vec_t ret; - for( unsigned i=0; i < Length; ++i ) - { - ret[i] = a[i] * s; + *dst_iter = index(src, x*M + c.i, y*N + c.j); } - return ret; -} - -template -inline CUDA_CALLABLE vec_t<3, Type> mul(vec_t<3, Type> a, Type s) -{ - return vec_t<3, Type>(a.c[0]*s,a.c[1]*s,a.c[2]*s); -} -template -inline CUDA_CALLABLE vec_t<2, Type> mul(vec_t<2, Type> a, Type s) -{ - return vec_t<2, Type>(a.c[0]*s,a.c[1]*s); + return dest; } -template -inline CUDA_CALLABLE vec_t mul(Type s, vec_t a) -{ - return mul(a, s); -} - -template -inline CUDA_CALLABLE vec_t operator*(Type s, vec_t a) +// entry point for store +template +inline CUDA_CALLABLE void tile_store(array_t& dest, int x, int y, Tile& src) { - return mul(a, s); -} + const int M = src.M; + const int N = src.N; + + // cooperatively store the tile, using a block-stride iterator + WP_PRAGMA_UNROLL + for (auto src_iter=src.iter(); src_iter.valid(); ++src_iter) + { + coord_t c = src_iter.coord(); -template -inline CUDA_CALLABLE vec_t operator*(vec_t a, Type s) -{ - return mul(a, s); + index(dest, x*M + c.i, y*N + c.j) = *src_iter; + } } +//------------------------------------- +// Adjoints -// component wise multiplication: -template -inline CUDA_CALLABLE vec_t cw_mul(vec_t a, vec_t b) +template +inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, + array_t& adj_src, int adj_x, int adj_y, + AdjTile& adj_ret) { - vec_t ret; - for( unsigned i=0; i < Length; ++i ) - { - ret[i] = a[i] * b[i]; + // add gradients to src array + WP_PRAGMA_UNROLL + for (auto adj_iter=adj_ret.iter(); adj_iter.valid(); ++adj_iter) + { + coord_t c = adj_iter.coord(); + atomic_add(adj_src, x*adj_ret.M + c.i, y*adj_ret.N + c.j, *adj_iter); } - return ret; } -// division -template -inline CUDA_CALLABLE vec_t div(vec_t a, Type s) +template +inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, AdjTile& adj_t) { - vec_t ret; - for( unsigned i=0; i < Length; ++i ) - { - ret[i] = a[i] / s; + const int M = t.M; + const int N = t.N; + + // load gradients from output + WP_PRAGMA_UNROLL + for (auto adj_iter=adj_t.iter(); adj_iter.valid(); ++adj_iter) + { + coord_t c = adj_iter.coord(); + *adj_iter += index(adj_dest, x*M + c.i, y*N + c.j, *adj_iter); } - return ret; } -template -inline CUDA_CALLABLE vec_t<3, Type> div(vec_t<3, Type> a, Type s) +// unary map +template +auto tile_map(Fwd op, + Tile &a) { - return vec_t<3, Type>(a.c[0]/s,a.c[1]/s,a.c[2]/s); -} + auto out = tile_register_t(); -template -inline CUDA_CALLABLE vec_t<2, Type> div(vec_t<2, Type> a, Type s) -{ - return vec_t<2, Type>(a.c[0]/s,a.c[1]/s); -} + auto out_iter = out.iter(); + auto a_iter = 
a.iter(); -template -inline CUDA_CALLABLE vec_t div(Type s, vec_t a) -{ - vec_t ret; - for (unsigned i=0; i < Length; ++i) + for (; out_iter.valid(); ++out_iter, ++a_iter) { - ret[i] = s / a[i]; + *out_iter = op(*a_iter); } - return ret; -} - -template -inline CUDA_CALLABLE vec_t<3, Type> div(Type s, vec_t<3, Type> a) -{ - return vec_t<3, Type>(s/a.c[0],s/a.c[1],s/a.c[2]); -} -template -inline CUDA_CALLABLE vec_t<2, Type> div(Type s, vec_t<2, Type> a) -{ - return vec_t<2, Type>(s/a.c[0],s/a.c[1]); + return out; } -template -inline CUDA_CALLABLE vec_t operator / (vec_t a, Type s) +template +void adj_tile_map(Fwd op, + Tile &a, + Adj adj_op, + Tile &adj_a, + AdjTile &adj_ret) { - return div(a,s); -} + auto a_iter = a.iter(); + auto adj_a_iter = adj_a.iter(); + auto adj_ret_iter = adj_ret.iter(); -template -inline CUDA_CALLABLE vec_t operator / (Type s, vec_t a) -{ - return div(s, a); -} - -// component wise division -template -inline CUDA_CALLABLE vec_t cw_div(vec_t a, vec_t b) -{ - vec_t ret; - for( unsigned i=0; i < Length; ++i ) + for (; a_iter.valid(); ++a_iter, ++adj_a_iter, ++adj_ret_iter) { - ret[i] = a[i] / b[i]; + adj_op(*a_iter, *adj_a_iter, *adj_ret_iter); } - return ret; } -// addition -template -inline CUDA_CALLABLE vec_t add(vec_t a, vec_t b) +// binary map +template +auto tile_map(Fwd op, + TileA &a, + TileB &b) { - vec_t ret; - for( unsigned i=0; i < Length; ++i ) - { - ret[i] = a[i] + b[i]; - } - return ret; -} + auto out = tile_register_t(); -template -inline CUDA_CALLABLE vec_t<2, Type> add(vec_t<2, Type> a, vec_t<2, Type> b) -{ - return vec_t<2, Type>( a.c[0] + b.c[0], a.c[1] + b.c[1]); -} + auto out_iter = out.iter(); + auto a_iter = a.iter(); + auto b_iter = b.iter(); -template -inline CUDA_CALLABLE vec_t<3, Type> add(vec_t<3, Type> a, vec_t<3, Type> b) -{ - return vec_t<3, Type>( a.c[0] + b.c[0], a.c[1] + b.c[1], a.c[2] + b.c[2]); -} - -// subtraction -template -inline CUDA_CALLABLE vec_t sub(vec_t a, vec_t b) -{ - vec_t ret; - for( unsigned i=0; i < Length; ++i ) + for (; out_iter.valid(); ++out_iter, ++a_iter, ++b_iter) { - ret[i] = Type(a[i] - b[i]); + *out_iter = op(*a_iter, *b_iter); } - return ret; -} -template -inline CUDA_CALLABLE vec_t<2, Type> sub(vec_t<2, Type> a, vec_t<2, Type> b) -{ - return vec_t<2, Type>( a.c[0] - b.c[0], a.c[1] - b.c[1]); + return out; } -template -inline CUDA_CALLABLE vec_t<3, Type> sub(vec_t<3, Type> a, vec_t<3, Type> b) +template +void adj_tile_map(Fwd op, + TileA &a, + TileB &b, + Adj adj_op, + TileA &adj_a, + TileB &adj_b, + AdjTile &adj_ret) { - return vec_t<3, Type>( a.c[0] - b.c[0], a.c[1] - b.c[1], a.c[2] - b.c[2]); -} -*/ - - -// represents a fully evaluated tile in shared memory -template -struct tile_shared_t -{ - using Type = T; - static constexpr int M = M_; - static constexpr int N = N_; - - T* data = NULL; - - tile_shared_t() {} - tile_shared_t(T* smem) : data(smem) - { - } + auto a_iter = a.iter(); + auto b_iter = b.iter(); + auto adj_a_iter = adj_a.iter(); + auto adj_b_iter = adj_b.iter(); + auto adj_ret_iter = adj_ret.iter(); - T fwd(int e) const + for (; a_iter.valid(); ++a_iter, ++b_iter, ++adj_a_iter, ++adj_b_iter, ++adj_ret_iter) { - return data[e]; + adj_op(*a_iter, *b_iter, *adj_a_iter, *adj_b_iter, *adj_ret_iter); } - - void bwd(int e, T adj_ret) const - { - - } -}; - -//----------------------------------------------------------------------------------------------------- -// High level entry points for each op (correspond to one Warp builtin) - -template -tile_zeros_t tile_zeros() { return tile_zeros_t(); } - 
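On the Python side the register-based maps above are reached through wp.tile_map(); with built-in function support in place, operators such as wp.sin can be passed directly (the test change later in this patch does exactly that). A sketch, assuming the same TILE_M/TILE_N constants, a hypothetical user function `sq_diff`, and that a user @wp.func is accepted for the binary form as it is for the unary one:

@wp.func
def sq_diff(x: float, y: float):
    return (x - y) * (x - y)

@wp.kernel
def tile_maps(xs: wp.array2d(dtype=float),
              ys: wp.array2d(dtype=float),
              out: wp.array2d(dtype=float)):
    i, j = wp.tid()
    a = wp.tile_load(xs, i, j, m=TILE_M, n=TILE_N)
    b = wp.tile_load(ys, i, j, m=TILE_M, n=TILE_N)
    s = wp.tile_map(wp.sin, a)       # unary map, lowers to tile_unary_map(wp::sin, ...)
    d = wp.tile_map(sq_diff, s, b)   # binary map over two register tiles
    wp.tile_store(out, i, j, d)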
-template -tile_ones_t tile_ones() { return tile_ones_t(); } - -// entry point for load -template -tile_load_t tile_load(array_t& a, int x, int y) -{ - return tile_load_t(a, x, y); } -template -tile_shared_t tile_eval(Tile& t) -{ - WP_TILE_SHARED typename Tile::Type data[Tile::M*Tile::N]; - - // evaluate the input tile and store into shared memory - for (int i=threadIdx.x; i < size(t); i += blockDim.x) - data[i] = t.fwd(i); +// wrap the operator in a lambda so that we don't have to do overload resolution for things like e.g.: wp.sin() +// this is important because many of the builtin operators don't follow particular conventions on references for +// the `adj_ret` parameter, which means it's not possible to figure out the overload we need using simple casting +#define tile_unary_map(op, a) tile_map([](auto x) { return op(x);}, a) +#define adj_tile_unary_map(op, a, adj_op, adj_a, adj_ret) adj_tile_map([](auto x) { return op(x);}, a, [](auto x, auto& adj_x, auto adj_ret) { adj_op(x, adj_x, adj_ret);}, adj_a, adj_ret) - return tile_shared_t(data); -} +#define tile_binary_map(op, a, b) tile_map([](auto x, auto y) { return op(x, y);}, a, b) +#define adj_tile_binary_map(op, a, b, adj_op, adj_a, adj_b, adj_ret) adj_tile_map([](auto x, auto y) { return op(x, y);}, a, b, [](auto x, auto y, auto& adj_x, auto& adj_y, auto adj_ret) { adj_op(x, y, adj_x, adj_y, adj_ret);}, adj_a, adj_b, adj_ret) +// unary neg template -void adj_tile_eval(Tile& t, Tile& adj_t, tile_shared_t& adj_ret) -{ - // nop -} - -template -void adj_tile_load(array_t& a, int x, int y, array_t& adj_a, int adj_x, int adj_y, const tile_load_t& adj_ret) -{ - // nop -} - - -// entry point for store -template -void tile_store(array_t& dest, int x, int y, Tile& t) -{ - tile_store_t op(dest, x, y, t); - - // execute op - for (int i=threadIdx.x; i < size(op); i += blockDim.x) - op.fwd(i); -} - - -template -void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, Tile& adj_t) -{ - tile_store_t op(dest, x, y, t); - - for (int i=threadIdx.x; i < size(op); i += blockDim.x) - op.bwd(i); -} - - -// unary map -template -tile_unary_map_t tile_map_impl(typename tile_unary_map_t::FwdOp fwd, typename tile_unary_map_t::AdjOp adj, Tile& a) -{ - return tile_unary_map_t(a, fwd, adj); -} - -// binary map -template -tile_binary_map_t tile_map_impl(typename tile_binary_map_t::FwdOp fwd, typename tile_binary_map_t::AdjOp adj, TileA& a, TileB& b) -{ - return tile_binary_map_t(a, b, fwd, adj); -} - -// use macro to capture adjoint operator -#define tile_map(op, ...) tile_map_impl(op, adj_##op, __VA_ARGS__) -//#define tile_map(op, a) tile_map_impl(wp::##op, wp::##op, a) - -// nop -void adj_tile_map_impl(void) {} -#define adj_tile_map(...) 
adj_tile_map_impl() +auto tile_neg(Tile& a) { return tile_unary_map(wp::neg, a); } -// use a macro to capture the adjoint var in the expression -#define tile_constant(T, M, N, var) tile_constant_t(var, adj_##var) +template +void adj_tile_neg(Tile& a, Tile& adj_a, AdjTile& adj_ret) { adj_tile_unary_map(wp::neg, a, wp::adj_neg, adj_a, adj_ret); } /* diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index b1d3435e..fca527d0 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -221,7 +221,7 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A, const TileB& B, const TileC& out) { - const int length = size(out); + const int length = tile_size(out); WP_TILE_SYNC(); @@ -353,19 +353,15 @@ struct tile_matmul_t }; -template +template void tile_matmul(TileA& a, TileB& b, TileC& c) { static_assert(wp::is_same::value, "Error, tile datatypes must match"); static_assert(TileA::N == TileB::M, "Error, inner dimensions must match"); static_assert(TileC::M == TileA::M, "Error, first output dimension must match"); static_assert(TileC::N == TileB::N, "Error, second output dimension must match"); - - // load inputs to shared - auto a_shared = tile_eval(a); - auto b_shared = tile_eval(b); - - tile_matmul_scalar(a_shared, b_shared, c); + + tile_matmul_scalar(a, b, c); } diff --git a/warp/tape.py b/warp/tape.py index 8c3cc103..15aebf81 100644 --- a/warp/tape.py +++ b/warp/tape.py @@ -129,7 +129,8 @@ def backward(self, loss: wp.array = None, grads: dict = None): inputs = launch[3] outputs = launch[4] device = launch[5] - + tile_size = launch[6] + adj_inputs = [] adj_outputs = [] @@ -151,13 +152,14 @@ def backward(self, loss: wp.array = None, grads: dict = None): device=device, adjoint=True, max_blocks=max_blocks, + tile_size=tile_size ) # record a kernel launch on the tape - def record_launch(self, kernel, dim, max_blocks, inputs, outputs, device, metadata=None): + def record_launch(self, kernel, dim, max_blocks, inputs, outputs, device, tile_size=0, metadata=None): if metadata is None: metadata = {} - self.launches.append([kernel, dim, max_blocks, inputs, outputs, device, metadata]) + self.launches.append([kernel, dim, max_blocks, inputs, outputs, device, tile_size, metadata]) def record_func(self, backward, arrays): """ @@ -612,7 +614,7 @@ def emit_kernel_launch_node( self.array_grad_stats.insert(0, grad_stats) -Launch = namedtuple("Launch", ["id", "kernel", "dim", "max_blocks", "inputs", "outputs", "device", "metadata"]) +Launch = namedtuple("Launch", ["id", "kernel", "dim", "max_blocks", "inputs", "outputs", "device", "tile_size", "metadata"]) RepeatedSequence = namedtuple("RepeatedSequence", ["start", "end", "repetitions"]) diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index e0c34de2..02fc9870 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -9,11 +9,13 @@ wp.set_module_options({"enable_backward": True}) wp.set_device("cuda:0") +wp.config.verify_cuda = True wp.build.clear_kernel_cache() -TILE_M = 8 -TILE_N = 4 +TILE_M = wp.constant(32) +TILE_N = wp.constant(32) +TILE_K = wp.constant(8) @wp.kernel def tile_copy(A: wp.array2d(dtype=float), @@ -66,7 +68,7 @@ def tile_unary_map(input: wp.array2d(dtype=float), a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N) - sa = wp.tile_map(unary_func, a) + sa = wp.tile_map(wp.sin, a) wp.tile_store(output, i, j, sa) @@ -199,10 +201,6 @@ def test_tile_operators(): -TILE_M = wp.constant(64) -TILE_N = wp.constant(64) -TILE_K = wp.constant(8) - @wp.kernel def tile_grouped_gemm(A: wp.array3d(dtype=float), 
B: wp.array3d(dtype=float), @@ -214,7 +212,7 @@ def tile_grouped_gemm(A: wp.array3d(dtype=float), a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K) b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N) - sum = wp.tile_eval(wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)) + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) wp.tile_matmul(a, b, sum) @@ -258,7 +256,7 @@ def tile_gemm(A: wp.array2d(dtype=float), # output tile index i, j = wp.tid() - sum = wp.tile_eval(wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)) + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) M = A.shape[0] N = B.shape[1] @@ -304,6 +302,6 @@ def test_tile_gemm(): test_tile_copy() test_tile_unary_map() test_tile_binary_map() -test_tile_batched_gemm() -test_tile_gemm() +# test_tile_batched_gemm() +# test_tile_gemm() test_tile_operators() \ No newline at end of file diff --git a/warp/types.py b/warp/types.py index 1074b3df..9bc6f7d7 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2865,75 +2865,72 @@ def array_type_id(a): # tile expression objects class Tile: - - def __init__(self, dtype, M, N, op=None): + + allocation = 0 + + def __init__(self, dtype, M, N, op=None, storage="register"): self.dtype = dtype self.M = M self.N = N self.op = op + self.storage = storage + + def ctype(self): + from warp.codegen import Var + + if self.storage == "register": + return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + elif self.storage == "shared": + return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + + # generate a unique allocation index for shared memory + @classmethod + def alloc(cls): + index = cls.allocation + cls.allocation += 1 + return index class TileZeros(Tile): def __init__(self, dtype, M, N): - Tile.__init__(self, dtype, M, N, "zeros") + Tile.__init__(self, dtype, M, N, op="zeros", storage="shared") - def ctype(self): - from warp.codegen import Var - return f"wp::tile_zeros_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" class TileConstant(Tile): def __init__(self, dtype, M, N): - Tile.__init__(self, dtype, M, N, "zeros") + Tile.__init__(self, dtype, M, N, op="constant", storage="register") - def ctype(self): - from warp.codegen import Var - return f"wp::tile_constant_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" - class TileLoad(Tile): def __init__(self, array, M, N): - Tile.__init__(self, array.dtype, M, N, "load") + Tile.__init__(self, array.dtype, M, N, op="load", storage="shared") - def ctype(self): - from warp.codegen import Var - return f"wp::tile_load_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" class TileUnaryMap(Tile): def __init__(self, t): - Tile.__init__(self, t.dtype, t.M, t.N, "unary_map") + Tile.__init__(self, t.dtype, t.M, t.N, op="unary_map", storage="register") self.t = t - - def ctype(self): - from warp.codegen import Var - return f"wp::tile_unary_map_t<{self.t.ctype()}>" + class TileBinaryMap(Tile): def __init__(self, a, b): - Tile.__init__(self, a.dtype, a.M, a.N, "binary_map") + Tile.__init__(self, a.dtype, a.M, a.N, op="binary_map", storage="register") self.a = a self.b = b - - def ctype(self): - from warp.codegen import Var - return f"wp::tile_binary_map_t<{self.a.ctype()}, {self.b.ctype()}>" class TileShared(Tile): def __init__(self, t): - Tile.__init__(self, t.dtype, t.M, t.N, "shared") + Tile.__init__(self, t.dtype, t.M, t.N, "shared", storage="shared") self.t = t - - def ctype(self): - from warp.codegen import Var - return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" 
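To make the storage model above concrete, a small illustrative sketch of how the new Tile.storage/ctype() fields drive code generation; here `some_float32_array` stands in for any 2-D float32 wp.array, and the printed strings depend on Var.type_to_ctype, so they are an assumption rather than output captured from this patch:

# loads are materialized in shared memory...
t = TileLoad(some_float32_array, 16, 16)
assert t.storage == "shared"
print(t.ctype())    # roughly: wp::tile_shared_t<wp::float32,16,16>

# ...while map results live in registers
u = TileUnaryMap(t)
assert u.storage == "register"
print(u.ctype())    # roughly: wp::tile_register_t<wp::float32,16,16>

# each shared allocation receives a unique index, consumed by tile_alloc_shared<>() in codegen
print(Tile.alloc(), Tile.alloc())   # 0 1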
def is_tile(t): From 5fce6ced169526fdae66ed8874f1c48a94ee1c02 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 9 Sep 2024 15:17:05 +1200 Subject: [PATCH 020/102] wp.tile_matmul() reverse mode working, added support for strided shared memory tiles --- warp/codegen.py | 5 +- warp/native/tile.h | 112 +++++++++++++++++++++++++++++++++------- warp/native/tile_gemm.h | 62 +++------------------- warp/tests/test_tile.py | 49 ++++++++++-------- 4 files changed, 132 insertions(+), 96 deletions(-) diff --git a/warp/codegen.py b/warp/codegen.py index 9a38d7c1..88c12d8a 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -1424,7 +1424,10 @@ def end_for(adj, iter): # zero adjoints for i in body_block.vars: - reverse.append(adj.indentation + f"\t{i.emit_adj()} = {{}};") + if is_tile(i.type): + reverse.append(adj.indentation + f"\t{i.emit_adj()}.zero();") + else: + reverse.append(adj.indentation + f"\t{i.emit_adj()} = {{}};") # replay for i in body_block.body_replay: diff --git a/warp/native/tile.h b/warp/native/tile.h index 4315eda7..5111d958 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -37,9 +37,10 @@ [x] Support built-in functions [ ] Support for lambda functions [ ] Infer tile_map() output from operator type (e.g.: dot for each element) -[ ] wp.tile_matmul() +[x] wp.tile_matmul() [x] Forward - [ ] Reverse + [x] Reverse +[ ] wp.tile_atomic_add() [ ] Support for n-d shape tiles / broadcasting / slicing / transpose? [x] Compile-time block dimensions [ ] Support for CUB reductions @@ -49,6 +50,17 @@ [ ] Batched MLP [ ] Point cloud alignment [ ] Layer norm + [ ] Convolution: https://github.com/NVIDIA/MinkowskiEngine/blob/master/src/convolution_kernel.cu#L123 + [ ] MeshCNN (Modulus, Oliver) + [ ] BioNemo (Ali) + [ ] Skinning (David/Or/Vismay) + [ ] warp.sim (VBD) + [ ] warp.sim (CRBA) + [ ] Point clustering + [ ] GEMM + [ ] MLP + [ ] LayerNorm + [ ] SoftMax [ ] Error checking [ ] Ensure functions passed to tile_map() are compatible with tile type [ ] Ensure that args passed to tile ops are compatible @@ -126,12 +138,15 @@ inline CUDA_CALLABLE T* tile_alloc_shared() return data; } -template + +template struct tile_shared_t { using Type = T; static constexpr int M = M_; static constexpr int N = N_; + static constexpr int StrideM = StrideM_; + static constexpr int StrideN = StrideN_; T* data = NULL; @@ -140,13 +155,36 @@ struct tile_shared_t { } + inline T& operator()(int i, int j) + { + assert(i < M); + assert(j < N); + + return data[i*StrideM + j*StrideN]; + } + + inline const T& operator()(int i, int j) const + { + assert(i < M); + assert(j < N); + + return data[i*StrideM + j*StrideN]; + } + struct iterator { - tile_shared_t& tile; + tile_shared_t& tile; int offset; - inline CUDA_CALLABLE iterator(tile_shared_t& t, int i) : tile(t), offset(i) {} - inline CUDA_CALLABLE T& operator*() const { return tile.data[offset]; } + template + inline CUDA_CALLABLE iterator(Tile& t, int i) : tile(t), offset(i) {} + inline CUDA_CALLABLE T& operator*() const + { + assert(offset < tile_size(tile)); + + return tile.data[offset]; + } + inline CUDA_CALLABLE iterator& operator++() { offset += WP_TILE_BLOCK_DIM; return *this; } inline CUDA_CALLABLE bool valid() const { return index() < tile_size(tile); } @@ -160,8 +198,22 @@ struct tile_shared_t }; iterator iter() { return iterator(*this, threadIdx.x); } + + void zero() + { + // todo: make this subtile (stride aware)? 
+ for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + data[i] = T(0); + } }; +template +auto tile_transpose(Tile& t) +{ + // alias incoming tile + return tile_shared_t(t.data); +} + template struct tile_register_t @@ -175,9 +227,11 @@ struct tile_register_t tile_register_t() { - // zero-initialize by default - // necessary for tile adjoints + // zero-initialize by default necessary for tile adjoints // need to check if this results in worse codegen + // than doing adj_var = tile_zeros() explicitly + // in backwards pass and letting default constructor + // avoid initialization for (int i=0; i < NumRegs; ++i) data[i] = T(0); } @@ -187,14 +241,19 @@ struct tile_register_t tile_register_t& tile; int offset; - inline CUDA_CALLABLE iterator(tile_register_t& t, int i) : tile(t), offset(i) {} + inline CUDA_CALLABLE iterator(tile_register_t& t) : tile(t), offset(0) { } - inline CUDA_CALLABLE T& operator*() const { return tile.data[offset]; } + inline CUDA_CALLABLE T& operator*() const + { + assert(offset < NumRegs); + + return tile.data[offset]; + } inline CUDA_CALLABLE iterator& operator++() { ++offset; return *this; } - inline CUDA_CALLABLE bool valid() const { return offset < NumRegs && index() < tile_size(tile); } + inline CUDA_CALLABLE bool valid() const { return index() < tile_size(tile); } // linear index into the tile's data (assuming row-major layout) - inline CUDA_CALLABLE int index() const { return threadIdx.x + offset*WP_TILE_BLOCK_DIM; } + inline CUDA_CALLABLE int index() const { return threadIdx.x + offset*WP_TILE_BLOCK_DIM;} inline CUDA_CALLABLE coord_t coord() const { int i = index(); @@ -202,7 +261,7 @@ struct tile_register_t } }; - iterator iter() { return iterator(*this, 0); } + iterator iter() { return iterator(*this); } }; @@ -272,19 +331,27 @@ template inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, array_t& adj_src, int adj_x, int adj_y, AdjTile& adj_ret) -{ - // add gradients to src array - WP_PRAGMA_UNROLL +{ + // add gradients to src array for (auto adj_iter=adj_ret.iter(); adj_iter.valid(); ++adj_iter) { coord_t c = adj_iter.coord(); - atomic_add(adj_src, x*adj_ret.M + c.i, y*adj_ret.N + c.j, *adj_iter); + + int i = x*adj_ret.M + c.i; + int j = y*adj_ret.N + c.j; + + auto grad = *adj_iter; + + if (adj_src.data) + adj_atomic_add(&index(adj_src, i, j), grad); + else if (src.grad) + adj_atomic_add(&index_grad(src, i, j), grad); } } template inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, AdjTile& adj_t) -{ +{ const int M = t.M; const int N = t.N; @@ -293,7 +360,14 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t for (auto adj_iter=adj_t.iter(); adj_iter.valid(); ++adj_iter) { coord_t c = adj_iter.coord(); - *adj_iter += index(adj_dest, x*M + c.i, y*N + c.j, *adj_iter); + + int i = x*M + c.i; + int j = y*N + c.j; + + if (adj_dest.data) + *adj_iter += index(adj_dest, i, j); + else if (dest.grad) + *adj_iter += index_grad(dest, i, j); } } diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index fca527d0..5cf8ba04 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -219,7 +219,7 @@ inline CUDA_CALLABLE void gemm(const array_t& A, const array_t& B, const a template inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A, const TileB& B, - const TileC& out) + TileC& out) { const int length = tile_size(out); @@ -227,10 +227,6 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A, using T = typename TileA::Type; 
- const T* __restrict__ A_ptr = A.data; - const T* __restrict__ B_ptr = B.data; - T* __restrict__ C_ptr = out.data; - WP_PRAGMA_UNROLL for (int t=threadIdx.x; t < length; t += blockDim.x) { @@ -243,13 +239,13 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A, WP_PRAGMA_UNROLL for (int k=0; k < A.N; ++k) { - T a = index(A_ptr, i, k, A.N); - T b = index(B_ptr, k, j, B.N); + T a = A(i,k); + T b = B(k,j); - sum = fmaf(a, b, sum); + sum += a*b; // todo: use fmaf() } - index(C_ptr, i, j, out.N) += sum; + out(i,j) += sum; } WP_TILE_SYNC(); @@ -311,46 +307,6 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, #endif // USE_CUTE -template -struct tile_matmul_t -{ - static_assert(wp::is_same::value, "Error, tile datatypes must match"); - static_assert(TileA::N == TileB::M, "Error, inner dimensions must match"); - static_assert(TileC::M == TileA::M, "Error, first output dimension must match"); - static_assert(TileC::N == TileB::N, "Error, second output dimension must match"); - - using Type = typename TileA::Type; - static constexpr int M = TileC::M; - static constexpr int N = TileC::N; - - TileA tile_a; - TileB tile_b; - TileC tile_c; - - tile_matmul_t(TileA &a, TileB &b, TileC &c) : tile_a(a), - tile_b(b), - tile_c(c) {} - - Type fwd(int e) const - { - // load - - - } - - void bwd(int e, Type adj_ret) const - { - } - - void print() - { - printf("tile_matmul_t<%d, %d>", M, N); - printf("\n -+"); - tile_a.print(); - printf("\n -+"); - tile_b.print(); - } -}; template @@ -370,12 +326,8 @@ void adj_tile_matmul(TileA& a, TileB& b, TileC& c, TileA& adj_a, TileB& adj_b, TileC& adj_c) { - // auto a_shared = tile_eval(a); - // auto b_shared = tile_eval(b); - // auto adj_c_shared = tile_eval(b); - - // tile_matmul_scalar(adj_c, wp.tile_transpose(b), adj_a); - // tile_matmul_scalar(wp.tile_transpose(a), adj_c, adj_b); + tile_matmul_scalar(adj_c, wp::tile_transpose(b), adj_a); + tile_matmul_scalar(wp::tile_transpose(a), adj_c, adj_b); } diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 02fc9870..6365d91d 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -3,18 +3,16 @@ import torch -#wp.config.mode = "debug" - wp.init() wp.set_module_options({"enable_backward": True}) wp.set_device("cuda:0") - +wp.config.mode = "debug" wp.config.verify_cuda = True wp.build.clear_kernel_cache() TILE_M = wp.constant(32) -TILE_N = wp.constant(32) +TILE_N = wp.constant(16) TILE_K = wp.constant(8) @wp.kernel @@ -232,17 +230,15 @@ def test_tile_batched_gemm(): B = rng.random((batch_count, K, N), dtype=np.float32) C = np.zeros((batch_count, M, N), dtype=np.float32) - A_wp = wp.array(A) - B_wp = wp.array(B) - C_wp = wp.array(C) + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.array(B, requires_grad=True) + C_wp = wp.array(C, requires_grad=True) - wp.launch(tile_grouped_gemm, dim=batch_count, inputs=[A_wp, B_wp, C_wp], tile_size=8) + with wp.Tape() as tape: + wp.launch(tile_grouped_gemm, dim=batch_count, inputs=[A_wp, B_wp, C_wp], tile_size=8) # bring back to host - C_wp = C_wp.numpy() - - for i in range(batch_count): - assert(np.allclose(A[i]@B[i], C_wp[i], rtol=1.e-4)) + C_host = C_wp.numpy() # GEMM forward passed print("batched matmul forward passed") @@ -263,7 +259,7 @@ def tile_gemm(A: wp.array2d(dtype=float), K = A.shape[1] count = int(K / TILE_K) - + for k in range(0, count): a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) @@ -278,30 +274,41 @@ def tile_gemm(A: wp.array2d(dtype=float), def test_tile_gemm(): M = TILE_M*7 - K = TILE_K*4 - N = TILE_N*6 + 
K = TILE_K*5 + N = TILE_N*2 rng = np.random.default_rng(42) A = rng.random((M, K), dtype=np.float32) B = rng.random((K, N), dtype=np.float32) C = np.zeros((M, N), dtype=np.float32) - A_wp = wp.array(A) - B_wp = wp.array(B) - C_wp = wp.array(C) + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.array(B, requires_grad=True) + C_wp = wp.array(C, requires_grad=True) - wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=8) + with wp.Tape() as tape: + wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=32) assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) # GEMM forward passed print("matmul forward passed") + adj_C = np.ones_like(C) + + tape.backward(grads={C_wp: wp.array(adj_C)}) + + assert(np.allclose(adj_C@B.T, A_wp.grad.numpy(), rtol=1.e-4)) + assert(np.allclose(A.T@adj_C, B_wp.grad.numpy(), rtol=1.e-4)) + + print("matmul backward passed") + + test_tile_copy() test_tile_unary_map() test_tile_binary_map() -# test_tile_batched_gemm() -# test_tile_gemm() +test_tile_batched_gemm() +test_tile_gemm() test_tile_operators() \ No newline at end of file From defbe87a4326acdf67ce703f46cc0126e095a69e Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 9 Sep 2024 17:22:02 +1200 Subject: [PATCH 021/102] Add support for scalar*tile, tile*scalar operators --- warp/builtins.py | 16 +++++- warp/native/tile.h | 83 +++++++++++++++++++++++++++--- warp/native/tile_gemm.h | 2 +- warp/tests/test_tile.py | 109 +++++++++++++++++++++------------------- 4 files changed, 149 insertions(+), 61 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index abe2d7b5..7e3c5722 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2012,6 +2012,18 @@ def tile_binary_map_value_func(arg_types, arg_values): export=False, ) +add_builtin( + "add", + input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)}, + value_func=tile_binary_map_value_func, + #dispatch_func=tile_map_dispatch_func, + #variadic=True, + native_func="tile_add", + doc="Add each element of two tiles together", + group="Tile Primitives", + export=False, +) + # --------------------------------- # Linear Algebra @@ -4494,14 +4506,14 @@ def tile_scalar_mul_value_func(arg_types, arg_values): if x.dtype != y: raise RuntimeError("Scalar factor should have the same type as tile for tile*scalar, tile type: {x} scalar type: {y}") - return TileBinaryMap(x, TileConstant(x.dtype, x.M, x.N)) + return TileBinaryMap(x, TileConstant(y, x.M, x.N)) # scalar*tile if is_tile(y): if y.dtype != x: raise RuntimeError("Scalar factor should have the same type as tile for scalar*tile, tile type: {x} scalar type: {y}") - return TileBinaryMap(TileConstant(x.dtype, x.M, x.N), y) + return TileBinaryMap(TileConstant(x, y.M, y.N), y) diff --git a/warp/native/tile.h b/warp/native/tile.h index 5111d958..b08cdf4c 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -64,6 +64,7 @@ [ ] Error checking [ ] Ensure functions passed to tile_map() are compatible with tile type [ ] Ensure that args passed to tile ops are compatible + [ ] Ensure tile load/store operations don't go out of bounds of arrays in debug mode */ @@ -225,7 +226,7 @@ struct tile_register_t T data[NumRegs]; - tile_register_t() + tile_register_t(T value=T(0.0)) { // zero-initialize by default necessary for tile adjoints // need to check if this results in worse codegen @@ -233,7 +234,7 @@ struct tile_register_t // in backwards pass and letting default constructor // avoid initialization for (int i=0; i < 
NumRegs; ++i) - data[i] = T(0); + data[i] = value; } struct iterator @@ -381,10 +382,13 @@ auto tile_map(Fwd op, auto out_iter = out.iter(); auto a_iter = a.iter(); + WP_PRAGMA_UNROLL for (; out_iter.valid(); ++out_iter, ++a_iter) - { *out_iter = op(*a_iter); - } + + // WP_PRAGMA_UNROLL + // for (int i=0; i < Tile::NumRegs; ++i) + // out.data[i] = op(a.data[i]); return out; } @@ -400,6 +404,7 @@ void adj_tile_map(Fwd op, auto adj_a_iter = adj_a.iter(); auto adj_ret_iter = adj_ret.iter(); + WP_PRAGMA_UNROLL for (; a_iter.valid(); ++a_iter, ++adj_a_iter, ++adj_ret_iter) { adj_op(*a_iter, *adj_a_iter, *adj_ret_iter); @@ -418,10 +423,14 @@ auto tile_map(Fwd op, auto a_iter = a.iter(); auto b_iter = b.iter(); + WP_PRAGMA_UNROLL for (; out_iter.valid(); ++out_iter, ++a_iter, ++b_iter) - { *out_iter = op(*a_iter, *b_iter); - } + + // WP_PRAGMA_UNROLL + // for (int i=0; i < TileA::NumRegs; ++i) + // out.data[i] = op(a.data[i], b.data[i]); + return out; } @@ -441,6 +450,7 @@ void adj_tile_map(Fwd op, auto adj_b_iter = adj_b.iter(); auto adj_ret_iter = adj_ret.iter(); + WP_PRAGMA_UNROLL for (; a_iter.valid(); ++a_iter, ++b_iter, ++adj_a_iter, ++adj_b_iter, ++adj_ret_iter) { adj_op(*a_iter, *b_iter, *adj_a_iter, *adj_b_iter, *adj_ret_iter); @@ -509,6 +519,67 @@ CUDA_CALLABLE inline auto tile_mul_impl(typename Tile::Type s, Tile& t, #define tile_add(a, b) tile_add_impl(a, b adj_##a, adj_##b) */ +template +auto tile_add(TileA& a, TileB& b) +{ + return tile_binary_map(add, a, b); +} + +template +void adj_tile_add(TileA& a, TileB& b, TileA& adj_a, TileB& adj_b, AdjTile& adj_c) +{ + adj_tile_binary_map(add, a, b, adj_add, adj_a, adj_b, adj_c); +} + +// tile*scalar +template +auto tile_mul(Tile& a, const typename Tile::Type& s) +{ + // promote scalar to a constant tile + auto s_tile = tile_register_t(s); + + return tile_binary_map(mul, a, s_tile); +} + +template +void adj_tile_mul(Tile& a, const typename Tile::Type& s, + Tile& adj_a, typename Tile::Type& adj_s, + AdjTile& adj_c) +{ + // auto s_tile = tile_register_t(s); + // auto adj_s_tile = tile_register_t(); + + // adj_tile_binary_map(mul, a, s_tile, adj_mul, adj_a, adj_s_tile, adj_c); + + // todo: sum up contribution from all adj_s_tile onto original scalar + //adj_tile_sum() +} + + +// scalar*tile +template +auto tile_mul(const typename Tile::Type& s, Tile& a) +{ + // promote scalar to a constant tile + auto s_tile = tile_register_t(s); + + return tile_binary_map(mul, s_tile, a); +} + +template +void adj_tile_mul(const typename Tile::Type& s, Tile& a, + typename Tile::Type& adj_s, Tile& adj_a, + AdjTile& adj_c) +{ + // auto s_tile = tile_register_t(s); + // auto adj_s_tile = tile_register_t(); + + // adj_tile_binary_map(mul, a, s_tile, adj_mul, adj_a, adj_s_tile, adj_c); + + // todo: sum up contribution from all adj_s_tile onto original scalar + //adj_tile_sum() +} + } // namespace wp diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index 5cf8ba04..faf807ad 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -228,7 +228,7 @@ inline CUDA_CALLABLE void tile_matmul_scalar(const TileA& A, using T = typename TileA::Type; WP_PRAGMA_UNROLL - for (int t=threadIdx.x; t < length; t += blockDim.x) + for (int t=threadIdx.x; t < length; t += WP_TILE_BLOCK_DIM) { // compute output index const int i = t/out.N; diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 6365d91d..2a025362 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -6,15 +6,18 @@ wp.init() wp.set_module_options({"enable_backward": 
True}) wp.set_device("cuda:0") -wp.config.mode = "debug" -wp.config.verify_cuda = True +#wp.config.mode = "debug" +#wp.config.verify_cuda = True wp.build.clear_kernel_cache() -TILE_M = wp.constant(32) -TILE_N = wp.constant(16) +TILE_M = wp.constant(16) +TILE_N = wp.constant(8) TILE_K = wp.constant(8) +# num threads per-tile +TILE_DIM = 64 + @wp.kernel def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)): @@ -40,7 +43,7 @@ def test_tile_copy(): B_wp = wp.array(B, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_copy, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + wp.launch(tile_copy, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=TILE_DIM) # verify forward pass assert(np.allclose(A, B_wp.numpy(), rtol=1.e-4)) @@ -87,7 +90,7 @@ def test_tile_unary_map(): B_wp = wp.zeros_like(A_wp, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=8) + wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=TILE_DIM) # verify forward pass assert(np.allclose(B, B_wp.numpy(), rtol=1.e-4)) @@ -140,7 +143,7 @@ def test_tile_binary_map(): C_wp = wp.zeros_like(A_wp, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_binary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp, C_wp], tile_size=8) + wp.launch(tile_binary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM) # verify forward pass assert(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) @@ -156,49 +159,6 @@ def test_tile_binary_map(): print("Binary map backward passed") -@wp.kernel -def tile_operators(input: wp.array3d(dtype=float), - output: wp.array3d(dtype=float)): - - # output tile index - i = wp.tid() - - a = wp.tile_load(input[i], 0, 0, m=32, n=8) - - # neg - b = -a - - # scalar multiply -# c = b*0.5 - - # # add tiles - # c = a + b - - wp.tile_store(output[i], 0, 0, b) - - -def test_tile_operators(): - - batch_count = 56 - - M = 32 - N = 8 - - rng = np.random.default_rng(42) - input = rng.random((batch_count, M, N), dtype=np.float32) - output = -input - - input_wp = wp.array(input) - output_wp = wp.zeros_like(input_wp) - - wp.launch(tile_operators, dim=batch_count, inputs=[input_wp, output_wp], tile_size=8) - - assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) - - print("operators forward passed") - - - @wp.kernel def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), @@ -235,7 +195,7 @@ def test_tile_batched_gemm(): C_wp = wp.array(C, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_grouped_gemm, dim=batch_count, inputs=[A_wp, B_wp, C_wp], tile_size=8) + wp.launch(tile_grouped_gemm, dim=batch_count, inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM) # bring back to host C_host = C_wp.numpy() @@ -287,7 +247,7 @@ def test_tile_gemm(): C_wp = wp.array(C, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=32) + wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM) assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) @@ -305,6 +265,51 @@ def test_tile_gemm(): +@wp.kernel +def tile_operators(input: wp.array3d(dtype=float), + output: wp.array3d(dtype=float)): + + # output tile index + i = wp.tid() + + a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) + + # neg + b = -a + + # right scalar multiply + c = b*0.5 + + # left scalar multiply + 
d = 0.5*c + + # add tiles + e = a + d + + wp.tile_store(output[i], 0, 0, e) + + +def test_tile_operators(): + + batch_count = 56 + + M = TILE_M + N = TILE_N + + rng = np.random.default_rng(42) + input = rng.random((batch_count, M, N), dtype=np.float32) + output = input*0.75 + + input_wp = wp.array(input) + output_wp = wp.zeros_like(input_wp) + + wp.launch(tile_operators, dim=batch_count, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + + assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) + + print("operators forward passed") + + test_tile_copy() test_tile_unary_map() From 77172ad297ab76dc5d8c99871dfece72218abbba Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 11 Sep 2024 12:55:28 +1200 Subject: [PATCH 022/102] Code-gen improvements, force ops. to promote shared memory tiles to registers before execution --- warp/native/tile.h | 485 ++++++++++++++++++++++++---------------- warp/tests/test_tile.py | 7 +- 2 files changed, 302 insertions(+), 190 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index b08cdf4c..b271ccd3 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -93,29 +93,9 @@ struct is_same { static constexpr bool value = true; }; -template -void print_tile(T& t) -{ - t.print(); - - printf("["); - for (int i=0; i < T::M; ++i) - { - printf("%*s[", i>0, ""); - for (int j=0; j < T::N; ++j) - { - printf("%5.2f ", t.data[i*T::N + j]); - } - - if (i == T::M-1) - printf("]]\n"); - else - printf("]\n"); - } -} template -int tile_size(Tile& t) { return Tile::M*Tile::N; } +constexpr int tile_size(Tile& t) { return Tile::M*Tile::N; } constexpr int tile_regcount(int m, int n) { return (m*n + WP_TILE_BLOCK_DIM - 1) / WP_TILE_BLOCK_DIM; @@ -140,23 +120,96 @@ inline CUDA_CALLABLE T* tile_alloc_shared() } +template +struct tile_register_t +{ + using Type = T; + static constexpr int M = M_; + static constexpr int N = N_; + static constexpr int Size = M*N; + + static constexpr int NumRegs = tile_regcount(M, N); + + T data[NumRegs]; + + inline CUDA_CALLABLE tile_register_t(T value=T(0.0)) + { + // zero-initialize by default necessary for tile adjoints + // need to check if this results in worse codegen + // than doing adj_var = tile_zeros() explicitly + // in backwards pass and letting default constructor + // avoid initialization + + for (int i=0; i < NumRegs; ++i) + data[i] = value; + } + + // compute linear tile index from a local register index + inline CUDA_CALLABLE int index(int reg) const + { + return threadIdx.x + reg*WP_TILE_BLOCK_DIM; + } + + // compute tile coordinate from linear index + inline CUDA_CALLABLE coord_t coord(int index) const + { + return {index/N, index%N}; + } + + // Returns the number of valid registers for this tile + // i.e.: how many registers map to a valid coordinate. 
+ // When a tile's size is not aligned to the block dimension + // some of the trailing registers may lie outside the valid range + inline CUDA_CALLABLE int valid() const + { + return (Size - threadIdx.x)/WP_TILE_BLOCK_DIM; + } + + // return the in-register version of this tile (nop) + inline CUDA_CALLABLE auto& get() { return *this; } + + inline CUDA_CALLABLE void assign(const tile_register_t& tile) + { + for (int i=0; i < NumRegs; ++i) + data[i] = tile.data[i]; + } + + + inline CUDA_CALLABLE void print() + { + printf("tid: %d ", threadIdx.x); + + for (int i=0; i < NumRegs; ++i) + { + printf("%f ", data[i]); + } + + printf("\n"); + } + +}; + + + template struct tile_shared_t { using Type = T; static constexpr int M = M_; static constexpr int N = N_; + static constexpr int Size = M*N; + static constexpr int StrideM = StrideM_; static constexpr int StrideN = StrideN_; T* data = NULL; - tile_shared_t() {} - tile_shared_t(T* smem) : data(smem) + inline CUDA_CALLABLE tile_shared_t() {} + inline CUDA_CALLABLE tile_shared_t(T* smem) : data(smem) { } - inline T& operator()(int i, int j) + inline CUDA_CALLABLE T& operator()(int i, int j) { assert(i < M); assert(j < N); @@ -164,7 +217,7 @@ struct tile_shared_t return data[i*StrideM + j*StrideN]; } - inline const T& operator()(int i, int j) const + inline CUDA_CALLABLE const T& operator()(int i, int j) const { assert(i < M); assert(j < N); @@ -172,99 +225,116 @@ struct tile_shared_t return data[i*StrideM + j*StrideN]; } - struct iterator + inline CUDA_CALLABLE T& operator()(int index) { - tile_shared_t& tile; - int offset; - - template - inline CUDA_CALLABLE iterator(Tile& t, int i) : tile(t), offset(i) {} - inline CUDA_CALLABLE T& operator*() const - { - assert(offset < tile_size(tile)); + assert(index < M*N); - return tile.data[offset]; - } + // unravel + int i = index/N; + int j = index%N; - inline CUDA_CALLABLE iterator& operator++() { offset += WP_TILE_BLOCK_DIM; return *this; } - inline CUDA_CALLABLE bool valid() const { return index() < tile_size(tile); } + return (*this)(i,j); + } - // linear index into the tile's data (assuming row-major layout) - inline CUDA_CALLABLE int index() const { return offset; } - inline CUDA_CALLABLE coord_t coord() const - { - int i = index(); - return {i/N, i%N}; - } - }; + inline CUDA_CALLABLE const T& operator()(int index) const + { + assert(index < M*N); - iterator iter() { return iterator(*this, threadIdx.x); } + // unravel + int i = index/N; + int j = index%N; - void zero() + return (*this)(i,j); + } + + // in-place zero + inline CUDA_CALLABLE void zero() { - // todo: make this subtile (stride aware)? 
+ // todo: make this subtile (stride aware) for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) data[i] = T(0); } -}; - -template -auto tile_transpose(Tile& t) -{ - // alias incoming tile - return tile_shared_t(t.data); -} - - -template -struct tile_register_t -{ - using Type = T; - static constexpr int M = M_; - static constexpr int N = N_; - static constexpr int NumRegs = tile_regcount(M, N); - T data[NumRegs]; - - tile_register_t(T value=T(0.0)) + // compute linear tile index from a local register index + inline CUDA_CALLABLE int index(int reg) const { - // zero-initialize by default necessary for tile adjoints - // need to check if this results in worse codegen - // than doing adj_var = tile_zeros() explicitly - // in backwards pass and letting default constructor - // avoid initialization - for (int i=0; i < NumRegs; ++i) - data[i] = value; + return threadIdx.x + reg*WP_TILE_BLOCK_DIM; } - struct iterator + // compute tile coordinate from linear index + inline CUDA_CALLABLE coord_t coord(int index) const { - tile_register_t& tile; - int offset; - - inline CUDA_CALLABLE iterator(tile_register_t& t) : tile(t), offset(0) { } + return {index/N, index%N}; + } - inline CUDA_CALLABLE T& operator*() const + // copy shared tile to register + inline CUDA_CALLABLE tile_register_t get() + { + tile_register_t out; + + WP_PRAGMA_UNROLL + for (int i=0; i < out.NumRegs; ++i) { - assert(offset < NumRegs); + const int linear = out.index(i); + + // handle case where tile size is not + // aligned to block dimensions + if (linear > Size) + break; - return tile.data[offset]; + out.data[i] = (*this)(linear); } - inline CUDA_CALLABLE iterator& operator++() { ++offset; return *this; } - inline CUDA_CALLABLE bool valid() const { return index() < tile_size(tile); } - // linear index into the tile's data (assuming row-major layout) - inline CUDA_CALLABLE int index() const { return threadIdx.x + offset*WP_TILE_BLOCK_DIM;} - inline CUDA_CALLABLE coord_t coord() const + return out; + } + + // copy register tile to shared + inline CUDA_CALLABLE void assign(const tile_register_t& tile) + { + WP_PRAGMA_UNROLL + for (int i=0; i < tile.NumRegs; ++i) { - int i = index(); - return {i/N, i%N}; + const int linear = tile.index(i); + + // handle case where tile size is not + // aligned to block dimensions + if (linear > Size) + break; + + // todo: should use coord here to handle cases where + // shared tile is a slice? 
+ data[linear] = tile.data[i]; } - }; + } - iterator iter() { return iterator(*this); } + inline CUDA_CALLABLE void print() + { + if (threadIdx.x == 0) + { + printf("["); + for (int i=0; i < M; ++i) + { + printf("%*s[", i>0, ""); + for (int j=0; j < N; ++j) + { + printf("%5.2f ", data(i, j)); + } + + if (i == M-1) + printf("]]\n"); + else + printf("]\n"); + } + } + } }; +template +inline CUDA_CALLABLE auto tile_transpose(Tile& t) +{ + // alias incoming tile + return tile_shared_t(t.data); +} //----------------------------------------------------------------------------------------------------- @@ -287,7 +357,7 @@ inline CUDA_CALLABLE auto tile_zeros() } -// entry point for store +// entry point for load template inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) { @@ -297,12 +367,21 @@ inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) tile_shared_t dest(data); + const int tile_i = x*M; + const int tile_j = y*N; + + // wp.array() indexing generates poor code due to char* casting + // here we unroll some of the ops, note this assumes byte strides are + // aligned to the element size + T* ptr = &index(src, tile_i, tile_j); + const int stride_i = src.strides[0]/sizeof(T); + const int stride_j = src.strides[1]/sizeof(T); + WP_PRAGMA_UNROLL - for (auto dst_iter=dest.iter(); dst_iter.valid(); ++dst_iter) + for (int i=threadIdx.x; i < length; i += WP_TILE_BLOCK_DIM) { - coord_t c = dst_iter.coord(); - - *dst_iter = index(src, x*M + c.i, y*N + c.j); + coord_t c = dest.coord(i); + dest.data[i] = ptr[c.i*stride_i + c.j*stride_j]; //index(src, tile_i + c.i, tile_j + c.j); } return dest; @@ -312,19 +391,34 @@ inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) template inline CUDA_CALLABLE void tile_store(array_t& dest, int x, int y, Tile& src) { - const int M = src.M; - const int N = src.N; - - // cooperatively store the tile, using a block-stride iterator - WP_PRAGMA_UNROLL - for (auto src_iter=src.iter(); src_iter.valid(); ++src_iter) - { - coord_t c = src_iter.coord(); + auto src_reg = src.get(); - index(dest, x*M + c.i, y*N + c.j) = *src_iter; + const int tile_i = x*src.M; + const int tile_j = y*src.N; + + // wp.array() indexing generates poor code due to char* casting + // here we unroll some of the ops, note this assumes byte strides are + // aligned to the element size + T* ptr = &index(dest, tile_i, tile_j); + const int stride_i = dest.strides[0]/sizeof(T); + const int stride_j = dest.strides[1]/sizeof(T); + + WP_PRAGMA_UNROLL + for (int i=0; i < src_reg.NumRegs; ++i) + { + // handle case where tile size is not + // aligned to block dimensions + int index = src_reg.index(i); + if (index > src_reg.Size) + break; + + coord_t c = src_reg.coord(index); + ptr[c.i*stride_i + c.j*stride_j] = src_reg.data[i]; //index(dest, tile_i + c.i, tile_j + c.j); } } + + //------------------------------------- // Adjoints @@ -332,129 +426,146 @@ template inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, array_t& adj_src, int adj_x, int adj_y, AdjTile& adj_ret) -{ - // add gradients to src array - for (auto adj_iter=adj_ret.iter(); adj_iter.valid(); ++adj_iter) +{ + // early out + // if (!src.grad) + // return; + + auto adj_reg = adj_ret.get(); + + const int tile_i = x*adj_reg.M; + const int tile_j = y*adj_reg.N; + + // add gradients to src array + WP_PRAGMA_UNROLL + for (int i=0; i < adj_reg.NumRegs; ++i) { - coord_t c = adj_iter.coord(); + int linear = adj_reg.index(i); + if (linear > adj_reg.Size) + break; - int i = x*adj_ret.M + c.i; - int j = 
y*adj_ret.N + c.j; + coord_t coord = adj_reg.coord(linear); - auto grad = *adj_iter; + auto grad = adj_reg.data[i]; if (adj_src.data) - adj_atomic_add(&index(adj_src, i, j), grad); + adj_atomic_add(&index(adj_src, tile_i + coord.i, tile_j + coord.j), grad); else if (src.grad) - adj_atomic_add(&index_grad(src, i, j), grad); + adj_atomic_add(&index_grad(src, tile_i + coord.i, tile_j + coord.j), grad); } } template inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, AdjTile& adj_t) { - const int M = t.M; - const int N = t.N; + // if (!dest.grad) + // return; + + // convert to register if necessary + auto adj_reg = adj_t.get(); + + const int tile_i = x*adj_reg.M; + const int tile_j = y*adj_reg.N; // load gradients from output WP_PRAGMA_UNROLL - for (auto adj_iter=adj_t.iter(); adj_iter.valid(); ++adj_iter) + for (int i=0; i < adj_reg.NumRegs; ++i) { - coord_t c = adj_iter.coord(); + int linear = adj_reg.index(i); + if (linear > adj_reg.Size) + break; - int i = x*M + c.i; - int j = y*N + c.j; + coord_t coord = adj_reg.coord(linear); - if (adj_dest.data) - *adj_iter += index(adj_dest, i, j); + if (adj_dest.data) + adj_reg.data[i] += index(adj_dest, tile_i + coord.i, tile_j + coord.j); else if (dest.grad) - *adj_iter += index_grad(dest, i, j); + adj_reg.data[i] += index_grad(dest, tile_i + coord.i, tile_j + coord.j); } + + // store adjoint back to tile + adj_t.assign(adj_reg); } // unary map template -auto tile_map(Fwd op, - Tile &a) +inline CUDA_CALLABLE auto tile_map(Fwd op, + Tile &a) { auto out = tile_register_t(); - - auto out_iter = out.iter(); - auto a_iter = a.iter(); - + auto a_reg = a.get(); + WP_PRAGMA_UNROLL - for (; out_iter.valid(); ++out_iter, ++a_iter) - *out_iter = op(*a_iter); - - // WP_PRAGMA_UNROLL - // for (int i=0; i < Tile::NumRegs; ++i) - // out.data[i] = op(a.data[i]); + for (int i=0; i < out.NumRegs; ++i) + { + out.data[i] = op(a_reg.data[i]); + } return out; } template -void adj_tile_map(Fwd op, - Tile &a, - Adj adj_op, - Tile &adj_a, - AdjTile &adj_ret) +inline CUDA_CALLABLE void adj_tile_map(Fwd op, + Tile& a, + Adj adj_op, + Tile& adj_a, + AdjTile& adj_ret) { - auto a_iter = a.iter(); - auto adj_a_iter = adj_a.iter(); - auto adj_ret_iter = adj_ret.iter(); + auto a_reg = a.get(); + auto adj_a_reg = adj_a.get(); + auto adj_ret_reg = adj_ret.get(); WP_PRAGMA_UNROLL - for (; a_iter.valid(); ++a_iter, ++adj_a_iter, ++adj_ret_iter) - { - adj_op(*a_iter, *adj_a_iter, *adj_ret_iter); + for (int i=0; i < a_reg.NumRegs; ++i) + { + adj_op(a_reg.data[i], adj_a_reg.data[i], adj_ret_reg.data[i]); } + + // write adjoints back + adj_a.assign(adj_a_reg); } // binary map template -auto tile_map(Fwd op, - TileA &a, - TileB &b) +inline CUDA_CALLABLE auto tile_map(Fwd op, + TileA& a, + TileB& b) { auto out = tile_register_t(); - auto out_iter = out.iter(); - auto a_iter = a.iter(); - auto b_iter = b.iter(); + auto a_reg = a.get(); + auto b_reg = b.get(); WP_PRAGMA_UNROLL - for (; out_iter.valid(); ++out_iter, ++a_iter, ++b_iter) - *out_iter = op(*a_iter, *b_iter); - - // WP_PRAGMA_UNROLL - // for (int i=0; i < TileA::NumRegs; ++i) - // out.data[i] = op(a.data[i], b.data[i]); - + for (int i=0; i < out.NumRegs; ++i) + out.data[i] = op(a_reg.data[i], b_reg.data[i]); return out; } template -void adj_tile_map(Fwd op, - TileA &a, - TileB &b, - Adj adj_op, - TileA &adj_a, - TileB &adj_b, - AdjTile &adj_ret) +inline CUDA_CALLABLE void adj_tile_map(Fwd op, + TileA &a, + TileB &b, + Adj adj_op, + TileA &adj_a, + TileB &adj_b, + 
AdjTile &adj_ret) { - auto a_iter = a.iter(); - auto b_iter = b.iter(); - auto adj_a_iter = adj_a.iter(); - auto adj_b_iter = adj_b.iter(); - auto adj_ret_iter = adj_ret.iter(); + auto a_reg = a.get(); + auto b_reg = b.get(); + auto adj_a_reg = adj_a.get(); + auto adj_b_reg = adj_b.get(); + auto adj_ret_reg = adj_ret.get(); WP_PRAGMA_UNROLL - for (; a_iter.valid(); ++a_iter, ++b_iter, ++adj_a_iter, ++adj_b_iter, ++adj_ret_iter) + for (int i=0; i < a_reg.NumRegs; ++i) { - adj_op(*a_iter, *b_iter, *adj_a_iter, *adj_b_iter, *adj_ret_iter); + adj_op(a_reg.data[i], b_reg.data[i], adj_a_reg.data[i], adj_b_reg.data[i], adj_ret_reg.data[i]); } + + adj_a.assign(adj_a_reg); + adj_b.assign(adj_b_reg); } // wrap the operator in a lambda so that we don't have to do overload resolution for things like e.g.: wp.sin() @@ -468,10 +579,10 @@ void adj_tile_map(Fwd op, // unary neg template -auto tile_neg(Tile& a) { return tile_unary_map(wp::neg, a); } +inline CUDA_CALLABLE auto tile_neg(Tile& a) { return tile_unary_map(wp::neg, a); } template -void adj_tile_neg(Tile& a, Tile& adj_a, AdjTile& adj_ret) { adj_tile_unary_map(wp::neg, a, wp::adj_neg, adj_a, adj_ret); } +inline CUDA_CALLABLE void adj_tile_neg(Tile& a, Tile& adj_a, AdjTile& adj_ret) { adj_tile_unary_map(wp::neg, a, wp::adj_neg, adj_a, adj_ret); } /* @@ -520,20 +631,20 @@ CUDA_CALLABLE inline auto tile_mul_impl(typename Tile::Type s, Tile& t, */ template -auto tile_add(TileA& a, TileB& b) +inline CUDA_CALLABLE auto tile_add(TileA& a, TileB& b) { return tile_binary_map(add, a, b); } template -void adj_tile_add(TileA& a, TileB& b, TileA& adj_a, TileB& adj_b, AdjTile& adj_c) +inline CUDA_CALLABLE void adj_tile_add(TileA& a, TileB& b, TileA& adj_a, TileB& adj_b, AdjTile& adj_c) { adj_tile_binary_map(add, a, b, adj_add, adj_a, adj_b, adj_c); } // tile*scalar template -auto tile_mul(Tile& a, const typename Tile::Type& s) +inline CUDA_CALLABLE auto tile_mul(Tile& a, const typename Tile::Type& s) { // promote scalar to a constant tile auto s_tile = tile_register_t(s); @@ -542,9 +653,9 @@ auto tile_mul(Tile& a, const typename Tile::Type& s) } template -void adj_tile_mul(Tile& a, const typename Tile::Type& s, - Tile& adj_a, typename Tile::Type& adj_s, - AdjTile& adj_c) +inline CUDA_CALLABLE void adj_tile_mul(Tile& a, const typename Tile::Type& s, + Tile& adj_a, typename Tile::Type& adj_s, + AdjTile& adj_c) { // auto s_tile = tile_register_t(s); // auto adj_s_tile = tile_register_t(); @@ -558,7 +669,7 @@ void adj_tile_mul(Tile& a, const typename Tile::Type& s, // scalar*tile template -auto tile_mul(const typename Tile::Type& s, Tile& a) +inline CUDA_CALLABLE auto tile_mul(const typename Tile::Type& s, Tile& a) { // promote scalar to a constant tile auto s_tile = tile_register_t(s); @@ -567,9 +678,9 @@ auto tile_mul(const typename Tile::Type& s, Tile& a) } template -void adj_tile_mul(const typename Tile::Type& s, Tile& a, - typename Tile::Type& adj_s, Tile& adj_a, - AdjTile& adj_c) +inline CUDA_CALLABLE void adj_tile_mul(const typename Tile::Type& s, Tile& a, + typename Tile::Type& adj_s, Tile& adj_a, + AdjTile& adj_c) { // auto s_tile = tile_register_t(s); // auto adj_s_tile = tile_register_t(); diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 2a025362..e52e0b10 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -6,13 +6,14 @@ wp.init() wp.set_module_options({"enable_backward": True}) wp.set_device("cuda:0") +wp.set_module_options({"fast_math": True}) #wp.config.mode = "debug" #wp.config.verify_cuda = True 
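
The register-tile map machinery above is exercised from Python through wp.tile_map() together with wp.tile_load()/wp.tile_store(). Below is a minimal sketch of that usage in the style of the unary-map test that follows, assuming the keyword-form load signature and the tile_size launch argument used at this point in the series; the kernel name, the TILE_DIM value, and the choice of wp.sin as the mapped built-in are illustrative assumptions (the lambda-wrapping macros above exist precisely so that built-ins like wp.sin can be passed without manual overload resolution):

    import numpy as np
    import warp as wp

    wp.init()
    wp.set_device("cuda:0")

    TILE_M = wp.constant(32)
    TILE_N = wp.constant(16)
    TILE_DIM = 64   # threads cooperating on each tile (assumed value)

    @wp.kernel
    def tile_sin_kernel(A: wp.array2d(dtype=float),
                        B: wp.array2d(dtype=float)):

        # tile index for this block of threads
        i, j = wp.tid()

        # cooperative load of a TILE_M x TILE_N tile
        a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N)

        # element-wise map of a built-in over the tile
        s = wp.tile_map(wp.sin, a)

        wp.tile_store(B, i, j, s)

    M = TILE_M * 7
    N = TILE_N * 5

    rng = np.random.default_rng(42)
    A_wp = wp.array(rng.random((M, N), dtype=np.float32))
    B_wp = wp.zeros_like(A_wp)

    wp.launch(tile_sin_kernel, dim=[int(M / TILE_M), int(N / TILE_N)],
              inputs=[A_wp, B_wp], tile_size=TILE_DIM)

    assert np.allclose(np.sin(A_wp.numpy()), B_wp.numpy(), atol=1.e-4)
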
wp.build.clear_kernel_cache() -TILE_M = wp.constant(16) -TILE_N = wp.constant(8) +TILE_M = wp.constant(32) +TILE_N = wp.constant(16) TILE_K = wp.constant(8) # num threads per-tile @@ -93,7 +94,7 @@ def test_tile_unary_map(): wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=TILE_DIM) # verify forward pass - assert(np.allclose(B, B_wp.numpy(), rtol=1.e-4)) + assert(np.allclose(B, B_wp.numpy(), atol=1.e-4)) print("Unary map forward passed") # verify backward pass From 7d8ddbc003dc8d264da3c86111b8a86a63cae5e1 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 16 Sep 2024 13:03:06 +1200 Subject: [PATCH 023/102] Automatically set tile storage to shared based on GEMM usage --- warp/builtins.py | 5 + warp/codegen.py | 2 +- warp/native/tile.h | 371 ++++++++++++++++++++++++++-------------- warp/native/tile_gemm.h | 6 +- warp/tests/test_tile.py | 12 +- warp/types.py | 11 +- 6 files changed, 264 insertions(+), 143 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 7e3c5722..1757d469 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1875,6 +1875,11 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a b = arg_values["b"] out = arg_values["out"] + # set the storage type to the inputs to shared + a.type.storage = "shared" + b.type.storage = "shared" + out.type.storage = "shared" + # template_args.append(dtype) # template_args.append(m) # template_args.append(n) diff --git a/warp/codegen.py b/warp/codegen.py index 88c12d8a..93997b07 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -3033,7 +3033,7 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): ctype = var.ctype(value_type=True) if is_tile(var.type) and var.type.storage == "shared": - lines += [f"{ctype} {name} = wp::tile_alloc_shared<{Var.type_to_ctype(var.type.dtype)},{var.type.M},{var.type.N},{var.type.alloc()}>();\n"] + lines += [f"{ctype} {name} = {{0}};\n"] else: lines += [f"{ctype} {name} = {{}};\n"] diff --git a/warp/native/tile.h b/warp/native/tile.h index b271ccd3..5df1e670 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -119,7 +119,23 @@ inline CUDA_CALLABLE T* tile_alloc_shared() return data; } +// represents a tile stored in global memory with dynamic strides +// only used to represent the source for tile loads to register/shared +template +struct tile_global_t +{ + using Type = T; + array_t data; + int x; + int y; + + tile_global_t(array_t& a, int x, int y) : data(a), x(x), y(y) + { + } +}; + +// represents a tile stored in registers across a block template struct tile_register_t { @@ -130,6 +146,8 @@ struct tile_register_t static constexpr int NumRegs = tile_regcount(M, N); + static constexpr bool Aligned = Size%WP_TILE_BLOCK_DIM == 0; + T data[NumRegs]; inline CUDA_CALLABLE tile_register_t(T value=T(0.0)) @@ -144,6 +162,33 @@ struct tile_register_t data[i] = value; } + inline CUDA_CALLABLE tile_register_t(tile_global_t& t) + { + // construct from a global tile + copy_from_global(t.data, t.x, t.y); + } + + + inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) + { + // assign from a global tile + copy_from_global(t.data, t.x, t.y); + return *this; + } + + + inline CUDA_CALLABLE T& operator()(int index) + { + assert(index < NumRegs); + return data[index]; + } + + inline CUDA_CALLABLE const T& operator()(int index) const + { + assert(index < NumRegs); + return data[index]; + } + // compute linear tile index from a local register index inline CUDA_CALLABLE int index(int reg) const { @@ -165,50 
+210,133 @@ struct tile_register_t return (Size - threadIdx.x)/WP_TILE_BLOCK_DIM; } - // return the in-register version of this tile (nop) - inline CUDA_CALLABLE auto& get() { return *this; } - inline CUDA_CALLABLE void assign(const tile_register_t& tile) { for (int i=0; i < NumRegs; ++i) data[i] = tile.data[i]; } - - inline CUDA_CALLABLE void print() + // return the in-register version of this tile (nop) + inline CUDA_CALLABLE auto& copy_to_register() { return *this; } + + + void copy_to_global(array_t dest, int x, int y) { - printf("tid: %d ", threadIdx.x); + const int tile_i = x*M; + const int tile_j = y*N; + // wp.array() indexing generates poor code due to char* casting + // here we unroll some of the ops, note this assumes byte strides are + // aligned to the element size + T* ptr = &wp::index(dest, tile_i, tile_j); + const int stride_i = dest.strides[0]/sizeof(T); + const int stride_j = dest.strides[1]/sizeof(T); + + WP_PRAGMA_UNROLL for (int i=0; i < NumRegs; ++i) { - printf("%f ", data[i]); + // handle case where tile size is not + // aligned to block dimensions + int linear = index(i); + if (!Aligned && linear >= Size) + break; + + coord_t c = coord(linear); + ptr[c.i*stride_i + c.j*stride_j] = data[i]; } + } + + inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x, int y) + { + // todo: use async pipelines or TMA here + const int tile_i = x*M; + const int tile_j = y*N; + + // wp.array() indexing generates poor code due to char* casting + // here we unroll some of the ops, note this assumes array byte strides are + // aligned to the element size + const T* ptr = &wp::index(src, tile_i, tile_j); + + assert(src.strides[0]%sizeof(T) == 0); + assert(src.strides[1]%sizeof(T) == 0); - printf("\n"); + const int stride_i = src.strides[0]/sizeof(T); + const int stride_j = src.strides[1]/sizeof(T); + + WP_PRAGMA_UNROLL + for (int i=0; i < NumRegs; ++i) + { + int linear = index(i); + if (!Aligned && linear >= Size) + break; + + coord_t c = coord(linear); + data[i] = ptr[c.i*stride_i + c.j*stride_j]; + } } }; -template +template struct tile_shared_t { using Type = T; static constexpr int M = M_; static constexpr int N = N_; static constexpr int Size = M*N; + static constexpr int Alloc = Alloc_; static constexpr int StrideM = StrideM_; static constexpr int StrideN = StrideN_; + static constexpr bool Aligned = Size%WP_TILE_BLOCK_DIM == 0; + T* data = NULL; - inline CUDA_CALLABLE tile_shared_t() {} + // default initialization (non-initialized) + inline CUDA_CALLABLE tile_shared_t() + { + data = tile_alloc_shared(); + } + + // zero initialization, handles adj_tile = {0} syntax + inline CUDA_CALLABLE tile_shared_t(int nil) + { + data = tile_alloc_shared(); + zero(); + } + + // initialize from an existing tile's memory inline CUDA_CALLABLE tile_shared_t(T* smem) : data(smem) { + } + + // construct from a global tile + inline CUDA_CALLABLE tile_shared_t(tile_global_t& t) + { + copy_from_global(t.array, t.x, t.y); + } + + // assign from a global tile + inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) + { + copy_from_global(t.data, t.x, t.y); + return *this; } + // assign from a constant value + inline CUDA_CALLABLE auto& operator=(const T& x) + { + // todo: make this subtile (stride aware) + for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + data[i] = x; + + return *this; + } + + inline CUDA_CALLABLE T& operator()(int i, int j) { assert(i < M); @@ -247,45 +375,18 @@ struct tile_shared_t return (*this)(i,j); } - // in-place zero - inline CUDA_CALLABLE void zero() - 
{ - // todo: make this subtile (stride aware) - for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) - data[i] = T(0); - } - - // compute linear tile index from a local register index - inline CUDA_CALLABLE int index(int reg) const - { - return threadIdx.x + reg*WP_TILE_BLOCK_DIM; - } - // compute tile coordinate from linear index inline CUDA_CALLABLE coord_t coord(int index) const { return {index/N, index%N}; } - // copy shared tile to register - inline CUDA_CALLABLE tile_register_t get() - { - tile_register_t out; - - WP_PRAGMA_UNROLL - for (int i=0; i < out.NumRegs; ++i) - { - const int linear = out.index(i); - - // handle case where tile size is not - // aligned to block dimensions - if (linear > Size) - break; - - out.data[i] = (*this)(linear); - } - - return out; + // in-place zero + inline CUDA_CALLABLE void zero() + { + // todo: make this subtile (stride aware) + for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + data[i] = T(0); } // copy register tile to shared @@ -298,12 +399,10 @@ struct tile_shared_t // handle case where tile size is not // aligned to block dimensions - if (linear > Size) + if (!Aligned && linear >= Size) break; - // todo: should use coord here to handle cases where - // shared tile is a slice? - data[linear] = tile.data[i]; + (*this)(linear) = tile.data[i]; } } @@ -317,7 +416,7 @@ struct tile_shared_t printf("%*s[", i>0, ""); for (int j=0; j < N; ++j) { - printf("%5.2f ", data(i, j)); + printf("%5.2f ", (*this)(i, j)); } if (i == M-1) @@ -327,33 +426,91 @@ struct tile_shared_t } } } + + // copy shared tile to register + inline CUDA_CALLABLE tile_register_t copy_to_register() + { + tile_register_t out; + + WP_PRAGMA_UNROLL + for (int i=0; i < out.NumRegs; ++i) + { + const int linear = out.index(i); + + // handle case where tile size is not + // aligned to block dimensions + if (!Aligned && linear >= Size) + break; + + out(i) = (*this)(linear); + } + + return out; + } + + inline CUDA_CALLABLE void copy_to_global(array_t dest, int x, int y) + { + // todo: use TMA here + const int tile_i = x*M; + const int tile_j = y*N; + + // wp.array() indexing generates poor code due to char* casting + // here we unroll some of the ops, note this assumes byte strides are + // aligned to the element size + T* ptr = &wp::index(dest, tile_i, tile_j); + const int stride_i = dest.strides[0]/sizeof(T); + const int stride_j = dest.strides[1]/sizeof(T); + + WP_PRAGMA_UNROLL + for (int i=threadIdx.x; i < Size; i += WP_TILE_BLOCK_DIM) + { + coord_t c = coord(i); + ptr[c.i*stride_i + c.j*stride_j] = (*this)(c.i, c.j); + } + } + + inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x, int y) + { + // todo: use async pipelines or TMA here + const int tile_i = x*M; + const int tile_j = y*N; + + // wp.array() indexing generates poor code due to char* casting + // here we unroll some of the ops, note this assumes array byte strides are + // aligned to the element size + const T* ptr = &wp::index(src, tile_i, tile_j); + + assert(src.strides[0]%sizeof(T) == 0); + assert(src.strides[1]%sizeof(T) == 0); + + const int stride_i = src.strides[0]/sizeof(T); + const int stride_j = src.strides[1]/sizeof(T); + + WP_PRAGMA_UNROLL + for (int i=threadIdx.x; i < Size; i += WP_TILE_BLOCK_DIM) + { + coord_t c = coord(i); + (*this)(c.i, c.j) = ptr[c.i*stride_i + c.j*stride_j]; + } + } }; template inline CUDA_CALLABLE auto tile_transpose(Tile& t) -{ +{ // alias incoming tile - return tile_shared_t(t.data); + return tile_shared_t(t.data); } 
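
At the Python level, the effect of this change is that wp.tile_load() now produces a tile_global_t reference whose storage (register or shared) is decided when it is assigned, while wp.tile_matmul() promotes its operands to shared memory via the dispatch function in builtins.py. A rough sketch of a GEMM kernel exercising both paths, following the loop-over-K pattern of the tile_gemm test; the kernel name, the TILE_DIM value, and the host setup are assumptions for illustration:

    import numpy as np
    import warp as wp

    wp.init()
    wp.set_device("cuda:0")

    TILE_M = wp.constant(8)
    TILE_N = wp.constant(4)
    TILE_K = wp.constant(8)
    TILE_DIM = 64   # threads cooperating on each tile (assumed value)

    @wp.kernel
    def tile_gemm_sketch(A: wp.array2d(dtype=float),
                         B: wp.array2d(dtype=float),
                         C: wp.array2d(dtype=float)):

        # output tile index
        i, j = wp.tid()

        # accumulator tile; tile_matmul() forces it into shared storage
        sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)

        count = int(A.shape[1] / TILE_K)

        for k in range(count):
            # loads return global references, materialized on assignment
            a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
            b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)

            # sum += a*b
            wp.tile_matmul(a, b, sum)

        wp.tile_store(C, i, j, sum)

    M, K, N = TILE_M * 7, TILE_K * 6, TILE_N * 5

    rng = np.random.default_rng(42)
    A_wp = wp.array(rng.random((M, K), dtype=np.float32))
    B_wp = wp.array(rng.random((K, N), dtype=np.float32))
    C_wp = wp.zeros((M, N), dtype=float)

    wp.launch(tile_gemm_sketch, dim=[int(M / TILE_M), int(N / TILE_N)],
              inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM)

    assert np.allclose(A_wp.numpy() @ B_wp.numpy(), C_wp.numpy(), rtol=1.e-4)
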
//----------------------------------------------------------------------------------------------------- // High level entry points for each op (correspond to one Warp builtin) -template +template inline CUDA_CALLABLE auto tile_zeros() { - const int length = M*N; - - WP_TILE_SHARED __align__(16) T data[length]; - - WP_PRAGMA_UNROLL - for (int t=threadIdx.x; t < length; t += WP_TILE_BLOCK_DIM) - { - data[t] = T(0.0); - } - - return tile_shared_t(data); + // tile variable assignment operator will handle initialization + return T(0.0); } @@ -361,64 +518,20 @@ inline CUDA_CALLABLE auto tile_zeros() template inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) { - const int length = M*N; - - WP_TILE_SHARED __align__(16) T data[length]; - - tile_shared_t dest(data); - - const int tile_i = x*M; - const int tile_j = y*N; - - // wp.array() indexing generates poor code due to char* casting - // here we unroll some of the ops, note this assumes byte strides are - // aligned to the element size - T* ptr = &index(src, tile_i, tile_j); - const int stride_i = src.strides[0]/sizeof(T); - const int stride_j = src.strides[1]/sizeof(T); - - WP_PRAGMA_UNROLL - for (int i=threadIdx.x; i < length; i += WP_TILE_BLOCK_DIM) - { - coord_t c = dest.coord(i); - dest.data[i] = ptr[c.i*stride_i + c.j*stride_j]; //index(src, tile_i + c.i, tile_j + c.j); - } - - return dest; + // just return a ref. to the global memory + // it will be loaded to shared or registers + // on assignment to the variable + return tile_global_t(src, x, y); } // entry point for store template inline CUDA_CALLABLE void tile_store(array_t& dest, int x, int y, Tile& src) { - auto src_reg = src.get(); - - const int tile_i = x*src.M; - const int tile_j = y*src.N; - - // wp.array() indexing generates poor code due to char* casting - // here we unroll some of the ops, note this assumes byte strides are - // aligned to the element size - T* ptr = &index(dest, tile_i, tile_j); - const int stride_i = dest.strides[0]/sizeof(T); - const int stride_j = dest.strides[1]/sizeof(T); - - WP_PRAGMA_UNROLL - for (int i=0; i < src_reg.NumRegs; ++i) - { - // handle case where tile size is not - // aligned to block dimensions - int index = src_reg.index(i); - if (index > src_reg.Size) - break; - - coord_t c = src_reg.coord(index); - ptr[c.i*stride_i + c.j*stride_j] = src_reg.data[i]; //index(dest, tile_i + c.i, tile_j + c.j); - } + // dispatch to tile type + src.copy_to_global(dest, x, y); } - - //------------------------------------- // Adjoints @@ -431,7 +544,7 @@ inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, // if (!src.grad) // return; - auto adj_reg = adj_ret.get(); + auto adj_reg = adj_ret.copy_to_register(); const int tile_i = x*adj_reg.M; const int tile_j = y*adj_reg.N; @@ -441,7 +554,7 @@ inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, for (int i=0; i < adj_reg.NumRegs; ++i) { int linear = adj_reg.index(i); - if (linear > adj_reg.Size) + if (!adj_reg.Aligned && linear >= adj_reg.Size) break; coord_t coord = adj_reg.coord(linear); @@ -449,7 +562,7 @@ inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, auto grad = adj_reg.data[i]; if (adj_src.data) - adj_atomic_add(&index(adj_src, tile_i + coord.i, tile_j + coord.j), grad); + adj_atomic_add(&index(adj_src, tile_i + coord.i, tile_j + coord.j), grad); else if (src.grad) adj_atomic_add(&index_grad(src, tile_i + coord.i, tile_j + coord.j), grad); } @@ -462,7 +575,7 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t // 
return; // convert to register if necessary - auto adj_reg = adj_t.get(); + auto adj_reg = adj_t.copy_to_register(); const int tile_i = x*adj_reg.M; const int tile_j = y*adj_reg.N; @@ -472,13 +585,13 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t for (int i=0; i < adj_reg.NumRegs; ++i) { int linear = adj_reg.index(i); - if (linear > adj_reg.Size) + if (!adj_reg.Aligned && linear >= adj_reg.Size) break; coord_t coord = adj_reg.coord(linear); if (adj_dest.data) - adj_reg.data[i] += index(adj_dest, tile_i + coord.i, tile_j + coord.j); + adj_reg.data[i] += index(adj_dest, tile_i + coord.i, tile_j + coord.j); else if (dest.grad) adj_reg.data[i] += index_grad(dest, tile_i + coord.i, tile_j + coord.j); } @@ -493,7 +606,7 @@ inline CUDA_CALLABLE auto tile_map(Fwd op, Tile &a) { auto out = tile_register_t(); - auto a_reg = a.get(); + auto a_reg = a.copy_to_register(); WP_PRAGMA_UNROLL for (int i=0; i < out.NumRegs; ++i) @@ -511,9 +624,9 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, Tile& adj_a, AdjTile& adj_ret) { - auto a_reg = a.get(); - auto adj_a_reg = adj_a.get(); - auto adj_ret_reg = adj_ret.get(); + auto a_reg = a.copy_to_register(); + auto adj_a_reg = adj_a.copy_to_register(); + auto adj_ret_reg = adj_ret.copy_to_register(); WP_PRAGMA_UNROLL for (int i=0; i < a_reg.NumRegs; ++i) @@ -533,8 +646,8 @@ inline CUDA_CALLABLE auto tile_map(Fwd op, { auto out = tile_register_t(); - auto a_reg = a.get(); - auto b_reg = b.get(); + auto a_reg = a.copy_to_register(); + auto b_reg = b.copy_to_register(); WP_PRAGMA_UNROLL for (int i=0; i < out.NumRegs; ++i) @@ -552,11 +665,11 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, TileB &adj_b, AdjTile &adj_ret) { - auto a_reg = a.get(); - auto b_reg = b.get(); - auto adj_a_reg = adj_a.get(); - auto adj_b_reg = adj_b.get(); - auto adj_ret_reg = adj_ret.get(); + auto a_reg = a.copy_to_register(); + auto b_reg = b.copy_to_register(); + auto adj_a_reg = adj_a.copy_to_register(); + auto adj_b_reg = adj_b.copy_to_register(); + auto adj_ret_reg = adj_ret.copy_to_register(); WP_PRAGMA_UNROLL for (int i=0; i < a_reg.NumRegs; ++i) diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index faf807ad..1ca668d3 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -321,11 +321,11 @@ void tile_matmul(TileA& a, TileB& b, TileC& c) } -template +template void adj_tile_matmul(TileA& a, TileB& b, TileC& c, - TileA& adj_a, TileB& adj_b, TileC& adj_c) + AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c) { - tile_matmul_scalar(adj_c, wp::tile_transpose(b), adj_a); tile_matmul_scalar(wp::tile_transpose(a), adj_c, adj_b); } diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index e52e0b10..9f9b079b 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -12,8 +12,8 @@ wp.build.clear_kernel_cache() -TILE_M = wp.constant(32) -TILE_N = wp.constant(16) +TILE_M = wp.constant(8) +TILE_N = wp.constant(4) TILE_K = wp.constant(8) # num threads per-tile @@ -154,8 +154,8 @@ def test_tile_binary_map(): C_wp.grad = wp.ones_like(C_wp) tape.backward() - assert(np.allclose(A_wp.grad.numpy(), A_grad)) - assert(np.allclose(B_wp.grad.numpy(), B_grad)) + assert(np.allclose(A_wp.grad.numpy(), A_grad, rtol=1.e-2)) + assert(np.allclose(B_wp.grad.numpy(), B_grad, rtol=1.e-2)) print("Binary map backward passed") @@ -235,8 +235,8 @@ def tile_gemm(A: wp.array2d(dtype=float), def test_tile_gemm(): M = TILE_M*7 - K = TILE_K*5 - N = TILE_N*2 + K = TILE_K*6 + N = TILE_N*5 rng = np.random.default_rng(42) A = rng.random((M, K), 
dtype=np.float32) diff --git a/warp/types.py b/warp/types.py index 9bc6f7d7..f990a49e 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2881,13 +2881,16 @@ def ctype(self): if self.storage == "register": return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" elif self.storage == "shared": - return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + + # every shared memory tile will create a new static shared memory allocation + # this just needs to be a unique-id for templated allocation functions + return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>" # generate a unique allocation index for shared memory @classmethod def alloc(cls): - index = cls.allocation - cls.allocation += 1 + index = Tile.allocation + Tile.allocation += 1 return index class TileZeros(Tile): @@ -2905,7 +2908,7 @@ def __init__(self, dtype, M, N): class TileLoad(Tile): def __init__(self, array, M, N): - Tile.__init__(self, array.dtype, M, N, op="load", storage="shared") + Tile.__init__(self, array.dtype, M, N, op="load", storage="register") class TileUnaryMap(Tile): From f31bef61c3d1ac4e93defa86d7a6c1af2529738b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 16 Sep 2024 15:59:02 +1200 Subject: [PATCH 024/102] Implement operator backward pass --- warp/native/tile.h | 72 ++++++------------------ warp/tests/test_tile.py | 118 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 130 insertions(+), 60 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index 5df1e670..c3d0e965 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -690,7 +690,7 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, #define tile_binary_map(op, a, b) tile_map([](auto x, auto y) { return op(x, y);}, a, b) #define adj_tile_binary_map(op, a, b, adj_op, adj_a, adj_b, adj_ret) adj_tile_map([](auto x, auto y) { return op(x, y);}, a, b, [](auto x, auto y, auto& adj_x, auto& adj_y, auto adj_ret) { adj_op(x, y, adj_x, adj_y, adj_ret);}, adj_a, adj_b, adj_ret) -// unary neg +// -tile (unary neg) template inline CUDA_CALLABLE auto tile_neg(Tile& a) { return tile_unary_map(wp::neg, a); } @@ -698,51 +698,7 @@ template inline CUDA_CALLABLE void adj_tile_neg(Tile& a, Tile& adj_a, AdjTile& adj_ret) { adj_tile_unary_map(wp::neg, a, wp::adj_neg, adj_a, adj_ret); } -/* -// handle tile*scalar -template -CUDA_CALLABLE inline auto tile_mul_impl(Tile& t, typename Tile::Type s, - Tile& adj_t, typename Tile::Type adj_s) -{ - typedef typename Tile::Type T; - typedef tile_constant_t Constant; - - typedef tile_binary_map_t Op; - - typename Op::FwdOp fwd = [](T a, T b) { return mul(a, b); }; - typename Op::AdjOp adj = [](T a, T b, T& adj_a, T& adj_b, T& adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); }; - - // promote scalar to constant tile - Constant c(s, adj_s); - - return Op(t, c, fwd, adj); -} - -// handle scalar*tile -template -CUDA_CALLABLE inline auto tile_mul_impl(typename Tile::Type s, Tile& t, - typename Tile::Type adj_s, Tile& adj_t) -{ - typedef typename Tile::Type T; - typedef tile_constant_t Constant; - - typedef tile_binary_map_t Op; - - typename Op::FwdOp fwd = [](T a, T b) { return mul(a, b); }; - typename Op::AdjOp adj = [](T a, T b, T& adj_a, T& adj_b, T& adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); }; - - // promote scalar to constant tile - Constant c(s, adj_s); - - return Op(c, t, fwd, adj); - -} - - -#define tile_mul(a, b) tile_mul_impl(a, b adj_##a, adj_##b) -#define tile_add(a, b) tile_add_impl(a, b adj_##a, adj_##b) -*/ - +// 
tile + tile template inline CUDA_CALLABLE auto tile_add(TileA& a, TileB& b) { @@ -770,13 +726,15 @@ inline CUDA_CALLABLE void adj_tile_mul(Tile& a, const typename Tile::Type& s, Tile& adj_a, typename Tile::Type& adj_s, AdjTile& adj_c) { - // auto s_tile = tile_register_t(s); - // auto adj_s_tile = tile_register_t(); + auto s_tile = tile_register_t(s); + auto adj_s_tile = tile_register_t(); - // adj_tile_binary_map(mul, a, s_tile, adj_mul, adj_a, adj_s_tile, adj_c); + adj_tile_binary_map(mul, a, s_tile, adj_mul, adj_a, adj_s_tile, adj_c); - // todo: sum up contribution from all adj_s_tile onto original scalar - //adj_tile_sum() + for (int i=0; i < adj_s_tile.NumRegs; ++i) + { + adj_s += adj_s_tile.data[i]; + } } @@ -795,13 +753,15 @@ inline CUDA_CALLABLE void adj_tile_mul(const typename Tile::Type& s, Tile& a, typename Tile::Type& adj_s, Tile& adj_a, AdjTile& adj_c) { - // auto s_tile = tile_register_t(s); - // auto adj_s_tile = tile_register_t(); + auto s_tile = tile_register_t(s); + auto adj_s_tile = tile_register_t(); - // adj_tile_binary_map(mul, a, s_tile, adj_mul, adj_a, adj_s_tile, adj_c); + adj_tile_binary_map(mul, s_tile, a, adj_mul, adj_s_tile, adj_a, adj_c); - // todo: sum up contribution from all adj_s_tile onto original scalar - //adj_tile_sum() + for (int i=0; i < adj_s_tile.NumRegs; ++i) + { + adj_s += adj_s_tile.data[i]; + } } diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 9f9b079b..90bd3d66 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -301,15 +301,24 @@ def test_tile_operators(): input = rng.random((batch_count, M, N), dtype=np.float32) output = input*0.75 - input_wp = wp.array(input) - output_wp = wp.zeros_like(input_wp) + input_wp = wp.array(input, requires_grad=True) + output_wp = wp.zeros_like(input_wp, requires_grad=True) - wp.launch(tile_operators, dim=batch_count, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + with wp.Tape() as tape: + wp.launch(tile_operators, dim=batch_count, inputs=[input_wp, output_wp], tile_size=TILE_DIM) assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) print("operators forward passed") + output_wp.grad.fill_(1.0) + + tape.backward() + + assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.75, rtol=1.e-4)) + + print("operators backward passed") + test_tile_copy() @@ -317,4 +326,105 @@ def test_tile_operators(): test_tile_binary_map() test_tile_batched_gemm() test_tile_gemm() -test_tile_operators() \ No newline at end of file +test_tile_operators() + + +# #----------------------------------------- +# # center of mass computation + +# start = offset[i] +# end = offset[i+1] + +# com = wp.tile_zeros(dtype=wp.vec3, M=1) + +# # load chunks of indices +# for i in range(start, end, N): + +# count = wp.min(N, end-i) + +# idx = wp.tile_load(indices, i, N, max_col=count) +# p = wp.tile_load(points, idx, max_col=count) + +# com += wp.tile_sum(p) + + +# wp.tile_store(out[i], com) + + + +# #------------------------------------------- +# # compute deformation gradient + +# i = +# j = +# k = +# l = + +# f = wp.tile(F) # generate a block size tile of feature vectors + +# # layer 1 +# w1 = wp.tile_load(weights) +# b1 = wp.tile_load(bias) + +# z = wp.tile_matmul(w1, f) + b1 +# z = wp.tile_map(relu, z) + +# # layer 2 +# w2 = wp.tile_load(weights) +# b2 = wp.tile_load(bias) + +# z = wp.tile_matmul(w2, z) + b2 +# z = wp.tile_map(relu, z) + +# o = wp.untile(f) + + +# #---------------------------------- +# # MLP with helper function for linear layers +# # where shape is only partially known +# # 
at compile time, and the other dims +# # are inferred from the input vector + +# f = wp.tile(F) + +# z = wp.tile_linear(weights1, bias1, f, hidden=16) +# z = wp.tile_map(relu, z) + +# z = wp.tile_linear(weights2, bias2, f, hidden=8) +# z = wp.tile_map(relu, z) + +# z = wp.tile_linear(weights3, bias3, f, hidden=4) +# z = wp.tile_map(relu, z) + +# o = wp.untile(z) + + + +# #---------------------------------- +# # softmax + +# def softmax(z: Any): + +# e = wp.tile_map(wp.exp, z) +# s = wp.tile_sum(e, dim=0) + +# return z/s[0] + + + + + + + + + + + + + + + + + + + From 5e18a5f964515ec0caf02a61e1e6cf33a0e245f2 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 17 Sep 2024 15:59:53 +1200 Subject: [PATCH 025/102] Added `wp.tile_sum()` for whole tile reductions --- warp/builtins.py | 43 +++++++++---- warp/native/builtin.h | 1 + warp/native/tile_reduce.h | 129 ++++++++++++++++++++++++++++++++++++++ warp/tests/test_tile.py | 46 ++++++++++++++ 4 files changed, 206 insertions(+), 13 deletions(-) create mode 100644 warp/native/tile_reduce.h diff --git a/warp/builtins.py b/warp/builtins.py index 1757d469..e10204e5 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1880,19 +1880,7 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a b.type.storage = "shared" out.type.storage = "shared" - # template_args.append(dtype) - # template_args.append(m) - # template_args.append(n) - - # global shared_memory_id - template_args = [] - # template_args.append(shared_memory_id) - - # # matmul makes two allocations (one for each of its arguments) - # shared_memory_id += 1 - # shared_memory_id += 1 - return ((a, b, out), template_args) @@ -1902,11 +1890,40 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a value_func=tile_matmul_value_func, dispatch_func=tile_matmul_dispatch_func, variadic=True, - doc="Compute matrix product and accumulate out += a*b, a and b will be realized before evaluation, and output must already be realized.", + doc="Compute matrix product and accumulate out += a*b.", + group="Tile Primitives", + export=False, +) + +def tile_sum_value_func(arg_types, arg_values): + + # return generic type (for doc builds) + if arg_types is None: + return None + + if len(arg_types) != 2: + raise RuntimeError("tile_sum() requires 2 positional args") + + a = arg_types["a"] + + if not is_tile(a): + raise RuntimeError("tile_sum() argument 0 must be a tile") + + return Tile(dtype=a.dtype, M=1, N=1, op="sum") + + +add_builtin( + "tile_sum", + input_types={"a": Tile, "axis": Any}, + value_func=tile_sum_value_func, + variadic=True, + doc="Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored", group="Tile Primitives", export=False, ) + + def tile_eval_value_func(arg_types, arg_values): # return generic type (for doc builds) diff --git a/warp/native/builtin.h b/warp/native/builtin.h index a899d9a7..8409b810 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1591,4 +1591,5 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect #if defined(__CUDACC_RTC__) #include "tile.h" #include "tile_gemm.h" +#include "tile_reduce.h" #endif \ No newline at end of file diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h new file mode 100644 index 00000000..f9cfd23d --- /dev/null +++ b/warp/native/tile_reduce.h @@ -0,0 +1,129 @@ +#pragma once + +#include "tile.h" + +#define WP_TILE_WARP_SIZE 32 + +namespace wp +{ + +template +inline CUDA_CALLABLE T warp_shuffle_down(T 
val, int offset) +{ + typedef unsigned int Word; + + union + { + T output; + Word output_storage; + }; + + union + { + T input; + Word input_storage; + }; + + input = val; + + Word* dest = reinterpret_cast(&output); + Word* src = reinterpret_cast(&input); + + unsigned int shuffle_word; + unsigned int mask = 0xffffffff; + + constexpr int word_count = (sizeof(T) + sizeof(Word) - 1) / sizeof(Word); + + WP_PRAGMA_UNROLL + for (int i=0; i < word_count; ++i) + { + shuffle_word = __shfl_down_sync(mask, src[i], offset, WP_TILE_WARP_SIZE); + dest[i] = shuffle_word; + } + + return output; +} + +template +inline CUDA_CALLABLE T warp_reduce(T val) +{ + T sum = val; + + for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) + { + sum += warp_shuffle_down(sum, offset); + } + + return sum; +} + + +// non-axis version which computes sum +// across the entire tile using the whole block +template +auto tile_sum(Tile& t, int axis) +{ + using T = typename Tile::Type; + + auto input = t.copy_to_register(); + auto output = tile_register_t(); + + const int warp_count = (WP_TILE_BLOCK_DIM + WP_TILE_WARP_SIZE - 1)/WP_TILE_WARP_SIZE; + const int warp_index = threadIdx.x/WP_TILE_WARP_SIZE; + const int lane_index = threadIdx.x%WP_TILE_WARP_SIZE; + + T thread_sum = input.data[0]; + + // thread reduction + WP_PRAGMA_UNROLL + for (int i=1; i < input.NumRegs; ++i) + thread_sum += input.data[i]; + + // warp reduction + T warp_sum = warp_reduce(thread_sum); + + // fixed size scratch pad for partial results + __shared__ T partials[warp_count]; + + if (lane_index == 0) + { + partials[warp_index] = warp_sum; + } + + __syncthreads(); + + // reduce across block, todo: use warp_reduce() here + if (threadIdx.x == 0) + { + T block_sum = partials[0]; + + WP_PRAGMA_UNROLL + for (int i=1; i < warp_count; ++i) + block_sum += partials[i]; + + output.data[0] = block_sum; + } + + return output; +} + +template +void adj_tile_sum(Tile& t, int axis, Tile& adj_t, int adj_axis, AdjTile& adj_ret) +{ + using T = typename Tile::Type; + + // broadcast incoming adjoint to block + __shared__ T scratch; + if (threadIdx.x == 0) + scratch = adj_ret.data[0]; + + __syncthreads(); + + auto adj_t_reg = adj_t.copy_to_register(); + auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); + + adj_t.assign(tile_add(adj_t_reg, adj_ret_reg)); +} + + +} // namespace wp \ No newline at end of file diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 90bd3d66..f6aa9188 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -320,6 +320,51 @@ def test_tile_operators(): print("operators backward passed") +@wp.kernel +def tile_sum_kernel(input: wp.array3d(dtype=float), + output: wp.array(dtype=float)): + + # output tile index + i = wp.tid() + + a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) + s = wp.tile_sum(a, axis=-1)*0.5 + wp.tile_store(output, i, 0, s) + +def test_tile_sum(): + + batch_count = 2 + + M = TILE_M + N = TILE_N + + rng = np.random.default_rng(42) + input = rng.random((batch_count, M, N), dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True) + output_wp = wp.zeros(batch_count, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_sum_kernel, dim=batch_count, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + + + for i in range(batch_count): + sum_np = np.sum(input[i])*0.5 + sum_wp = output_wp.numpy()[i] + + assert(np.allclose(sum_np, sum_wp, rtol=1.e-4)) + + print("Sum forward passed") + + output_wp.grad.fill_(1.0) + + tape.backward() + + 
assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.5, rtol=1.e-4)) + + print("Sum backward passed") + + test_tile_copy() test_tile_unary_map() @@ -327,6 +372,7 @@ def test_tile_operators(): test_tile_batched_gemm() test_tile_gemm() test_tile_operators() +test_tile_sum() # #----------------------------------------- From 670445d73326101b5b508aaed14f799f77ccaca0 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 19 Sep 2024 15:54:39 +1200 Subject: [PATCH 026/102] Add support for extracting tile elements to regular Warp types, remove axis parameter from tile_sum() --- warp/builtins.py | 119 +++++++++---------------- warp/codegen.py | 10 ++- warp/native/tile.h | 181 ++++++++++++++++++++------------------ warp/native/tile_reduce.h | 97 ++++++++++++++++++-- warp/tests/test_tile.py | 62 ++++++++++--- 5 files changed, 286 insertions(+), 183 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index e10204e5..077d6cd0 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1844,6 +1844,31 @@ def tile_store_value_func(arg_types, arg_values): ) +def tile_extract_value_func(arg_types, arg_values): + + # return generic type (for doc builds) + if arg_types is None: + return None + + if len(arg_types) != 3: + raise RuntimeError("tile_extract() requires 3 positional args") + + if not is_tile(arg_types["a"]): + raise RuntimeError("tile_extract() argument 0 must be a tile") + + return arg_types["a"].dtype + + +add_builtin( + "tile_extract", + input_types={"a": Tile(dtype=Any, M=Any, N=Any), "i": int, "j": int}, + value_func=tile_extract_value_func, + variadic=True, + doc="Extract element at index (i, j) of the tile and return the native type", + group="Tile Primitives", + export=False, +) + def tile_matmul_value_func(arg_types, arg_values): @@ -1901,8 +1926,8 @@ def tile_sum_value_func(arg_types, arg_values): if arg_types is None: return None - if len(arg_types) != 2: - raise RuntimeError("tile_sum() requires 2 positional args") + if len(arg_types) != 1: + raise RuntimeError("tile_sum() requires 1 positional args") a = arg_types["a"] @@ -1914,7 +1939,7 @@ def tile_sum_value_func(arg_types, arg_values): add_builtin( "tile_sum", - input_types={"a": Tile, "axis": Any}, + input_types={"a": Tile}, value_func=tile_sum_value_func, variadic=True, doc="Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored", @@ -1924,43 +1949,6 @@ def tile_sum_value_func(arg_types, arg_values): -def tile_eval_value_func(arg_types, arg_values): - - # return generic type (for doc builds) - if arg_types is None: - return None - - if not is_tile(arg_types["t"]): - raise RuntimeError("tile_eval() argument must be a tile") - - return TileShared(arg_types["t"]) - -def tile_eval_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): - - t = arg_values["t"] - - global shared_memory_id - - template_args = [] - template_args.append(shared_memory_id) - - # matmul makes two allocations (one for each of its arguments) - shared_memory_id += 1 - - return ((t,), template_args) - -add_builtin( - "tile_eval", - input_types={"t": Tile}, - value_func=tile_eval_value_func, - dispatch_func=tile_eval_dispatch_func, - variadic=True, - doc="Force evaluation of a tile expression into shared memory", - group="Tile Primitives", - export=False, -) - - # does type propagation for load() def tile_unary_map_value_func(arg_types, arg_values): @@ -2034,17 +2022,6 @@ def tile_binary_map_value_func(arg_types, arg_values): export=False, ) -add_builtin( - "add", - 
input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)}, - value_func=tile_binary_map_value_func, - #dispatch_func=tile_map_dispatch_func, - #variadic=True, - native_func="tile_add", - doc="Add each element of two tiles together", - group="Tile Primitives", - export=False, -) # --------------------------------- # Linear Algebra @@ -4538,35 +4515,33 @@ def tile_scalar_mul_value_func(arg_types, arg_values): return TileBinaryMap(TileConstant(x, y.M, y.N), y) - -# def tile_binary_value_func(arg_types, arg_values): - -# if arg_types is None: -# return Tile(dtype=Any, M=Any, N=Any) - -# a = arg_types[0] - - -# if not is_tile(t): -# raise RuntimeError("Expected tile for unary expression") - -# return TileUnaryMap(t.dtype, t.M, t.N) - add_builtin( "neg", input_types={"x": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_unary_value_func, - doc="", + doc="Negate each element of a tile", export=False, native_func="tile_neg", group="Operators", ) +add_builtin( + "add", + input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)}, + value_func=tile_binary_map_value_func, + #dispatch_func=tile_map_dispatch_func, + #variadic=True, + native_func="tile_add", + doc="Add each element of two tiles together", + group="Tile Primitives", + export=False, +) + add_builtin( "mul", input_types={"x": Tile(dtype=Any, M=Any, N=Any), "y": Scalar}, value_func=tile_scalar_mul_value_func, - doc="", + doc="Multiply each element of a tile by a scalar", export=False, native_func="tile_mul", group="Operators", @@ -4576,18 +4551,10 @@ def tile_scalar_mul_value_func(arg_types, arg_values): "mul", input_types={"x": Scalar, "y": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_scalar_mul_value_func, - doc="", + doc="Multiply each element of a tile by a scalar", export=False, native_func="tile_mul", group="Operators", ) -# add_builtin( -# "mul", -# input_types={"x": Tile, "s": Scalar}, -# value_func=tile_binary_value_func, -# doc="", -# group="Operators", -# ) - diff --git a/warp/codegen.py b/warp/codegen.py index 93997b07..941a8f4b 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -2187,8 +2187,8 @@ def emit_Subscript(adj, node): return var target, indices = adj.eval_subscript(node) - target_type = strip_reference(target.type) + if is_array(target_type): if len(indices) == target_type.ndim: # handles array loads (where each dimension has an index specified) @@ -2209,6 +2209,14 @@ def emit_Subscript(adj, node): out.is_read = target.is_read out.is_write = target.is_write + elif is_tile(target_type): + if len(indices) == 2: + # handles extracting a single element from a tile + out = adj.add_builtin_call("tile_extract", [target, *indices]) + else: + # handles tile views + out = adj.add_builtin_call("tile_view", [target, *indices]) + else: # handles non-array type indexing, e.g: vec3, mat33, etc out = adj.add_builtin_call("extract", [target, *indices]) diff --git a/warp/native/tile.h b/warp/native/tile.h index c3d0e965..3f3845c9 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -31,20 +31,42 @@ /* Tile Expressions -[x] Forward / Backward code-gen -[ ] wp.tile_map() +[ ] Tiles + [x] Register, Shared, Global + [ ] Layouts + [x] Simple + [ ] Cute + [ ] Remove Alloc type from tile_shared_t + +[ ] Load/Store + [ ] 1D load/store variants + [ ] max_coord option for non-aligned loads + [ ] Indexed load + [ ] wp.tile_atomic_add() +[ ] Maps [x] Support user functions [x] Support built-in functions [ ] Support for lambda functions [ ] Infer tile_map() output from operator type 
(e.g.: dot for each element) -[x] wp.tile_matmul() +[ ] Reductions + [x] Sum + [x] Forward + [x] Reverse + [ ] Min + [ ] Max + [ ] Custom +[x] MatMul [x] Forward [x] Reverse -[ ] wp.tile_atomic_add() -[ ] Support for n-d shape tiles / broadcasting / slicing / transpose? -[x] Compile-time block dimensions -[ ] Support for CUB reductions -[ ] Support for CUB sorts +[ ] Reshape + [ ] Broadcasting + [ ] Transpose + [x] Shared + [ ] Register + [ ] Slice +[ ] Runtime + [x] Compile-time block dimensions + [ ] Switch between SIMT / Tile based execution if `tile_dim` not provided to wp.launch() [ ] Examples [ ] GEMM [ ] Batched MLP @@ -216,6 +238,44 @@ struct tile_register_t data[i] = tile.data[i]; } + // extract a single tile element to a native type + inline CUDA_CALLABLE Type extract(int i, int j) + { + // map from logical coords (i, j) -> (thread, reg) + const int linear = i*N + j; + + const int thread = linear/NumRegs; + const int reg = linear%NumRegs; + + WP_TILE_SHARED Type scratch; + + if (threadIdx.x == thread) + { + scratch = data[reg]; + } + + WP_TILE_SYNC(); + + return scratch; + } + + + // backward version of scalar extract + inline CUDA_CALLABLE void adj_extract(int i, int j, Type adj_ret) + { + // map from logical coords (i, j) -> (thread, reg) + const int linear = i*N + j; + + const int thread = linear/NumRegs; + const int reg = linear%NumRegs; + + if (threadIdx.x == thread) + { + data[reg] += adj_ret; + } + } + + // return the in-register version of this tile (nop) inline CUDA_CALLABLE auto& copy_to_register() { return *this; } @@ -389,6 +449,20 @@ struct tile_shared_t data[i] = T(0); } + // extract a single tile element to a native type + inline CUDA_CALLABLE Type extract(int i, int j) + { + return (*this)(i, j); + } + + // backward of scalar extraction + inline CUDA_CALLABLE void adj_extract(int i, int j, Type adj_ret) + { + if (threadIdx.x == 0) + (*this)(i, j) += adj_ret; + } + + // copy register tile to shared inline CUDA_CALLABLE void assign(const tile_register_t& tile) { @@ -765,92 +839,25 @@ inline CUDA_CALLABLE void adj_tile_mul(const typename Tile::Type& s, Tile& a, } -} // namespace wp - -#if 0 - -//----------------------------------------------------- -// c = a + b - -// forward -auto var_0 = wp::tile_load(var_A, x, y); -auto var_1 = wp::tile_load(var_B, x, y); -auto var_2 = wp::tile_add(var_0, var_1); -wp::tile_store(var_C, x, y, var_2) - -// reverse -wp::adj_store(var_C, x, y, var_2, adj_C, _, _, adj_2) -wp::adj_tile_add(var_0, var_1, adj_0, adj_1, adj_2) -wp::adj_tile_load(var_B, x, y, adj_B, _, _, adj_1); -wp::adj_tile_load(var_B, x, y, adj_B, _, _, adj_0); - -//----------------------------------------------------- -// x = a[0] -// c = x*2.0 + x - -// forward -auto var_0 = wp::tile_load(var_A, x, y); -auto var_1 = wp::tile_mul(var_0, 2.0); -auto var_2 = wp::tile_add(var_0, var_1); -wp::tile_store(var_C, x, y, var_2) - -struct adj_store_t -{ - adj_store_t() - { - - } - - float bwd(int i, float adj_ret) - { - return array.grad[i]; - } -}; - -template -struct adj_add_t +template +typename Tile::Type tile_extract(Tile& t, int i, int j) { - adj_add_t(P& parent) - { - - } - - float bwd(int i, float& adj_a, float& adj_b) - { - // evaluate parent - float adj_ret = parent.bwd(i); + assert(i < Tile::M); + assert(j < Tile::N); - adj_a += adj_ret; - adj_b += adj_ret; - } -}; + return t.extract(i, j); +} -template -struct adj_tile +template +void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_j, typename Tile::Type adj_ret) { - adj_tile(T& parent) - { 
- - } + assert(i < Tile::M); + assert(j < Tile::N); - - -}; - -void adj_tile_load(A, x, y, adj_A, adj_x, adj_y, adj_ret) -{ - for i in A(x,y): - adj_A[i] += adj_ret(i); + adj_t.adj_extract(i, j, adj_ret); } +} // namespace wp -// reverse -wp::adj_store(var_C, x, y, var_2, adj_C, _, _, adj_2) // adj_2->adj_C -wp::adj_tile_add(var_0, var_1, adj_0, adj_1, adj_2) // adj_0->adj_2->adj_C, adj_1->adj_2->adj_C -wp::adj_tile_mul(var_0, 2.0, adj_0, _, adj_1); // adj_0->adj_1->adj_2->adj_C -wp::adj_tile_load(var_A, x, y, adj_A, _, _, adj_0); // adj_A->adj_0->adj_1->adj_2->adj_C - - -#endif \ No newline at end of file diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index f9cfd23d..5a5b4d81 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -45,7 +45,7 @@ inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset) } template -inline CUDA_CALLABLE T warp_reduce(T val) +inline CUDA_CALLABLE T warp_reduce_sum(T val) { T sum = val; @@ -57,11 +57,24 @@ inline CUDA_CALLABLE T warp_reduce(T val) return sum; } +template +inline CUDA_CALLABLE T warp_reduce(T val, Op op) +{ + T sum = val; + + for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) + { + sum = op(sum, warp_shuffle_down(sum, offset)); + } + + return sum; +} + // non-axis version which computes sum // across the entire tile using the whole block template -auto tile_sum(Tile& t, int axis) +auto tile_sum(Tile& t) { using T = typename Tile::Type; @@ -80,17 +93,18 @@ auto tile_sum(Tile& t, int axis) thread_sum += input.data[i]; // warp reduction - T warp_sum = warp_reduce(thread_sum); + T warp_sum = warp_reduce_sum(thread_sum); - // fixed size scratch pad for partial results - __shared__ T partials[warp_count]; + // fixed size scratch pad for partial results in shared memory + WP_TILE_SHARED T partials[warp_count]; if (lane_index == 0) { partials[warp_index] = warp_sum; } - __syncthreads(); + // ensure partials are ready + WP_TILE_SYNC(); // reduce across block, todo: use warp_reduce() here if (threadIdx.x == 0) @@ -108,16 +122,16 @@ auto tile_sum(Tile& t, int axis) } template -void adj_tile_sum(Tile& t, int axis, Tile& adj_t, int adj_axis, AdjTile& adj_ret) +void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret) { using T = typename Tile::Type; // broadcast incoming adjoint to block - __shared__ T scratch; + WP_TILE_SHARED T scratch; if (threadIdx.x == 0) scratch = adj_ret.data[0]; - __syncthreads(); + WP_TILE_SYNC(); auto adj_t_reg = adj_t.copy_to_register(); auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); @@ -126,4 +140,69 @@ void adj_tile_sum(Tile& t, int axis, Tile& adj_t, int adj_axis, AdjTile& adj_ret } +template +auto tile_reduce(Fwd op, Tile& t, int axis) +{ + using T = typename Tile::Type; + + auto input = t.copy_to_register(); + auto output = tile_register_t(); + + const int warp_count = (WP_TILE_BLOCK_DIM + WP_TILE_WARP_SIZE - 1)/WP_TILE_WARP_SIZE; + const int warp_index = threadIdx.x/WP_TILE_WARP_SIZE; + const int lane_index = threadIdx.x%WP_TILE_WARP_SIZE; + + T thread_sum = input.data[0]; + + // thread reduction + WP_PRAGMA_UNROLL + for (int i=1; i < input.NumRegs; ++i) + thread_sum = op(thread_sum, input.data[i]); + + // warp reduction + T warp_sum = warp_reduce(thread_sum, op); + + // fixed size scratch pad for partial results + WP_TILE_SHARED T partials[warp_count]; + + if (lane_index == 0) + { + partials[warp_index] = warp_sum; + } + + WP_TILE_SYNC(); + + // reduce across block, todo: use warp_reduce() here + if (threadIdx.x == 0) + { + T block_sum = partials[0]; + + 
WP_PRAGMA_UNROLL + for (int i=1; i < warp_count; ++i) + block_sum = op(block_sum, partials[i]); + + output.data[0] = block_sum; + } + + return output; +} + +template +void adj_tile_reduce(Tile& t, int axis, Tile& adj_t, int adj_axis, AdjTile& adj_ret) +{ + using T = typename Tile::Type; + + // broadcast incoming adjoint to block + WP_TILE_SHARED T scratch; + if (threadIdx.x == 0) + scratch = adj_ret.data[0]; + + WP_TILE_SYNC(); + + auto adj_t_reg = adj_t.copy_to_register(); + auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); + + adj_t.assign(tile_add(adj_t_reg, adj_ret_reg)); +} + } // namespace wp \ No newline at end of file diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index f6aa9188..3153ac1b 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -178,7 +178,7 @@ def tile_grouped_gemm(A: wp.array3d(dtype=float), wp.tile_store(C[i], 0, 0, sum) -def test_tile_batched_gemm(): +def test_tile_grouped_gemm(): batch_count = 56 @@ -202,7 +202,7 @@ def test_tile_batched_gemm(): C_host = C_wp.numpy() # GEMM forward passed - print("batched matmul forward passed") + print("Batched matmul forward passed") @wp.kernel @@ -253,7 +253,7 @@ def test_tile_gemm(): assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) # GEMM forward passed - print("matmul forward passed") + print("Tiled matmul forward passed") adj_C = np.ones_like(C) @@ -262,7 +262,7 @@ def test_tile_gemm(): assert(np.allclose(adj_C@B.T, A_wp.grad.numpy(), rtol=1.e-4)) assert(np.allclose(A.T@adj_C, B_wp.grad.numpy(), rtol=1.e-4)) - print("matmul backward passed") + print("Tiled matmul backward passed") @@ -309,7 +309,7 @@ def test_tile_operators(): assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) - print("operators forward passed") + print("Operators forward passed") output_wp.grad.fill_(1.0) @@ -317,7 +317,7 @@ def test_tile_operators(): assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.75, rtol=1.e-4)) - print("operators backward passed") + print("Operators backward passed") @wp.kernel @@ -328,12 +328,13 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), i = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) - s = wp.tile_sum(a, axis=-1)*0.5 + s = wp.tile_sum(a)*0.5 + wp.tile_store(output, i, 0, s) def test_tile_sum(): - batch_count = 2 + batch_count = 56 M = TILE_M N = TILE_N @@ -365,15 +366,56 @@ def test_tile_sum(): print("Sum backward passed") +@wp.kernel +def tile_extract_kernel(input: wp.array2d(dtype=float), + output: wp.array2d(dtype=float)): + + # output tile index + i = wp.tid() + + t = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) + + # perform a scalar copy, extracting each + # tile element individually + for i in range(TILE_M): + for j in range(TILE_N): + output[i,j] = t[i,j] + +def test_tile_extract(): + + M = TILE_M + N = TILE_N + + rng = np.random.default_rng(42) + input = rng.random((M, N), dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True) + output_wp = wp.zeros_like(input_wp, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_extract_kernel, dim=1, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + + assert(np.allclose(input_wp.numpy(), output_wp.numpy(), rtol=1.e-4)) + + print("Extract forward passed") + + output_wp.grad.fill_(1.0) + + tape.backward() + + assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input), rtol=1.e-4)) + + print("Extract backward passed") + test_tile_copy() test_tile_unary_map() test_tile_binary_map() -test_tile_batched_gemm() +test_tile_grouped_gemm() test_tile_gemm() 
test_tile_operators() test_tile_sum() - +test_tile_extract() # #----------------------------------------- # # center of mass computation From b57ff025c3bbef871351fd40651e48ef155d1b29 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 19 Sep 2024 16:43:29 +1200 Subject: [PATCH 027/102] Remove Alloc parameter from tile_shared_t --- warp/builtins.py | 8 ------- warp/codegen.py | 14 +++++++---- warp/native/tile.h | 49 +++++++++++++++++++-------------------- warp/native/tile_reduce.h | 2 +- warp/types.py | 21 ++++++++++++++--- 5 files changed, 52 insertions(+), 42 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 077d6cd0..fbb526fe 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1729,10 +1729,6 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar template_args.append(m.constant) template_args.append(n.constant) - global shared_memory_id - template_args.append(shared_memory_id) - shared_memory_id += 1 - return ([], template_args) @@ -1790,10 +1786,6 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg template_args.append(m) template_args.append(n) - global shared_memory_id - template_args.append(shared_memory_id) - shared_memory_id += 1 - return ((array, x, y), template_args) diff --git a/warp/codegen.py b/warp/codegen.py index 941a8f4b..5bbaef68 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -2991,7 +2991,9 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"): if var.ctype() == "auto": continue - if var.constant is None: + if is_tile(var.type): + lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit()};\n"] + elif var.constant is None: lines += [f"{var.ctype()} {var.emit()};\n"] else: lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"] @@ -3027,8 +3029,10 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): for var in adj.variables: - if var.constant is None: - lines += [f"{var.ctype()} {var.emit()};\n"] + if is_tile(var.type): + lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit()};\n"] + elif var.constant is None: + lines += [f"{var.ctype()} {var.emit()};\n"] else: lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"] @@ -3040,8 +3044,8 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): name = var.emit_adj() ctype = var.ctype(value_type=True) - if is_tile(var.type) and var.type.storage == "shared": - lines += [f"{ctype} {name} = {{0}};\n"] + if is_tile(var.type): + lines += [f"{ctype} {name} = {var.type.cinit(adjoint=True)};\n"] else: lines += [f"{ctype} {name} = {{}};\n"] diff --git a/warp/native/tile.h b/warp/native/tile.h index 3f3845c9..6a65481e 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -130,17 +130,6 @@ struct coord_t }; -template -inline CUDA_CALLABLE T* tile_alloc_shared() -{ - WP_TILE_SHARED __align__(16) T data[M*N]; - - for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) - data[i] = T(0); - - return data; -} - // represents a tile stored in global memory with dynamic strides // only used to represent the source for tile loads to register/shared template @@ -339,15 +328,14 @@ struct tile_register_t -template +template struct tile_shared_t { using Type = T; static constexpr int M = M_; static constexpr int N = N_; static constexpr int Size = M*N; - static constexpr int Alloc = Alloc_; - + static constexpr int StrideM = StrideM_; static constexpr int StrideN = StrideN_; @@ -358,16 +346,8 @@ struct tile_shared_t // default initialization (non-initialized) 
inline CUDA_CALLABLE tile_shared_t() { - data = tile_alloc_shared(); } - // zero initialization, handles adj_tile = {0} syntax - inline CUDA_CALLABLE tile_shared_t(int nil) - { - data = tile_alloc_shared(); - zero(); - } - // initialize from an existing tile's memory inline CUDA_CALLABLE tile_shared_t(T* smem) : data(smem) { @@ -569,18 +549,37 @@ struct tile_shared_t } }; +// helpers to allocate shared tiles +template +inline CUDA_CALLABLE auto tile_alloc_empty() +{ + WP_TILE_SHARED __align__(16) T data[M*N]; + return tile_shared_t(data); +} + +template +inline CUDA_CALLABLE auto tile_alloc_zeros() +{ + WP_TILE_SHARED __align__(16) T data[M*N]; + + for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + data[i] = T(0); + + return tile_shared_t(data); +} + template inline CUDA_CALLABLE auto tile_transpose(Tile& t) { // alias incoming tile - return tile_shared_t(t.data); + return tile_shared_t(t.data); } //----------------------------------------------------------------------------------------------------- // High level entry points for each op (correspond to one Warp builtin) -template +template inline CUDA_CALLABLE auto tile_zeros() { // tile variable assignment operator will handle initialization @@ -589,7 +588,7 @@ inline CUDA_CALLABLE auto tile_zeros() // entry point for load -template +template inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) { // just return a ref. to the global memory diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index 5a5b4d81..1f618f6d 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -134,7 +134,7 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret) WP_TILE_SYNC(); auto adj_t_reg = adj_t.copy_to_register(); - auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); + auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); adj_t.assign(tile_add(adj_t_reg, adj_ret_reg)); } diff --git a/warp/types.py b/warp/types.py index f990a49e..ec36adc3 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2875,16 +2875,31 @@ def __init__(self, dtype, M, N, op=None, storage="register"): self.op = op self.storage = storage + # generates C-type string def ctype(self): from warp.codegen import Var if self.storage == "register": return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + elif self.storage == "shared": + return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + + # generates C-initializer string + def cinit(self, adjoint=False): + from warp.codegen import Var + + if self.storage == "register": + return f"{0}" elif self.storage == "shared": + + if adjoint: + # backward pass requires zeroed memory + return f"wp::tile_alloc_zeros<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" + else: + # forward mode can be uninitialized until first used by the kernel + return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" - # every shared memory tile will create a new static shared memory allocation - # this just needs to be a unique-id for templated allocation functions - return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>" + # generate a unique allocation index for shared memory @classmethod From 1e039fd3fa4bae7e52ed4dbc2b4b4f9a0172ecf9 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 24 Sep 2024 16:10:03 +1200 Subject: [PATCH 028/102] Modify the way tiled kernels are launched, this change makes it so the block dimension is inserted as an optional 
additional launch dimension. This makes it so regular Warp kernels behavior is unchanged, and they can still use tile*() primitives. --- warp/codegen.py | 32 ++++---------------------------- warp/context.py | 16 +++++++++------- warp/tape.py | 10 +++++----- warp/tests/test_tile.py | 30 +++++++++++++++--------------- 4 files changed, 33 insertions(+), 55 deletions(-) diff --git a/warp/codegen.py b/warp/codegen.py index 5bbaef68..fb8dcce2 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -2714,38 +2714,14 @@ def get_constant_references(adj) -> Dict[str, Any]: """ -# cuda_kernel_template = """ - -# extern "C" __global__ void {name}_cuda_kernel_forward( -# {forward_args}) -# {{ -# for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); -# _idx < dim.size; -# _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) -# {{ -# {forward_body} }} -# }} - -# extern "C" __global__ void {name}_cuda_kernel_backward( -# {reverse_args}) -# {{ -# for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); -# _idx < dim.size; -# _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) -# {{ -# {reverse_body} }} -# }} - -# """ - cuda_kernel_template = """ extern "C" __global__ void {name}_cuda_kernel_forward( {forward_args}) {{ - for (size_t _idx = static_cast(blockIdx.x); + for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); _idx < dim.size; - _idx += static_cast(gridDim.x)) + _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) {{ {forward_body} }} }} @@ -2753,9 +2729,9 @@ def get_constant_references(adj) -> Dict[str, Any]: extern "C" __global__ void {name}_cuda_kernel_backward( {reverse_args}) {{ - for (size_t _idx = static_cast(blockIdx.x); + for (size_t _idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + static_cast(threadIdx.x); _idx < dim.size; - _idx += static_cast(gridDim.x)) + _idx += static_cast(blockDim.x) * static_cast(gridDim.x)) {{ {reverse_body} }} }} diff --git a/warp/context.py b/warp/context.py index 95f36afb..9d13059e 100644 --- a/warp/context.py +++ b/warp/context.py @@ -4638,7 +4638,7 @@ def launch( record_tape=True, record_cmd=False, max_blocks=0, - tile_size=0, + block_dim=0, ): """Launch a Warp kernel on the target device @@ -4658,7 +4658,7 @@ def launch( record_cmd: When True the launch will be returned as a ``Launch`` command object, the launch will not occur until the user calls ``cmd.launch()`` max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches. If negative or zero, the maximum hardware value will be used. 
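For reference, the launch convention this change introduces (mirroring the updated tests later in this patch) looks roughly like the following sketch, with hypothetical sizes:

import numpy as np
import warp as wp

TILE_M, TILE_N, TILE_DIM = 8, 4, 64
M, N = 4 * TILE_M, 4 * TILE_N

@wp.kernel
def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
    # the trailing launch dimension indexes threads within the block and is unused here
    i, j, _ = wp.tid()
    a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N)
    wp.tile_store(B, i, j, a)

A_wp = wp.array(np.ones((M, N), dtype=np.float32))
B_wp = wp.zeros((M, N), dtype=float)

# the per-block thread count appears both as the extra launch dimension and as block_dim
wp.launch(tile_copy, dim=[M // TILE_M, N // TILE_N, TILE_DIM],
          inputs=[A_wp, B_wp], block_dim=TILE_DIM)

Kernels launched without the extra dimension keep their existing behavior, which is the point of this change.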
- tile_size: The number of threads per-program instance + block_dim: The number of threads per-block """ init() @@ -4713,7 +4713,7 @@ def pack_args(args, params, adjoint=False): # delay load modules, including new overload if needed module = kernel.module - if not module.load(device, tile_size): + if not module.load(device, block_dim): return # late bind @@ -4760,7 +4760,7 @@ def pack_args(args, params, adjoint=False): ) runtime.core.cuda_launch_kernel( - device.context, hooks.backward, bounds.size, max_blocks, tile_size, kernel_params, stream.cuda_stream + device.context, hooks.backward, bounds.size, max_blocks, block_dim, kernel_params, stream.cuda_stream ) else: @@ -4783,7 +4783,7 @@ def pack_args(args, params, adjoint=False): else: # launch runtime.core.cuda_launch_kernel( - device.context, hooks.forward, bounds.size, max_blocks, tile_size, kernel_params, stream.cuda_stream + device.context, hooks.forward, bounds.size, max_blocks, block_dim, kernel_params, stream.cuda_stream ) try: @@ -4797,7 +4797,7 @@ def pack_args(args, params, adjoint=False): # record file, lineno, func as metadata frame = inspect.currentframe().f_back caller = {"file": frame.f_code.co_filename, "lineno": frame.f_lineno, "func": frame.f_code.co_name} - runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, tile_size, metadata={"caller": caller}) + runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, block_dim, metadata={"caller": caller}) # detect illegal inter-kernel read/write access patterns if verification flag is set if warp.config.verify_autograd_array_access: @@ -5348,7 +5348,9 @@ def type_str(t): elif typing.get_origin(t) in (List, Mapping, Sequence, Union, Tuple): args_repr = ", ".join(type_str(x) for x in typing.get_args(t)) return f"{t.__name__}[{args_repr}]" - + elif warp.types.is_tile(t): + return "Tile" + return t.__name__ diff --git a/warp/tape.py b/warp/tape.py index 15aebf81..9905a2cd 100644 --- a/warp/tape.py +++ b/warp/tape.py @@ -129,7 +129,7 @@ def backward(self, loss: wp.array = None, grads: dict = None): inputs = launch[3] outputs = launch[4] device = launch[5] - tile_size = launch[6] + block_dim = launch[6] adj_inputs = [] adj_outputs = [] @@ -152,14 +152,14 @@ def backward(self, loss: wp.array = None, grads: dict = None): device=device, adjoint=True, max_blocks=max_blocks, - tile_size=tile_size + block_dim=block_dim ) # record a kernel launch on the tape - def record_launch(self, kernel, dim, max_blocks, inputs, outputs, device, tile_size=0, metadata=None): + def record_launch(self, kernel, dim, max_blocks, inputs, outputs, device, block_dim=0, metadata=None): if metadata is None: metadata = {} - self.launches.append([kernel, dim, max_blocks, inputs, outputs, device, tile_size, metadata]) + self.launches.append([kernel, dim, max_blocks, inputs, outputs, device, block_dim, metadata]) def record_func(self, backward, arrays): """ @@ -614,7 +614,7 @@ def emit_kernel_launch_node( self.array_grad_stats.insert(0, grad_stats) -Launch = namedtuple("Launch", ["id", "kernel", "dim", "max_blocks", "inputs", "outputs", "device", "tile_size", "metadata"]) +Launch = namedtuple("Launch", ["id", "kernel", "dim", "max_blocks", "inputs", "outputs", "device", "block_dim", "metadata"]) RepeatedSequence = namedtuple("RepeatedSequence", ["start", "end", "repetitions"]) diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 3153ac1b..e1bfd21b 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -24,7 +24,7 @@ def tile_copy(A: 
wp.array2d(dtype=float), B: wp.array2d(dtype=float)): # tile index - i, j = wp.tid() + i, j, _ = wp.tid() a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N) wp.tile_store(B, i, j, a) @@ -44,7 +44,7 @@ def test_tile_copy(): B_wp = wp.array(B, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_copy, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=TILE_DIM) + wp.launch(tile_copy, dim=[int(M/TILE_M), int(N/TILE_N), TILE_DIM], inputs=[A_wp, B_wp], block_dim=TILE_DIM) # verify forward pass assert(np.allclose(A, B_wp.numpy(), rtol=1.e-4)) @@ -66,7 +66,7 @@ def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # tile index - i, j = wp.tid() + i, j, _ = wp.tid() a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N) @@ -91,7 +91,7 @@ def test_tile_unary_map(): B_wp = wp.zeros_like(A_wp, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp], tile_size=TILE_DIM) + wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N), TILE_DIM], inputs=[A_wp, B_wp], block_dim=TILE_DIM) # verify forward pass assert(np.allclose(B, B_wp.numpy(), atol=1.e-4)) @@ -115,7 +115,7 @@ def tile_binary_map(input_a: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # tile index - i, j = wp.tid() + i, j, _= wp.tid() a = wp.tile_load(input_a, i, j, m=TILE_M, n=TILE_N) b = wp.tile_load(input_b, i, j, m=TILE_M, n=TILE_N) @@ -144,7 +144,7 @@ def test_tile_binary_map(): C_wp = wp.zeros_like(A_wp, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_binary_map, dim=[int(M/TILE_M), int(N/TILE_N)], inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM) + wp.launch(tile_binary_map, dim=[int(M/TILE_M), int(N/TILE_N), TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) # verify forward pass assert(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) @@ -196,7 +196,7 @@ def test_tile_grouped_gemm(): C_wp = wp.array(C, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_grouped_gemm, dim=batch_count, inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM) + wp.launch(tile_grouped_gemm, dim=[batch_count, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) # bring back to host C_host = C_wp.numpy() @@ -211,7 +211,7 @@ def tile_gemm(A: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): # output tile index - i, j = wp.tid() + i, j, _= wp.tid() sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) @@ -248,7 +248,7 @@ def test_tile_gemm(): C_wp = wp.array(C, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=TILE_DIM) + wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N), TILE_DIM), inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) @@ -271,7 +271,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)): # output tile index - i = wp.tid() + i, _ = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) @@ -305,7 +305,7 @@ def test_tile_operators(): output_wp = wp.zeros_like(input_wp, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_operators, dim=batch_count, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + wp.launch(tile_operators, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) @@ -325,7 +325,7 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index - i = wp.tid() + i, _ = 
wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) s = wp.tile_sum(a)*0.5 @@ -346,7 +346,7 @@ def test_tile_sum(): output_wp = wp.zeros(batch_count, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_sum_kernel, dim=batch_count, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + wp.launch(tile_sum_kernel, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) for i in range(batch_count): @@ -371,7 +371,7 @@ def tile_extract_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # output tile index - i = wp.tid() + i, _ = wp.tid() t = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) @@ -393,7 +393,7 @@ def test_tile_extract(): output_wp = wp.zeros_like(input_wp, requires_grad=True) with wp.Tape() as tape: - wp.launch(tile_extract_kernel, dim=1, inputs=[input_wp, output_wp], tile_size=TILE_DIM) + wp.launch(tile_extract_kernel, dim=[1, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) assert(np.allclose(input_wp.numpy(), output_wp.numpy(), rtol=1.e-4)) From 394b2b036ad7ab54ef1336bedd0cc322716c99e3 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 24 Sep 2024 16:18:48 +1200 Subject: [PATCH 029/102] Fix for shared memory race condition when extracting elements from register tiles --- warp/native/tile.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/warp/native/tile.h b/warp/native/tile.h index 6a65481e..e7808f41 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -238,11 +238,15 @@ struct tile_register_t WP_TILE_SHARED Type scratch; + // ensure any prevoiusly scheduled threads have finished reading from scratch + WP_TILE_SYNC(); + if (threadIdx.x == thread) { scratch = data[reg]; } + // ensure extraction thread has updated smem WP_TILE_SYNC(); return scratch; From 3ac843167216126fe92fd7c310692b921b8c154b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 24 Sep 2024 16:29:51 +1200 Subject: [PATCH 030/102] Fix for regular Warp kernel code-gen on CPU, set default block_dim=256 explicitly --- warp/codegen.py | 12 ++++++------ warp/context.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/warp/codegen.py b/warp/codegen.py index fb8dcce2..6eeba5fd 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -2625,10 +2625,10 @@ def get_constant_references(adj) -> Dict[str, Any]: #define int(x) cast_int(x) #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret) -#define builtin_tid1d() wp::tid(_idx) -#define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim) -#define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim) -#define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim) +#define builtin_tid1d() wp::tid(task_index) +#define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim) +#define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, dim) +#define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim) """ @@ -2761,7 +2761,7 @@ def get_constant_references(adj) -> Dict[str, Any]: WP_API void {name}_cpu_forward( {forward_args}) {{ - for (size_t _idx = 0; _idx < dim.size; ++_idx) + for (size_t task_index = 0; task_index < dim.size; ++task_index) {{ {name}_cpu_kernel_forward( {forward_params}); @@ -2771,7 +2771,7 @@ def get_constant_references(adj) -> Dict[str, Any]: WP_API void {name}_cpu_backward( {reverse_args}) {{ - for (size_t _idx = 0; _idx < dim.size; ++_idx) + for (size_t task_index = 0; task_index < dim.size; ++task_index) {{ {name}_cpu_kernel_backward( {reverse_params}); diff --git a/warp/context.py b/warp/context.py index 9d13059e..59356599 100644 --- 
a/warp/context.py +++ b/warp/context.py @@ -4638,7 +4638,7 @@ def launch( record_tape=True, record_cmd=False, max_blocks=0, - block_dim=0, + block_dim=256, ): """Launch a Warp kernel on the target device From 38a27d45352f277fdcd45ec3945a03db53094839 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Tue, 24 Sep 2024 17:40:18 -0700 Subject: [PATCH 031/102] MathDx support in Warp + Tile --- build_lib.py | 2 +- examples/tile_fft.py | 34 +++++ examples/tile_matmul.py | 35 +++++ warp/build.py | 18 ++- warp/build_dll.py | 8 +- warp/builtins.py | 249 ++++++++++++++++++++++++++++++ warp/codegen.py | 28 ++-- warp/context.py | 67 ++++++++- warp/native/tile.h | 52 +++++++ warp/native/warp.cpp | 2 +- warp/native/warp.cu | 326 ++++++++++++++++++++++++++++++++++++++-- warp/native/warp.h | 4 +- warp/tests/test_tile.py | 77 ++++++++++ warp/types.py | 4 +- 14 files changed, 867 insertions(+), 39 deletions(-) create mode 100644 examples/tile_fft.py create mode 100644 examples/tile_matmul.py diff --git a/build_lib.py b/build_lib.py index 436c9cde..3827c4cd 100644 --- a/build_lib.py +++ b/build_lib.py @@ -52,7 +52,7 @@ parser.set_defaults(fast_math=False) parser.add_argument("--quick", action="store_true", help="Only generate PTX code, disable CUTLASS ops") -parser.set_defaults(quick=True) +parser.set_defaults(quick=False) parser.add_argument("--build_llvm", action="store_true", help="Build Clang/LLVM compiler from source, default disabled") parser.add_argument("--no_build_llvm", dest="build_llvm", action="store_false") diff --git a/examples/tile_fft.py b/examples/tile_fft.py new file mode 100644 index 00000000..f6cf23f9 --- /dev/null +++ b/examples/tile_fft.py @@ -0,0 +1,34 @@ +import numpy as np +import warp as wp +import numpy as np + +wp.init() +wp.set_module_options({"enable_backward": False}) +wp.set_device("cuda:0") +wp.build.clear_kernel_cache() + +BLOCK_DIM = 8 +TILE_M = 1 +TILE_N = 32 + +@wp.kernel +def fft_tiled(x: wp.array2d(dtype=wp.vec2d), + y: wp.array2d(dtype=wp.vec2d)): + + i, j, _ = wp.tid() + a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N) + wp.tile_fft_dx(a) + wp.tile_ifft_dx(a) + wp.tile_store(y, i, j, a) + + +x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64) +x_h[:,:,1] = 0 +y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64) +x_wp = wp.array2d(x_h, dtype=wp.vec2d) +y_wp = wp.array2d(y_h, dtype=wp.vec2d) + +wp.launch(fft_tiled, dim=[1, 1, BLOCK_DIM], inputs=[x_wp, y_wp], block_dim=BLOCK_DIM) + +print("inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...] +print("output:\n", y_wp) # [32+0i, 0, 0, ...] 
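Warp has no native complex dtype, so the example above packs each complex value into a wp.vec2d as a (real, imaginary) pair. A minimal sketch of the NumPy view round-trip this layout assumes, analogous to what the tile FFT test later in this patch does with complex64 and vec2f:

import numpy as np

M, N = 1, 32  # TILE_M, TILE_N above

x_c = np.ones((M, N), dtype=np.complex128)              # complex input
x_pairs = x_c.view(np.float64).reshape(M, N, 2)         # interleaved (real, imag) pairs for wp.vec2d
x_back = x_pairs.reshape(M, 2 * N).view(np.complex128)  # recover the complex view

assert np.allclose(x_c, x_back)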
diff --git a/examples/tile_matmul.py b/examples/tile_matmul.py new file mode 100644 index 00000000..3d980592 --- /dev/null +++ b/examples/tile_matmul.py @@ -0,0 +1,35 @@ +import numpy as np +import warp as wp + +wp.init() +wp.build.clear_kernel_cache() + +BLOCK_DIM = 32 +M, N, K = 4, 8, 16 + +@wp.kernel +def matmul_tiled(ga: wp.array2d(dtype=wp.float64), + gb: wp.array2d(dtype=wp.float64), + gc: wp.array2d(dtype=wp.float64)): + + i, j, _ = wp.tid() + a = wp.tile_load(ga, i, j, m=M, n=K) + b = wp.tile_load(gb, i, j, m=K, n=N) + c = wp.tile_zeros(m=M, n=N, dtype=wp.float64) + wp.tile_matmul_dx(a, b, c) + wp.tile_store(gc, i, j, c) + + +A = np.ones((M, K), dtype=np.float64) +B = 3 * np.ones((K, N), dtype=np.float64) +C = np.zeros((M, N), dtype=np.float64) + +A_wp = wp.array2d(A, dtype=wp.float64) +B_wp = wp.array2d(B, dtype=wp.float64) +C_wp = wp.array2d(C, dtype=wp.float64) + +wp.launch(matmul_tiled, dim=[1, 1, BLOCK_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=BLOCK_DIM) +wp.synchronize() + +print("inputs:\n", A, '\n', B) +print("output (should be = 48 * np.ones(4, 8)):\n", C_wp) diff --git a/warp/build.py b/warp/build.py index 7eee8e29..024e5ebc 100644 --- a/warp/build.py +++ b/warp/build.py @@ -9,22 +9,36 @@ import warp.config from warp.thirdparty import appdirs +import ctypes +def get_mathdx_include_dirs(): + return (os.environ['MATHDX_HOME'] + '/include').encode("utf-8") + +def get_cuda_include_dirs(): + cuda_inc_path = (os.environ['CUDA_HOME'] + '/include').encode("utf-8") + include_dirs = [cuda_inc_path] + arr_include_dirs = (ctypes.c_char_p * len(include_dirs))() + arr_include_dirs[:] = include_dirs + return arr_include_dirs # builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension) -def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False): +def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, ltoirs=[]): with open(cu_path, "rb") as src_file: src = src_file.read() cu_path = cu_path.encode("utf-8") inc_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "native").encode("utf-8") output_path = output_path.encode("utf-8") + cuda_include_dirs = get_cuda_include_dirs() if warp.config.llvm_cuda: warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False) else: + num_ltoirs = len(ltoirs) + arr_lroirs = (ctypes.c_char_p * num_ltoirs)(*ltoirs) + arr_lroir_sizes = (ctypes.c_size_t * num_ltoirs)(*[len(l) for l in ltoirs]) err = warp.context.runtime.core.cuda_compile_program( - src, arch, inc_path, config == "debug", warp.config.verbose, verify_fp, fast_math, output_path + src, arch, inc_path, len(cuda_include_dirs), cuda_include_dirs, config == "debug", warp.config.verbose, verify_fp, fast_math, output_path, num_ltoirs, arr_lroirs, arr_lroir_sizes ) if err != 0: raise Exception(f"CUDA kernel build failed with error code {err}") diff --git a/warp/build_dll.py b/warp/build_dll.py index 6810d9c7..cecfc105 100644 --- a/warp/build_dll.py +++ b/warp/build_dll.py @@ -292,6 +292,8 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None run_cmd(link_cmd) else: + libmathdx_home = os.environ['LIBMATHDX_HOME'] + libmathdx_includes = f'-I{libmathdx_home}/include' cpp_includes = f' -I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"' cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"' cuda_includes = f' -I"{cuda_home}/include"' if cu_path else "" @@ 
-330,17 +332,17 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None cu_out = cu_path + ".o" if mode == "debug": - cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"' elif mode == "release": - cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"' with ScopedTimer("build_cuda", active=args.verbose): run_cmd(cuda_cmd) ld_inputs.append(quote(cu_out)) ld_inputs.append( - f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lpthread -ldl -lrt' + f'-L"{cuda_home}/lib64" -L{libmathdx_home}/lib -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lnvJitLink_static -lpthread -ldl -lrt -lmathdx_static' ) if sys.platform == "darwin": diff --git a/warp/builtins.py b/warp/builtins.py index fbb526fe..5aa0cee6 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5,12 +5,16 @@ # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. 
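The build changes above assume several environment variables point at local SDK installs: warp/build.py reads CUDA_HOME and MATHDX_HOME, and warp/build_dll.py additionally reads LIBMATHDX_HOME. A sketch with hypothetical paths:

import os

os.environ.setdefault("CUDA_HOME", "/usr/local/cuda")            # CUDA toolkit headers
os.environ.setdefault("MATHDX_HOME", "/opt/nvidia/mathdx")       # MathDx (cuFFTDx/cuBLASDx) headers
os.environ.setdefault("LIBMATHDX_HOME", "/opt/nvidia/libmathdx") # libmathdx headers and static library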
import builtins +import tempfile +import functools +import os from typing import Any, Callable, Mapping, Sequence from warp.codegen import Reference, Var, strip_reference from warp.types import * from .context import add_builtin +from .build import get_cuda_include_dirs, get_mathdx_include_dirs def seq_check_equal(seq_1, seq_2): @@ -4550,3 +4554,248 @@ def tile_scalar_mul_value_func(arg_types, arg_values): ) +## +## MathDx, LTOIR-based, Tile functions +## + +## +## Matmul +## +def tile_matmul_generic_value_func(arg_types, arg_values): + + # return generic type (for doc builds) + if arg_types is None: + return None + + if len(arg_types) != 3: + raise RuntimeError("tile_matmul() requires 4 positional args") + + if not is_tile(arg_types["a"]): + raise RuntimeError("tile_matmul() argument 0 must be a tile") + + if not is_tile(arg_types["b"]): + raise RuntimeError("tile_matmul() argument 1 must be an tile") + + if not isinstance(arg_types["out"], Tile): + raise RuntimeError("tile_matmul() output argument must be a tile") + + if arg_types["out"].storage != "shared": + raise RuntimeError("tile_matmul() output argument must have shared memory storage") + + + return None + +def tile_matmul_generic_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any]): + + a = arg_values["a"] + b = arg_values["b"] + out = arg_values["out"] + + if any(not is_tile(arg.type) for arg in [a, b, out]): + raise RuntimeError(f"tile_matmul() requires three Tile arguments") + + if any(arg.type.dtype not in [float16, float32, float64, vec2h, vec2f, vec2d] for arg in [a, b, out]): + raise RuntimeError(f"tile_matmul() arguments must be tiles of float16, float32 or float64, vec2h, vec2f, vec2d entries") + + if any(arg.type.dtype != out.type.dtype for arg in [a, b]): + raise RuntimeError(f"tile_matmul() arguments must have the same type") + + if (a.type.N != b.type.M) or (a.type.M != out.type.M) or (b.type.N != out.type.N): + raise RuntimeError(f"tile_matmul(A, B, C) requires sizes of A, B and C to be consistent for a matmul") + + # set the storage type to the inputs to shared + a.type.storage = "shared" + b.type.storage = "shared" + out.type.storage = "shared" + template_args = [] + + # Real + if out.type.dtype == float16: + dtype = "wp::float16" + precision = 2 # COMMONDX_PRECISION_F16 + element_type = 0 # CUBLASDX_TYPE_REAL + elif out.type.dtype == float32: + dtype = "wp::float32" + precision = 3 # COMMONDX_PRECISION_F32 + element_type = 0 # CUBLASDX_TYPE_REAL + elif out.type.dtype == float64: + dtype = "wp::float64" + precision = 4 # COMMONDX_PRECISION_F64 + element_type = 0 # CUBLASDX_TYPE_REAL + # Complex + elif out.type.dtype == vec2h: + dtype = "wp::vec2h" + precision = 2 # COMMONDX_PRECISION_F16 + element_type = 1 # CUBLASDX_TYPE_COMPLEX + elif out.type.dtype == vec2f: + dtype = "wp::vec2f" + precision = 3 # COMMONDX_PRECISION_F32 + element_type = 1 # CUBLASDX_TYPE_COMPLEX + elif out.type.dtype == vec2d: + dtype = "wp::vec2d" + precision = 4 # COMMONDX_PRECISION_F64 + element_type = 1 # CUBLASDX_TYPE_COMPLEX + else: + raise RuntimeError("Unsupported datatype") + + # generate the LTO + M, K = a.type.M, a.type.N + _, N = b.type.M, b.type.N + num_threads = options['tile_size'] + arch = options['output_arch'] + + def make_function(M, N, K, tA, tB): + # Warp follows Numpy: matrices are row-major + # But cuBLASDx follows BLAS: matrices are col-major + # So we have to flip M <-> N and A <-> B + def make_transpose(t): + if t == 'N': + return 0 # 
CUBLASDX_TRANSPOSE_MODE_NON_TRANSPOSED + elif t == 'T': + return 1 # CUBLASDX_TRANSPOSE_MODE_TRANSPOSED + raise RuntimeError("Invalid transpose mode") + lto_symbol = f"dot_{M}_{N}_{K}_{tA}_{tB}_{precision}_{element_type}" + lto_code = tempfile.NamedTemporaryFile() + include_dirs = get_cuda_include_dirs() + result = warp.context.runtime.core.cuda_compile_dot( + lto_code.name.encode("utf-8"), lto_symbol.encode("utf-8"), + len(include_dirs), include_dirs, get_mathdx_include_dirs(), + arch, N, M, K, precision, element_type, make_transpose(tB), make_transpose(tA), num_threads) + if not result: + raise RuntimeError("Failed to compile tile_matmul") + else: + with open(lto_code.name, 'rb') as f: + lto_code = f.read() + return lto_symbol, lto_code + + (fun_forward, lto_forward) = make_function(M, N, K, 'N', 'N') # C += A * B + (fun_backward_A, lto_backward_A) = make_function(M, K, N, 'N', 'T') # adjA += adjC * B^T + (fun_backward_B, lto_backward_B) = make_function(K, N, M, 'T', 'N') # adjB += A^T * adjC + + return ((Var(fun_forward, str, False, True, False), + Var(fun_backward_A, str, False, True, False), + Var(fun_backward_B, str, False, True, False), + Var(dtype, str, False, True, False), + a, + b, + out), + template_args, + [lto_forward, lto_backward_A, lto_backward_B]) + +add_builtin( + "tile_matmul_dx", + input_types={"a": Tile, "b": Tile, "out": Tile}, + value_func=tile_matmul_generic_value_func, + lto_dispatch_func=tile_matmul_generic_dispatch_func, + variadic=True, + doc="Compute matrix product and accumulate out += a*b.", + group="Tile Primitives", + export=False, + namespace="", +) + +## +## FFT +## +def tile_fft_generic_value_func(arg_types, arg_values): + + if arg_types is None: + return None + + if len(arg_types) != 1: + raise RuntimeError("tile_fft() requires 1 positional args") + + if not is_tile(arg_types["inout"]): + raise RuntimeError("tile_fft() argument 0 must be a tile") + + if arg_types["inout"].storage != "register": + raise RuntimeError("tile_fft() input/output argument must have register memory storage") + + return None + +def tile_fft_generic_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any], direction:str = None): + + inout = arg_values["inout"] + inout.type.storage = "register" + + if (not is_tile(inout.type)): + raise RuntimeError(f"tile_fft() arguments must be a single tile with register storage") + + if (inout.type.dtype not in [vec2f, vec2d]): + raise RuntimeError(f"tile_fft() argument must be a tile of vec2f or vec2d (interpreted as complex) entries") + + # see libcufftdx.hpp + if direction == 'forward': + dir = 0 # CUFFTDX_DIRECTION_FORWARD + elif direction == 'inverse': + dir = 1 # CUFFTDX_DIRECTION_INVERSE + else: + raise RuntimeError("Invalid direction") + + if inout.type.dtype == vec2f: + dtype = "wp::vec2f" + precision = 3 # COMMONDX_PRECISION_F32 + elif inout.type.dtype == vec2d: + dtype = "wp::vec2d" + precision = 4 # COMMONDX_PRECISION_F64 + else: + raise RuntimeError("Unsupported datatype") + + # M FFTs of size N each + batch, size = inout.type.M, inout.type.N + num_threads = options['tile_size'] + arch = options['output_arch'] + ept = size // num_threads + lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}" + + lto_code = tempfile.NamedTemporaryFile() + shared_memory_size = ctypes.c_int(0) + + include_dirs = get_cuda_include_dirs() + + result = warp.context.runtime.core.cuda_compile_fft( + lto_code.name.encode("utf-8"), + lto_symbol.encode("utf-8"), + len(include_dirs), 
include_dirs, + get_mathdx_include_dirs(), + arch, size, ept, dir, precision, ctypes.byref(shared_memory_size) + ) + + if not result: + raise RuntimeError("Failed to compile tile_matmul") + + with open(lto_code.name, 'rb') as f: + lto_code = f.read() + + return ((Var(lto_symbol, str, False, True, False), + Var(dtype, str, False, True, False), + Var(str(shared_memory_size.value), str, False, True, False), + Var(str(batch), str, False, True, False), + Var(str(ept), str, False, True, False), + inout), + [], + [lto_code]) + +add_builtin( + "tile_fft_dx", + input_types={"inout": Tile}, + value_func=tile_fft_generic_value_func, + lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction='forward'), + variadic=True, + doc="Compute the FFT along the second dimension of a 2D tile of data.", + group="Tile Primitives", + export=False, + namespace="", +) + +add_builtin( + "tile_ifft_dx", + input_types={"inout": Tile}, + value_func=tile_fft_generic_value_func, + lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction='inverse'), + variadic=True, + doc="Compute the inverse FFT along the second dimension of a 2D tile of data.", + group="Tile Primitives", + export=False, + namespace="", +) diff --git a/warp/codegen.py b/warp/codegen.py index 6eeba5fd..7336ac5e 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -865,6 +865,9 @@ def __init__( # for unit testing errors being spit out from kernels. adj.skip_build = False + # Collect the LTOIR required at link-time + adj.ltoirs = [] + # generate function ssa form and adjoint def build(adj, builder, default_builder_options=None): # arg Var read/write flags are held during module rebuilds, so we reset here even when skipping a build @@ -901,6 +904,9 @@ def build(adj, builder, default_builder_options=None): # used to generate new label indices adj.label_count = 0 + # collect ltoirs + adj.ltoirs = [] + # update symbol map for each argument for a in adj.args: adj.symbols[a.label] = a @@ -926,6 +932,8 @@ def build(adj, builder, default_builder_options=None): elif isinstance(a.type, warp.types.array) and isinstance(a.type.dtype, Struct): builder.build_struct_recursive(a.type.dtype) + builder.ltoirs.extend(adj.ltoirs) + # code generation methods def format_template(adj, template, input_vars, output_var): # output var is always the 0th index @@ -1227,15 +1235,17 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): bound_arg_values, ) - if func.dispatch_func is not None: - # If we have a built-in that requires special handling to dispatch - # the arguments to the underlying C++ function, then we can resolve - # these using the `dispatch_func`. Since this is only called from - # within codegen, we pass it directly `codegen.Var` objects, - # which allows for some more advanced resolution to be performed, - # for example by checking whether an argument corresponds to - # a literal value or references a variable. - + # If we have a built-in that requires special handling to dispatch + # the arguments to the underlying C++ function, then we can resolve + # these using the `dispatch_func`. Since this is only called from + # within codegen, we pass it directly `codegen.Var` objects, + # which allows for some more advanced resolution to be performed, + # for example by checking whether an argument corresponds to + # a literal value or references a variable. 
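The new lto_dispatch_func hook handled below follows the same idea, but additionally returns the LTO-IR blobs that must be linked into the module. Schematically, for a hypothetical builtin (the LTO-IR value here is only a placeholder for the output of an offline compile step such as libmathdx):

def my_lto_dispatch_func(arg_types, return_type, args, options):
    # options carries the builder options, e.g. options["tile_size"] and options["output_arch"]
    func_args = tuple(args.values())  # codegen.Var objects passed to the underlying C++ function
    template_args = []
    ltoir = b"..."                    # placeholder: LTO-IR blob to link at module build time
    return func_args, template_args, [ltoir]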
+ if func.lto_dispatch_func is not None: + func_args, template_args, ltoirs = func.lto_dispatch_func(func.input_types, return_type, bound_args, options=adj.builder_options) + adj.ltoirs.extend(ltoirs) + elif func.dispatch_func is not None: func_args, template_args = func.dispatch_func(func.input_types, return_type, bound_args) else: func_args = tuple(bound_args.values()) diff --git a/warp/context.py b/warp/context.py index 59356599..a07b9f1d 100644 --- a/warp/context.py +++ b/warp/context.py @@ -66,6 +66,7 @@ def __init__( value_func=None, export_func=None, dispatch_func=None, + lto_dispatch_func=None, module=None, variadic=False, initializer_list_func=None, @@ -101,6 +102,7 @@ def __init__( self.value_func = value_func # a function that takes a list of args and a list of templates and returns the value type, e.g.: load(array, index) returns the type of value being loaded self.export_func = export_func self.dispatch_func = dispatch_func + self.lto_dispatch_func = lto_dispatch_func self.input_types = {} self.export = export self.doc = doc @@ -1012,6 +1014,7 @@ def add_builtin( value_func=None, export_func=None, dispatch_func=None, + lto_dispatch_func=None, doc="", namespace="wp::", variadic=False, @@ -1052,6 +1055,9 @@ def add_builtin( The arguments returned must be of type `codegen.Var`. If not provided, all arguments passed by the users when calling the built-in are passed as-is as runtime arguments to the C++ function. + lto_dispatch_func (Callable): Same as dispatch_func, but takes an 'option' dict + as extra argument (indicating tile_size and target architecture) and returns + an LTO-IR buffer as extra return value doc (str): Used to generate the Python's docstring and the HTML documentation. namespace: Namespace for the underlying C++ function. variadic (bool): Whether the function declares variadic arguments. 
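One detail worth spelling out for the tile_matmul_dx dispatch earlier in this patch: reading a row-major (NumPy-style) matrix as column-major yields its transpose, and since (AB)^T = B^T A^T, computing the row-major product C = A * B with a column-major BLAS routine amounts to swapping the operands and the M/N extents. A quick NumPy check of the identity:

import numpy as np

rng = np.random.default_rng(0)
A = rng.random((4, 16))  # (M, K)
B = rng.random((16, 8))  # (K, N)

# swapping operands and transposing reproduces the row-major product
assert np.allclose(A @ B, (B.T @ A.T).T)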
@@ -1190,6 +1196,7 @@ def initializer_list_func(args, return_type): value_type=return_type, export_func=export_func, dispatch_func=dispatch_func, + lto_dispatch_func=lto_dispatch_func, doc=doc, namespace=namespace, variadic=variadic, @@ -1212,6 +1219,7 @@ def initializer_list_func(args, return_type): value_func=value_func, export_func=export_func, dispatch_func=dispatch_func, + lto_dispatch_func=lto_dispatch_func, variadic=variadic, initializer_list_func=initializer_list_func, export=export, @@ -1296,6 +1304,7 @@ def __init__(self, module, options): self.options = options self.module = module self.deferred_functions = [] + self.ltoirs = [] # build all functions declared in the module for func in module.functions.values(): @@ -1750,6 +1759,9 @@ def load(self, device, tile_size=0) -> bool: output_arch = device.arch output_name = f"module_codegen.sm{output_arch}.cubin" + # Some of the Tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch + self.options["output_arch"] = output_arch + # final object binary path binary_path = os.path.join(module_dir, output_name) @@ -1828,6 +1840,7 @@ def load(self, device, tile_size=0) -> bool: config=self.options["mode"], fast_math=self.options["fast_math"], verify_fp=warp.config.verify_fp, + ltoirs=builder.ltoirs, ) except Exception as e: @@ -3069,17 +3082,55 @@ def __init__(self): self.core.cuda_graph_destroy.restype = ctypes.c_bool self.core.cuda_compile_program.argtypes = [ - ctypes.c_char_p, - ctypes.c_int, - ctypes.c_char_p, - ctypes.c_bool, - ctypes.c_bool, - ctypes.c_bool, - ctypes.c_bool, - ctypes.c_char_p, + ctypes.c_char_p, # cuda_src + ctypes.c_int, # arch + ctypes.c_char_p, # include_dir + ctypes.c_int, # num_cuda_include_dirs + ctypes.POINTER(ctypes.c_char_p), # cuda include dirs + ctypes.c_bool, # debug + ctypes.c_bool, # verbose + ctypes.c_bool, # verify_fp + ctypes.c_bool, # fast_math + ctypes.c_char_p, # output_path + ctypes.c_size_t, # num_ltoirs + ctypes.POINTER(ctypes.c_char_p), # ltoirs + ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes ] self.core.cuda_compile_program.restype = ctypes.c_size_t + self.core.cuda_compile_fft.argtypes = [ + ctypes.c_char_p, # lto + ctypes.c_char_p, # function name + ctypes.c_int, # num include dirs + ctypes.POINTER(ctypes.c_char_p), # include dirs + ctypes.c_char_p, # mathdx include dir + ctypes.c_int, # arch + ctypes.c_int, # size + ctypes.c_int, # ept + ctypes.c_int, # direction + ctypes.c_int, # precision + ctypes.POINTER(ctypes.c_int) # smem (out) + ] + self.core.cuda_compile_fft.restype = ctypes.c_bool + + self.core.cuda_compile_dot.argtypes = [ + ctypes.c_char_p, # lto + ctypes.c_char_p, # function name + ctypes.c_int, # num include dirs + ctypes.POINTER(ctypes.c_char_p), # include dirs + ctypes.c_char_p, # mathdx include dir + ctypes.c_int, # arch + ctypes.c_int, # M + ctypes.c_int, # N + ctypes.c_int, # K + ctypes.c_int, # precision + ctypes.c_int, # type + ctypes.c_int, # tA + ctypes.c_int, # tB + ctypes.c_int # num threads + ] + self.core.cuda_compile_dot.restype = ctypes.c_bool + self.core.cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p] self.core.cuda_load_module.restype = ctypes.c_void_p diff --git a/warp/native/tile.h b/warp/native/tile.h index e7808f41..10ea0830 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -861,6 +861,58 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ adj_t.adj_extract(i, j, adj_ret); } +// But cuBLASDx follows the BLAS convention: matrices are col-major, so we swap A & B in the code 
below + +#define tile_matmul_dx(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C) \ + do { \ + void fun_forward(dtype, dtype*, dtype*, dtype, dtype*); \ + WP_TILE_SYNC(); \ + fun_forward(dtype(1.0), B.data, A.data, dtype(1.0), C.data); \ + WP_TILE_SYNC(); \ + } while (0) + +// adj_fun_forward, adj_fun_backward_A, adj_fun_backward_B, adj_dtype are in practice ignored +// but are here because builtins.py creates them even though those are effectively compile time constants +#define adj_tile_matmul_dx(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C, \ + adj_fun_forward, adj_fun_backward_A, adj_fun_backward_B, adj_dtype, \ + adjA, adjB, adjC) \ + do { \ + void fun_backward_A(dtype, dtype*, dtype*, dtype, dtype*); \ + void fun_backward_B(dtype, dtype*, dtype*, dtype, dtype*); \ + WP_TILE_SYNC(); \ + fun_backward_A(dtype(1.0), B.data, adjC.data, dtype(1.0), adjA.data); \ + fun_backward_B(dtype(1.0), adjC.data, A.data, dtype(1.0), adjB.data); \ + WP_TILE_SYNC(); \ + } while (0) + +#define tile_fft_dx(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \ + do { \ + void function_name(dtype*, dtype*); \ + WP_TILE_SHARED __align__(16) char buffer[shared_memory_size]; \ + WP_TILE_SYNC(); \ + for(int b = 0; b < (int)batch_size; b++) { \ + function_name(Xinout.data + (int)b * (int)ept, (dtype*)buffer); \ + WP_TILE_SYNC(); \ + } \ + } while (0) + +#define tile_ifft_dx tile_fft_dx + +// adj_function_name, adj_dtype, adj_shared_memory_size, adj_batch_size, adj_ept are all ignored + +#define adj_tile_fft_dx(function_name, dtype, shared_memory_size, batch_size, ept, Xinout, \ + adj_function_name, adj_dtype, adj_shared_memory_size, adj_batch_size, adj_ept, \ + adj_Xinout) \ + do { \ + tile_ifft_dx(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ + } while (0) + +#define adj_tile_ifft_dx(function_name, dtype, shared_memory_size, batch_size, ept, Xinout, \ + adj_function_name, adj_dtype, adj_shared_memory_size, adj_batch_size, adj_ept, \ + adj_Xinout) \ + do { \ + tile_fft_dx(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ + } while (0) } // namespace wp diff --git a/warp/native/warp.cpp b/warp/native/warp.cpp index b7ad19a3..697e4dcf 100644 --- a/warp/native/warp.cpp +++ b/warp/native/warp.cpp @@ -1031,7 +1031,7 @@ WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; } WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; } -WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_file) { return 0; } +WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { return 0; } WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; } WP_API void cuda_unload_module(void* context, void* module) {} diff --git a/warp/native/warp.cu b/warp/native/warp.cu index 30c8f512..8268ae3b 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -11,9 +11,14 @@ #include "cuda_util.h" #include "error.h" +#include +#include #include +#include #include +#include +#include #include #include #include @@ -23,8 +28,39 @@ #include #include +#define check_any(result) 
(check_generic(result, __FILE__, __LINE__)) #define check_nvrtc(code) (check_nvrtc_result(code, __FILE__, __LINE__)) #define check_nvptx(code) (check_nvptx_result(code, __FILE__, __LINE__)) +#define check_nvjitlink(handle, code) (check_nvjitlink_result(handle, code, __FILE__, __LINE__)) +#define check_cufftdx(code) (check_cufftdx_result(code, __FILE__, __LINE__)) +#define check_cublasdx(code) (check_cublasdx_result(code, __FILE__, __LINE__)) +#define CHECK_ANY(code) \ +{ \ + do { \ + bool out = (check_any(code)); \ + if(!out) { \ + return out; \ + } \ + } while(0); \ +} +#define CHECK_CUFFTDX(code) \ +{ \ + do { \ + bool out = (check_cufftdx(code)); \ + if(!out) { \ + return out; \ + } \ + } while(0); \ +} +#define CHECK_CUBLASDX(code) \ +{ \ + do { \ + bool out = (check_cufftdx(code)); \ + if(!out) { \ + return out; \ + } \ + } while(0); \ +} bool check_nvrtc_result(nvrtcResult result, const char* file, int line) { @@ -74,6 +110,54 @@ bool check_nvptx_result(nvPTXCompileResult result, const char* file, int line) return false; } +bool check_nvjitlink_result(nvJitLinkHandle handle, nvJitLinkResult result, const char* file, int line) +{ + if (result != NVJITLINK_SUCCESS) { + fprintf(stderr, "nvJitLink error: %d on %s:%d\n", (int)result, file, line); + size_t lsize; + result = nvJitLinkGetErrorLogSize(handle, &lsize); + if (result == NVJITLINK_SUCCESS && lsize > 0) { + std::vector log(lsize); + result = nvJitLinkGetErrorLog(handle, log.data()); + if (result == NVJITLINK_SUCCESS) { + fprintf(stderr, "%s\n", log.data()); + } + } + return false; + } else { + return true; + } +} + +bool check_cufftdx_result(commonDxStatusType result, const char* file, int line) +{ + if (result != commonDxStatusType::COMMONDX_SUCCESS) { + fprintf(stderr, "libmathdx cuFFTDx error: %d on %s:%d\n", (int)result, file, line); + return false; + } else { + return true; + } +} + +bool check_cublasdx_result(commonDxStatusType result, const char* file, int line) +{ + if (result != commonDxStatusType::COMMONDX_SUCCESS) { + fprintf(stderr, "libmathdx cuBLASDx error: %d on %s:%d\n", (int)result, file, line); + return false; + } else { + return true; + } +} + +bool check_generic(int result, const char* file, int line) +{ + if (!result) { + fprintf(stderr, "Error %d on %s:%d\n", (int)result, file, line); + return false; + } else { + return true; + } +} struct DeviceInfo { @@ -2508,11 +2592,134 @@ bool cuda_graph_destroy(void* context, void* graph_exec) return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec)); } -size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path) +bool write_file(const char* data, size_t size, std::string filename, const char* mode) +{ + const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr); + if (print_debug) + { + printf("Writing %zu B to %s (%s)\n", size, filename.c_str(), mode); + } + FILE* file = fopen(filename.c_str(), mode); + if (file) + { + if (fwrite(data, 1, size, file) != size) { + fprintf(stderr, "Warp error: Failed to write to output file '%s'\n", filename.c_str()); + return false; + } + fclose(file); + return true; + } + else + { + fprintf(stderr, "Warp error: Failed to open file '%s'\n", filename.c_str()); + return false; + } +} + +bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, 
int* shared_memory_size) +{ + + CHECK_ANY(ltoir_output_path != nullptr); + CHECK_ANY(symbol_name != nullptr); + CHECK_ANY(mathdx_include_dir != nullptr); + CHECK_ANY(shared_memory_size != nullptr); + CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); + + bool res = true; + cufftdxHandle h; + CHECK_CUFFTDX(cufftDxCreate(&h)); + + // CUFFTDX_API_BLOCK_LMEM means each thread starts with a subset of the data + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_API, cufftDxApi::CUFFTDX_API_BLOCK_LMEM)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SIZE, (long long)size)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_DIRECTION, (cufftDxDirection)direction)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SM, (long long)(arch * 10))); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_ELEMENTS_PER_THREAD, (long long)(elements_per_thread))); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_FFTS_PER_BLOCK, 1)); + + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); + for(int dir = 0; dir < num_include_dirs; dir++) + { + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); + } + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); + + size_t lto_size = 0; + CHECK_CUFFTDX(cufftDxGetLTOIRSize(h, <o_size)); + + std::vector lto(lto_size); + CHECK_CUFFTDX(cufftDxGetLTOIR(h, lto.size(), lto.data())); + + long long int smem = 0; + CHECK_CUFFTDX(cufftDxGetTraitInt64(h, cufftDxTraitType::CUFFTDX_TRAIT_SHARED_MEMORY_SIZE, &smem)); + *shared_memory_size = (int)smem; + + if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) { + res = false; + } + + CHECK_CUFFTDX(cufftDxDestroy(h)); + + return res; +} + +bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads) +{ + + CHECK_ANY(ltoir_output_path != nullptr); + CHECK_ANY(symbol_name != nullptr); + CHECK_ANY(mathdx_include_dir != nullptr); + CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); + + bool res = true; + cublasdxHandle h; + CHECK_CUBLASDX(cublasDxCreate(&h)); + + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_FUNCTION, cublasDxFunction::CUBLASDX_FUNCTION_MM)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_API, cublasDxApi::CUBLASDX_API_BLOCK_SMEM)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SM, (long long)(arch * 10))); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TYPE, (cublasDxType)type)); + std::array block_dim = {num_threads, 
1, 1}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data())); + std::array size = {M, N, K}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SIZE, size.size(), size.data())); + std::array transpose_mode = {(cublasDxTransposeMode_t)tA, (cublasDxTransposeMode_t)tB}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TRANSPOSE_MODE, transpose_mode.size(), transpose_mode.data())); + + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); + for(int dir = 0; dir < num_include_dirs; dir++) + { + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); + } + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/cublasdx/include").c_str())); + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/../external/cutlass/include").c_str())); + + size_t lto_size = 0; + CHECK_CUBLASDX(cublasDxGetLTOIRSize(h, <o_size)); + + std::vector lto(lto_size); + CHECK_CUBLASDX(cublasDxGetLTOIR(h, lto.size(), lto.data())); + + if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) { + res = false; + } + + CHECK_CUBLASDX(cublasDxDestroy(h)); + + return res; +} + +size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { // use file extension to determine whether to output PTX or CUBIN const char* output_ext = strrchr(output_path, '.'); bool use_ptx = output_ext && strcmp(output_ext + 1, "ptx") == 0; + const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr); // check include dir path len (path + option) const int max_path = 4096 + 16; @@ -2522,17 +2729,37 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ return size_t(-1); } + if (print_debug) + { + // Not available in all nvJitLink versions + // unsigned major = 0; + // unsigned minor = 0; + // nvJitLinkVersion(&major, &minor); + // printf("nvJitLink version %d.%d\n", major, minor); + int major = 0; + int minor = 0; + nvrtcVersion(&major, &minor); + printf("NVRTC version %d.%d\n", major, minor); + } + char include_opt[max_path]; strcpy(include_opt, "--include-path="); strcat(include_opt, include_dir); const int max_arch = 128; char arch_opt[max_arch]; + char arch_opt_lto[max_arch]; if (use_ptx) + { snprintf(arch_opt, max_arch, "--gpu-architecture=compute_%d", arch); + snprintf(arch_opt_lto, max_arch, "-arch=compute_%d", arch); + } else + { snprintf(arch_opt, max_arch, "--gpu-architecture=sm_%d", arch); + snprintf(arch_opt_lto, max_arch, "-arch=sm_%d", arch); + } std::vector opts; opts.push_back(arch_opt); @@ -2561,13 +2788,22 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ sprintf(include_cutlass, "--include-path=%s/cutlass/include", include_dir); opts.push_back(include_cutlass); - //opts.push_back("--include-path=_build/target-deps/cuda/include"); - opts.push_back("--include-path=C:\\packman-repo\\chk\\cuda\\11.8.0_522.06-abe3d9d7-windows-x86_64\\include"); + std::vector 
cuda_include_opt; + for(int i = 0; i < num_cuda_include_dirs; i++) + { + cuda_include_opt.push_back(std::string("--include-path=") + cuda_include_dirs[i]); + opts.push_back(cuda_include_opt.back().c_str()); + } opts.push_back("--device-as-default-execution-space"); opts.push_back("--extra-device-vectorization"); opts.push_back("--restrict"); + if (num_ltoirs > 0) + { + opts.push_back("-dlto"); + opts.push_back("--relocatable-device-code=true"); + } nvrtcProgram prog; nvrtcResult res; @@ -2583,6 +2819,13 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ if (!check_nvrtc(res)) return size_t(res); + if (print_debug) + { + printf("NVRTC options:\n"); + for(auto o: opts) { + printf("%s\n", o); + } + } res = nvrtcCompileProgram(prog, int(opts.size()), opts.data()); if (!check_nvrtc(res) || verbose) @@ -2612,7 +2855,12 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ nvrtcResult (*get_output_size)(nvrtcProgram, size_t*); nvrtcResult (*get_output_data)(nvrtcProgram, char*); const char* output_mode; - if (use_ptx) + if(num_ltoirs > 0) { + get_output_size = nvrtcGetLTOIRSize; + get_output_data = nvrtcGetLTOIR; + output_mode = "wb"; + } + else if (use_ptx) { get_output_size = nvrtcGetPTXSize; get_output_data = nvrtcGetPTX; @@ -2634,19 +2882,73 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ res = get_output_data(prog, output.data()); if (check_nvrtc(res)) { - FILE* file = fopen(output_path, output_mode); - if (file) + + // LTOIR case - need an extra step + if (num_ltoirs > 0) { - if (fwrite(output.data(), 1, output_size, file) != output_size) + nvJitLinkHandle handle; + std::vector lopts = {"-dlto", arch_opt_lto}; + if (use_ptx) { + lopts.push_back("-ptx"); + } + if (print_debug) + { + printf("nvJitLink options:\n"); + for(auto o: lopts) { + printf("%s\n", o); + } + } + if(!check_nvjitlink(handle, nvJitLinkCreate(&handle, lopts.size(), lopts.data()))) { - fprintf(stderr, "Warp error: Failed to write output file '%s'\n", output_path); res = nvrtcResult(-1); } - fclose(file); + // Links + if(std::getenv("WARP_DUMP_LTOIR")) + { + write_file(output.data(), output.size(), "nvrtc_output.ltoir", "wb"); + } + if(!check_nvjitlink(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, output.data(), output.size(), "nvrtc_output"))) // NVRTC business + { + res = nvrtcResult(-1); + } + for(size_t ltoidx = 0; ltoidx < num_ltoirs; ltoidx++) + { + if(std::getenv("WARP_DUMP_LTOIR")) + { + write_file(ltoirs[ltoidx], ltoir_sizes[ltoidx], std::string("lto_online_") + std::to_string(ltoidx) + ".ltoir", "wb"); + } + if(!check_nvjitlink(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, ltoirs[ltoidx], ltoir_sizes[ltoidx], "lto_online"))) // External LTOIR + { + res = nvrtcResult(-1); + } + } + if(!check_nvjitlink(handle, nvJitLinkComplete(handle))) + { + res = nvrtcResult(-1); + } + else + { + if(use_ptx) + { + size_t ptx_size = 0; + check_nvjitlink(handle, nvJitLinkGetLinkedPtxSize(handle, &ptx_size)); + std::vector ptx(ptx_size); + check_nvjitlink(handle, nvJitLinkGetLinkedPtx(handle, ptx.data())); + output = ptx; + } + else + { + size_t cubin_size = 0; + check_nvjitlink(handle, nvJitLinkGetLinkedCubinSize(handle, &cubin_size)); + std::vector cubin(cubin_size); + check_nvjitlink(handle, nvJitLinkGetLinkedCubin(handle, cubin.data())); + output = cubin; + } + } + check_nvjitlink(handle, nvJitLinkDestroy(&handle)); } - else - { - fprintf(stderr, "Warp error: Failed to open output file '%s'\n", output_path); + + 
if(!write_file(output.data(), output.size(), output_path, output_mode)) { res = nvrtcResult(-1); } } diff --git a/warp/native/warp.h b/warp/native/warp.h index 52f67664..a089a6c8 100644 --- a/warp/native/warp.h +++ b/warp/native/warp.h @@ -307,7 +307,9 @@ extern "C" WP_API bool cuda_graph_launch(void* graph, void* stream); WP_API bool cuda_graph_destroy(void* context, void* graph); - WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_file); + WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size); + WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads); + WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); WP_API void* cuda_load_module(void* context, const char* ptx); WP_API void cuda_unload_module(void* context, void* module); diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index e1bfd21b..cab691ac 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -407,6 +407,81 @@ def test_tile_extract(): print("Extract backward passed") +@wp.kernel() +def tile_matmul_dx_kernel(ga: wp.array2d(dtype=wp.float64), + gb: wp.array2d(dtype=wp.float64), + gc: wp.array2d(dtype=wp.float64)): + i, j, _ = wp.tid() + a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K) + b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N) + c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64) + wp.tile_matmul_dx(a, b, c) + wp.tile_store(gc, i, j, c) + +def test_tile_matmul_dx(): + + rng = np.random.default_rng(42) + + A = rng.random((TILE_M, TILE_K), dtype=np.float64) + B = rng.random((TILE_K, TILE_N), dtype=np.float64) + C = np.zeros((TILE_M, TILE_N), dtype=np.float64) + + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.array(B, requires_grad=True) + C_wp = wp.array(C, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_matmul_dx_kernel, dim=[1, 1, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) + + # verify forward pass + assert(np.allclose(A @ B, C_wp.numpy(), rtol=1.e-4)) + + print("Matmul (Dx) forward passed") + + adj_C = np.ones_like(C) + + tape.backward(grads={C_wp: wp.array(adj_C)}) + + assert(np.allclose(adj_C@B.T, A_wp.grad.numpy(), rtol=1.e-4)) + assert(np.allclose(A.T@adj_C, B_wp.grad.numpy(), rtol=1.e-4)) + + print("Matmul (Dx) backward passed") + +N_FFT = 128 + +@wp.kernel() +def tile_fft_dx_kernel(gx: wp.array2d(dtype=wp.vec2f), + gy: wp.array2d(dtype=wp.vec2f)): + i, j, _ = wp.tid() + xy = wp.tile_load(gx, i, j, m=N_FFT, n=N_FFT) + wp.tile_fft_dx(xy) + wp.tile_store(gy, i, j, xy) + +def test_tile_fft_dx(): + + rng = np.random.default_rng(42) + + # Warp doesn't really have a complex64 type, + # so we use 2 float32 to represent a single complex64 number and then convert it to vec2f + + X = rng.random((N_FFT, 2*N_FFT), dtype=np.float32) + Y = np.zeros_like(X) + + X_wp = wp.array2d(X, requires_grad=True, 
dtype=wp.vec2f) + Y_wp = wp.array2d(Y, requires_grad=True, dtype=wp.vec2f) + + X_c64 = X.view(np.complex64).reshape(N_FFT, N_FFT) + Y_c64 = np.fft.fft(X_c64, axis=-1) + + with wp.Tape() as tape: + wp.launch(tile_fft_dx_kernel, dim=[1, 1, TILE_DIM], inputs=[X_wp, Y_wp], block_dim=TILE_DIM) + + Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) + assert(np.allclose(Y_c64, Y_wp_c64, rtol=1.e-4)) + + print("FFT (Dx) forward passed") + + # TODO: implement and test backward pass test_tile_copy() test_tile_unary_map() @@ -416,6 +491,8 @@ def test_tile_extract(): test_tile_operators() test_tile_sum() test_tile_extract() +test_tile_matmul_dx() +test_tile_fft_dx() # #----------------------------------------- # # center of mass computation diff --git a/warp/types.py b/warp/types.py index ec36adc3..6cc00d90 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2888,8 +2888,8 @@ def ctype(self): def cinit(self, adjoint=False): from warp.codegen import Var - if self.storage == "register": - return f"{0}" + if self.storage == "register": + return self.ctype() + "(0.0)" elif self.storage == "shared": if adjoint: From f0d9abfec26bea3c34c0358dd5d945cfb3dd9f34 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 25 Sep 2024 07:54:26 +0000 Subject: [PATCH 032/102] Add support for constructing tiles from thread local values using wp.tile() Add support for wp.tile_atomic_add() to global memory Add support for wp.print() on tiles --- warp/builtins.py | 74 ++++++++++++++++++- warp/codegen.py | 6 ++ warp/native/tile.h | 127 +++++++++++++++++++++++++++++++-- warp/tests/test_tile_reduce.py | 122 +++++++++++++++++++++++++++++++ 4 files changed, 320 insertions(+), 9 deletions(-) create mode 100644 warp/tests/test_tile_reduce.py diff --git a/warp/builtins.py b/warp/builtins.py index 23742e97..3b74d18d 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1712,7 +1712,7 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str # return generic type (for doc builds) if arg_types is None: - return array_t(shape=(Any, Any), dtype=Scalar) + return Tile(dtype=Any, M=Any, N=Any) # if len(arg_types) > 0: # raise RuntimeError("tile_zero() args must be passed by keyword") @@ -1759,7 +1759,7 @@ def tile_load_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: - return array_t(shape=(Any, Any), dtype=Scalar) + return Tile(dtype=Any, M=Any, N=Any) # if len(arg_types) != 3: # raise RuntimeError("tile_load() requires 3 positional args") @@ -1841,7 +1841,75 @@ def tile_store_value_func(arg_types, arg_values): input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, value_func=tile_store_value_func, variadic=True, - doc="Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)", + doc="Store tile `t` to an array `a` at offset `(i=x*m, j=y*n)`", + group="Tile Primitives", + export=False, +) + +def tile_atomic_add_value_func(arg_types, arg_values): + + # return generic type (for doc builds) + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) + + if len(arg_types) != 4: + raise RuntimeError("tile_atomic_add() requires 4 positional args") + + if not is_array(arg_types["a"]): + raise RuntimeError("tile_atomic_add() argument 0 must be an array") + + if not type_is_int(arg_types["x"]): + raise RuntimeError("tile_atomic_add() argument 1 must be an integer") + + if not type_is_int(arg_types["y"]): + raise RuntimeError("tile_atomic_add() argument 2 must be an integer") + + if not is_tile(arg_types["t"]): + raise 
RuntimeError("tile_atomic_add() argument 3 must be a tile") + + if arg_types["a"].dtype != arg_types["t"].dtype: + raise RuntimeError("tile_atomic_add() tile dtype and array dtype must match") + + return Tile(dtype=arg_types["t"].dtype, + M=arg_types["t"].M, + N=arg_types["t"].N) + + + +add_builtin( + "tile_atomic_add", + input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, + value_func=tile_atomic_add_value_func, + variadic=True, + doc="Atomically add a tile `t` worth of data to array `a` at offset `(i=x*m, j=y*n)`", + group="Tile Primitives", + export=False, +) + + +def tile_value_func(arg_types, arg_values): + + # return generic type (for doc builds) + if arg_types is None: + return Tile + + if len(arg_types) != 1: + raise RuntimeError("tile() requires 1 positional arg") + + # todo: we need a way to pass things like current compiler options + # into the value_func, for now we use a single global options dictionary + # we should ideally pass in the Adjoint object if it exists + + return Tile(dtype=arg_types["x"], M=1, N=warp.codegen.options["block_dim"], op="Tile") + + + +add_builtin( + "tile", + input_types={"x": Any}, + value_func=tile_value_func, + variadic=True, + doc="Construct a Tile from a per-thread kernel value, returns a tile with dimensions of `(1, block_dim)` where block_dim is the number of threads specified in `wp.launch()`", group="Tile Primitives", export=False, ) diff --git a/warp/codegen.py b/warp/codegen.py index e5a21a83..fc2da1f6 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -23,6 +23,9 @@ import warp.config from warp.types import * +# used as a globally accessible copy +# of current compile options (block_dim) etc +options = {} class WarpCodegenError(RuntimeError): def __init__(self, message): @@ -917,6 +920,9 @@ def build(adj, builder, default_builder_options=None): else: adj.builder_options = default_builder_options + global options + options = adj.builder_options + adj.symbols = {} # map from symbols to adjoint variables adj.variables = [] # list of local variables (in order) diff --git a/warp/native/tile.h b/warp/native/tile.h index 10ea0830..7563e0d9 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -268,6 +268,8 @@ struct tile_register_t } } + inline CUDA_CALLABLE void print(); + // return the in-register version of this tile (nop) inline CUDA_CALLABLE auto& copy_to_register() { return *this; } @@ -327,7 +329,6 @@ struct tile_register_t data[i] = ptr[c.i*stride_i + c.j*stride_j]; } } - }; @@ -466,15 +467,17 @@ struct tile_shared_t inline CUDA_CALLABLE void print() { + WP_TILE_SYNC(); + if (threadIdx.x == 0) { - printf("["); + printf("Tile(M=%d, N=%d, storage=shared) = [\n", M, N); for (int i=0; i < M; ++i) { printf("%*s[", i>0, ""); for (int j=0; j < N; ++j) { - printf("%5.2f ", (*this)(i, j)); + printf("%g ", double((*this)(i, j))); } if (i == M-1) @@ -553,6 +556,52 @@ struct tile_shared_t } }; +template +void tile_register_t::print() +{ + // create a temporary shared tile so that + // we can print it deterministically + WP_TILE_SHARED T smem[M*N]; + + tile_shared_t scratch(smem); + scratch.assign(*this); + + WP_TILE_SYNC(); + + if (threadIdx.x == 0) + { + printf("Tile(M=%d, N=%d, storage=register) = [\n", M, N); + for (int i=0; i < M; ++i) + { + printf("%*s[", i>0, ""); + for (int j=0; j < N; ++j) + { + printf("%g ", double(scratch(i, j))); + } + + if (i == M-1) + printf("]]\n"); + else + printf("]\n"); + } + } + + WP_TILE_SYNC(); +} + +template +inline CUDA_CALLABLE void print(Tile& t) +{ + t.print(); +} + +template +inline 
CUDA_CALLABLE void adj_print(Tile& t, AdjTile& a) +{ + a.print(); +} + + // helpers to allocate shared tiles template inline CUDA_CALLABLE auto tile_alloc_empty() @@ -579,14 +628,44 @@ inline CUDA_CALLABLE auto tile_transpose(Tile& t) return tile_shared_t(t.data); } - //----------------------------------------------------------------------------------------------------- // High level entry points for each op (correspond to one Warp builtin) +// construct a tile from a local SIMT value (one per-thread) +template +inline CUDA_CALLABLE auto tile(const T& x) +{ + tile_register_t result; + + // code-gen should have set the tile to + // have exactly the block dimension so + // there is exactly one value per-thread + static_assert(result.NumRegs == 1); + + result.data[0] = x; + return result; +} + +// construct a tile from a local SIMT value (one per-thread) +template +inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, const AdjTile& adj_ret) +{ + static_assert(AdjTile::M == 1); + static_assert(AdjTile::N == WP_TILE_BLOCK_DIM); + + // code-gen should have set the tile to + // have exactly the block dimension so + // there is exactly one value per-thread + static_assert(AdjTile::NumRegs == 1); + + adj_x += adj_ret.data[0]; +} + +// zero initialized tile template inline CUDA_CALLABLE auto tile_zeros() { - // tile variable assignment operator will handle initialization + // tile variable assignment operator will handle initialization (since lhs could be shared/register tile) return T(0.0); } @@ -609,6 +688,35 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int x, int y, Tile& src) src.copy_to_global(dest, x, y); } +// entry point for store +template +inline CUDA_CALLABLE auto tile_atomic_add(array_t& dest, int x, int y, Tile& src) +{ + auto src_reg = src.copy_to_register(); + + const int tile_i = x*src_reg.M; + const int tile_j = y*src_reg.N; + + tile_register_t previous; + + WP_PRAGMA_UNROLL + for (int i=0; i < src_reg.NumRegs; ++i) + { + // handle case where tile size is not + // aligned to block dimensions + int linear = src_reg.index(i); + if (!src_reg.Aligned && linear >= src_reg.Size) + break; + + coord_t c = src_reg.coord(linear); + previous.data[i] = atomic_add(dest, tile_i + c.i, tile_j + c.j, src_reg.data[i]); + } + + return previous; +} + + + //------------------------------------- // Adjoints @@ -674,9 +782,16 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t } // store adjoint back to tile - adj_t.assign(adj_reg); + adj_t.assign(adj_reg); } +template +inline CUDA_CALLABLE void adj_tile_atomic_add(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, AdjTile& adj_t, AdjRet& adj_ret) +{ + adj_tile_store(dest, x, y, t, adj_dest, adj_x, adj_y, adj_t); +} + + // unary map template inline CUDA_CALLABLE auto tile_map(Fwd op, diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py new file mode 100644 index 00000000..a71e08d3 --- /dev/null +++ b/warp/tests/test_tile_reduce.py @@ -0,0 +1,122 @@ +import numpy as np +import warp as wp + +wp.init() +wp.set_module_options({"enable_backward": True}) +wp.set_device("cuda:0") +wp.set_module_options({"fast_math": True}) +#wp.config.mode = "debug" +#wp.config.verify_cuda = True + +wp.build.clear_kernel_cache() + +TILE_M = wp.constant(8) +TILE_N = wp.constant(4) +TILE_K = wp.constant(8) + +# num threads per-tile +TILE_DIM = 64 + + +@wp.kernel +def tile_sum_kernel(input: wp.array3d(dtype=float), + output: wp.array(dtype=float)): + + # output tile index + i, _ = 
wp.tid() + + a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) + s = wp.tile_sum(a)*0.5 + + wp.tile_store(output, i, 0, s) + +def test_tile_sum(): + + batch_count = 56 + + M = TILE_M + N = TILE_N + + rng = np.random.default_rng(42) + input = rng.random((batch_count, M, N), dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True) + output_wp = wp.zeros(batch_count, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_sum_kernel, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) + + + for i in range(batch_count): + sum_np = np.sum(input[i])*0.5 + sum_wp = output_wp.numpy()[i] + + assert(np.allclose(sum_np, sum_wp, rtol=1.e-4)) + + print("Sum forward passed") + + output_wp.grad.fill_(1.0) + + tape.backward() + + assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.5, rtol=1.e-4)) + + print("Sum backward passed") + + + +@wp.kernel +def tile_reduce_1d_kernel(output: wp.array(dtype=int)): + + # output tile index + i = wp.tid() + + t = wp.tile(i) # convert to block wide tile + s = wp.tile_sum(t) # sum over block + + # update global sum + wp.tile_atomic_add(output, i, 0, s) + +def test_tile_reduce_1d(): + + N = int(TILE_DIM*3/2) + + output = wp.zeros(shape=1, dtype=int, requires_grad=True) + + with wp.Tape() as tape: + wp.launch(tile_reduce_1d_kernel, dim=[N], inputs=[output], block_dim=TILE_DIM) + + assert(np.sum(np.arange(N)), output.numpy()) + + print("Sum 1D forward passed") + + # output_wp.grad.fill_(1.0) + + # tape.backward() + + # assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.5, rtol=1.e-4)) + + # print("Sum backward passed") + + +test_tile_sum() +test_tile_reduce_1d() + + + + + + + + + + + + + + + + + + + From 22a843cf1b0a352ae2bd9e024f9d538a87a97f21 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 25 Sep 2024 16:38:15 -0700 Subject: [PATCH 033/102] Build-related updates for mathdx+tile --- .gitlab-ci.yml | 27 ++ .gitlab/ci/common.yml | 1 + .gitlab/ci/cuda-11-build-and-test.yml | 2 +- .gitlab/ci/debug-build-and-test.yml | 2 +- .gitlab/ci/mathdx-support.yml | 169 +++++++ UNKNOWN.egg-info/PKG-INFO | 11 - UNKNOWN.egg-info/SOURCES.txt | 8 - UNKNOWN.egg-info/dependency_links.txt | 1 - UNKNOWN.egg-info/top_level.txt | 1 - build_lib.py | 24 +- deps/cuda-toolkit-deps.packman.xml | 6 +- docs/modules/functions.rst | 99 ++++ examples/tile_fft.py | 13 +- examples/tile_matmul.py | 9 +- tools/ci/building/build-linux-x86_64/build.sh | 2 +- warp/build.py | 34 +- warp/build_dll.py | 24 +- warp/builtins.py | 295 ++++++------ warp/codegen.py | 16 +- warp/context.py | 110 +++-- warp/examples/benchmarks/benchmark_tile.py | 85 ++-- warp/mathdx.py | 151 ++++++ warp/native/mathdx.cpp | 56 +++ warp/native/tile_gemm.h | 5 +- warp/native/warp.cpp | 7 +- warp/native/warp.cu | 243 +++++----- warp/native/warp.h | 4 +- warp/stubs.py | 102 ++++ warp/tape.py | 15 +- warp/tests/test_mat_scalar_ops.py | 2 +- warp/tests/test_spatial.py | 2 +- warp/tests/test_tile.py | 444 +++++++----------- warp/tests/test_tile_mathdx.py | 116 +++++ warp/tests/test_tile_reduce.py | 107 ++--- warp/tests/unittest_utils.py | 4 + warp/types.py | 23 +- 36 files changed, 1431 insertions(+), 789 deletions(-) create mode 100644 .gitlab/ci/mathdx-support.yml delete mode 100644 UNKNOWN.egg-info/PKG-INFO delete mode 100644 UNKNOWN.egg-info/SOURCES.txt delete mode 100644 UNKNOWN.egg-info/dependency_links.txt delete mode 100644 UNKNOWN.egg-info/top_level.txt create mode 100644 warp/mathdx.py create mode 100644 warp/native/mathdx.cpp create mode 100644 
warp/tests/test_tile_mathdx.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7a5e2012..5eb130f6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -30,6 +30,7 @@ variables: PIP_CACHE_DIR: '$CI_PROJECT_DIR/.cache/pip' CUDA_BIN: '$CI_PROJECT_DIR/_build/target-deps/cuda/bin' CUDA: '$CI_PROJECT_DIR/_build/target-deps/cuda' + CUDA_HOME: '$CI_PROJECT_DIR/_build/target-deps/cuda' PYTHON: '$CI_PROJECT_DIR/_build/target-deps/python/python' LINBUILD: '$CI_PROJECT_DIR/_build/host-deps/linbuild/linbuild.sh' WARP_CACHE_ROOT: '$CI_PROJECT_DIR/.cache/warp' # Used by the parallel test runner @@ -493,6 +494,32 @@ debug build and test: extends: - .trigger_common +trigger mathdx support pipeline: + stage: test + image: busybox + extends: + - .runner-utility-linux-x86_64 + needs: [] + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" + - if: $CI_COMMIT_TAG + - if: $CI_COMMIT_BRANCH =~ /^release-.*/ + - when: manual # Can be triggered in all other scenarios + allow_failure: true + variables: + GIT_STRATEGY: none + script: + - echo "Run this job to test Warp compiled with mathdx support." + +# Uses the same Python version as the main pipeline. +mathdx support: + stage: child pipelines + needs: [trigger mathdx support pipeline] + trigger: + include: /.gitlab/ci/mathdx-support.yml + extends: + - .trigger_common + # Trigger CUDA 11 pipelines # Workaround from https://gitlab.com/gitlab-org/gitlab/-/issues/284086 trigger cuda 11 pipeline: diff --git a/.gitlab/ci/common.yml b/.gitlab/ci/common.yml index f0d6463a..e8b8c5ee 100644 --- a/.gitlab/ci/common.yml +++ b/.gitlab/ci/common.yml @@ -106,6 +106,7 @@ include: PIP_CACHE_DIR: '$PARENT_PROJECT_DIR/.cache/pip' CUDA_BIN: '$PARENT_PROJECT_DIR/_build/target-deps/cuda/bin' CUDA: '$PARENT_PROJECT_DIR/_build/target-deps/cuda' + CUDA_HOME: '$CI_PROJECT_DIR/_build/target-deps/cuda' PYTHON: '$PARENT_PROJECT_DIR/_build/target-deps/python/python' LINBUILD: '$PARENT_PROJECT_DIR/_build/host-deps/linbuild/linbuild.sh' WARP_CACHE_ROOT: '$PARENT_PROJECT_DIR/.cache/warp' # Used by the parallel test runner diff --git a/.gitlab/ci/cuda-11-build-and-test.yml b/.gitlab/ci/cuda-11-build-and-test.yml index 483a66bd..735104ea 100644 --- a/.gitlab/ci/cuda-11-build-and-test.yml +++ b/.gitlab/ci/cuda-11-build-and-test.yml @@ -187,7 +187,7 @@ create pypi wheels: - find . -type d -exec chmod 775 {} + artifacts: name: $PARENT_COMMIT_REF_SLUG-$PARENT_COMMIT_SHORT_SHA - expose_as: "Python Wheels" + expose_as: "Python Wheels Cu11" paths: - "dist/" when: always diff --git a/.gitlab/ci/debug-build-and-test.yml b/.gitlab/ci/debug-build-and-test.yml index e86f553d..3ebeeade 100644 --- a/.gitlab/ci/debug-build-and-test.yml +++ b/.gitlab/ci/debug-build-and-test.yml @@ -196,7 +196,7 @@ create pypi wheels: - find . -type d -exec chmod 775 {} + artifacts: name: $CI_COMMIT_REF_SLUG-$CI_COMMIT_SHORT_SHA - expose_as: "Python Wheels" + expose_as: "Python Wheels Debug" paths: - "dist/" when: always diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml new file mode 100644 index 00000000..5bea3383 --- /dev/null +++ b/.gitlab/ci/mathdx-support.yml @@ -0,0 +1,169 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +# ============================================================================== +# CI/CD Pipeline Configuration +# ============================================================================== + +include: /.gitlab/ci/common.yml + +workflow: + rules: + - if: $CI_PIPELINE_SOURCE == "parent_pipeline" + +stages: + - build + - test + - package + - deploy + +# ============================================================================== +# Build Jobs (Release) +# ============================================================================== + +linux-x86_64 build: + stage: build + image: ubuntu:20.04 + extends: + - .save_warp_bin_artifact + - .runner-build-linux-x86_64 + before_script: + - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" + - apt-get update && apt-get install build-essential curl wget --no-install-recommends -y + - > + wget --header="X-JFrog-Art-Api:$ARTIFACTORY_API_KEY" -nv --no-check-certificate + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/30/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz + -O libmathdx.tar.gz + - mkdir -p _build/target-deps + - tar -xzf libmathdx.tar.gz -C _build/target-deps + - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" + - gcc --version + - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" + script: + - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image + - mkdir -p warp/bin/linux-x86_64 + - mv warp/bin/warp.so warp/bin/linux-x86_64 + - mv warp/bin/warp-clang.so warp/bin/linux-x86_64 + +linux-aarch64 build: + stage: build + image: ubuntu:20.04 + extends: + - .save_warp_bin_artifact + before_script: + - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" + - apt-get update && apt-get install build-essential curl wget --no-install-recommends -y + - > + wget --header="X-JFrog-Art-Api:$ARTIFACTORY_API_KEY" -nv --no-check-certificate + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/30/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz + -O libmathdx.tar.gz + - mkdir -p _build/target-deps + - tar -xzf libmathdx.tar.gz -C _build/target-deps + - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" + - gcc --version + - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" + script: + - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image + - mkdir -p warp/bin/linux-aarch64 + - mv warp/bin/warp.so warp/bin/linux-aarch64 + - mv warp/bin/warp-clang.so warp/bin/linux-aarch64 + tags: + - arch/arm + +# ============================================================================== +# Unit Testing Jobs (MathDx Support) +# +# Unlike the main testing jobs defined in /.gitlab-ci.yml, the jobs don't +# generate code coverage reports. 
+# ============================================================================== + +linux-x86_64 test: + stage: test + needs: [linux-x86_64 build] + extends: + - .omni_nvks_gpu_2x + - .save_test_report_artifact + before_script: + - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" + - df -h + # Move compiled binaries out of platform-specific directory + - mv warp/bin/linux-x86_64/warp.so warp/bin/ + - mv warp/bin/linux-x86_64/warp-clang.so warp/bin/ + - tools/packman/packman install -l _build/target-deps/python python ${DEFAULT_PYTHON}-linux-x86_64 + - export PATH="$CUDA_BIN:$PATH" + - $PYTHON -m venv _venv + - source _venv/bin/activate + - python -m pip install --upgrade pip + - python -m pip install --upgrade usd-core + - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 + - python -m pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - python -m pip install --upgrade nvidia-mathdx==24.4.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 + - python -m pip install -e . + - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" + # HACK: disable P2P tests due to misbehaving agents + - export WARP_DISABLE_P2P_TESTS=1 + script: + - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast + +linux-aarch64 test jetson: + image: ubuntu:22.04 + needs: [linux-aarch64 build] + extends: + - .save_test_report_artifact + before_script: + - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" + - !reference [.snippets, install-python+warp-aarch64] + - python -m pip install --upgrade nvidia-mathdx==24.4.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 + - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" + script: + - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast + tags: + - gpu/orin + +# ============================================================================== +# Packaging Jobs +# ============================================================================== + +# Creates wheel files for PyPI +create pypi wheels: + stage: package + needs: + - linux-aarch64 build + - linux-x86_64 build + extends: + - .runner-utility-linux-x86_64 + before_script: + - python3 -m pip install --upgrade pip + - python3 -m pip install build + script: + - sed -i 's/dependencies = \["numpy"\]/dependencies = \["numpy", "nvidia-mathdx==24.4.0", "nvidia-cuda-cccl-cu12", "nvidia-cuda-runtime-cu12"\]/' pyproject.toml + - sed -i "s/^\(.*\)$/\1+tile/" VERSION.md # Modify VERSION.md with +tile + - python3 -m build --wheel -C--build-option=-Plinux-x86_64 + - python3 -m build --wheel -C--build-option=-Plinux-aarch64 + - find . -type f -exec chmod 664 {} + + - find . 
-type d -exec chmod 775 {} + + artifacts: + name: $CI_COMMIT_REF_SLUG-$CI_COMMIT_SHORT_SHA + expose_as: "Python Wheels MathDx" + paths: + - "dist/" + when: always + +publish wheels to gitlab pypi registry: + stage: deploy + image: python:3.11-slim + needs: ["create pypi wheels"] + extends: + - .runner-utility-linux-x86_64 + rules: + - when: manual + allow_failure: true + before_script: + - python3 -m pip install --upgrade pip + - python3 -m pip install --upgrade build twine + script: + - TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python3 -m twine upload --verbose --skip-existing --non-interactive --repository-url ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi dist/* diff --git a/UNKNOWN.egg-info/PKG-INFO b/UNKNOWN.egg-info/PKG-INFO deleted file mode 100644 index 9f5ddb3d..00000000 --- a/UNKNOWN.egg-info/PKG-INFO +++ /dev/null @@ -1,11 +0,0 @@ -Metadata-Version: 2.1 -Name: UNKNOWN -Version: 0.0.0 -Summary: UNKNOWN -Home-page: UNKNOWN -License: UNKNOWN -Platform: UNKNOWN -License-File: LICENSE.md - -UNKNOWN - diff --git a/UNKNOWN.egg-info/SOURCES.txt b/UNKNOWN.egg-info/SOURCES.txt deleted file mode 100644 index 0bbe9384..00000000 --- a/UNKNOWN.egg-info/SOURCES.txt +++ /dev/null @@ -1,8 +0,0 @@ -LICENSE.md -README.md -pyproject.toml -setup.py -UNKNOWN.egg-info/PKG-INFO -UNKNOWN.egg-info/SOURCES.txt -UNKNOWN.egg-info/dependency_links.txt -UNKNOWN.egg-info/top_level.txt \ No newline at end of file diff --git a/UNKNOWN.egg-info/dependency_links.txt b/UNKNOWN.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891..00000000 --- a/UNKNOWN.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/UNKNOWN.egg-info/top_level.txt b/UNKNOWN.egg-info/top_level.txt deleted file mode 100644 index 8b137891..00000000 --- a/UNKNOWN.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/build_lib.py b/build_lib.py index 3827c4cd..7aea44fe 100644 --- a/build_lib.py +++ b/build_lib.py @@ -17,6 +17,7 @@ import argparse import glob import os +import platform import shutil from warp.build_dll import build_dll, find_host_compiler, set_msvc_env, verbose_cmd @@ -26,6 +27,7 @@ parser.add_argument("--msvc_path", type=str, help="Path to MSVC compiler (optional if already on PATH)") parser.add_argument("--sdk_path", type=str, help="Path to WinSDK (optional if already on PATH)") parser.add_argument("--cuda_path", type=str, help="Path to CUDA SDK") +parser.add_argument("--libmathdx_path", type=str, help="Path to libmathdx (optional if LIBMATHDX_HOME is defined)") parser.add_argument( "--mode", type=str, @@ -70,6 +72,7 @@ parser.add_argument("--no_standalone", dest="standalone", action="store_false") parser.set_defaults(standalone=True) + args = parser.parse_args() # set build output path off this file @@ -97,7 +100,7 @@ def find_cuda_sdk(): return cuda_sdk # check default paths - if os.name == "nt": + if platform.system() == "Windows": cuda_paths = glob.glob("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*.*") if len(cuda_paths) >= 1: cuda_sdk = cuda_paths[0] @@ -115,16 +118,21 @@ def find_cuda_sdk(): # setup CUDA Toolkit path -if sys.platform == "darwin": +if platform.system() == "Darwin": args.cuda_path = None - else: if not args.cuda_path: args.cuda_path = find_cuda_sdk() + if not args.libmathdx_path: + libmathdx_path = os.environ.get("LIBMATHDX_HOME") + + if libmathdx_path: + print(f"Using libmathdx path '{libmathdx_path}' provided through the 'LIBMATHDX_HOME' environment variable") + args.libmathdx_path = libmathdx_path # setup MSVC and 
WinSDK paths -if os.name == "nt": +if platform.system() == "Windows": if args.msvc_path or args.sdk_path: # user provided MSVC and Windows SDK assert args.msvc_path and args.sdk_path, "--msvc_path and --sdk_path must be used together." @@ -141,9 +149,9 @@ def find_cuda_sdk(): # return platform specific shared library name def lib_name(name): - if sys.platform == "win32": + if platform.system() == "Windows": return f"{name}.dll" - elif sys.platform == "darwin": + elif platform.system() == "Darwin": return f"lib{name}.dylib" else: return f"{name}.so" @@ -189,6 +197,7 @@ def generate_exports_header_file(): "native/volume.cpp", "native/marching.cpp", "native/cutlass_gemm.cpp", + "native/mathdx.cpp", ] warp_cpp_paths = [os.path.join(build_path, cpp) for cpp in cpp_sources] @@ -198,6 +207,9 @@ def generate_exports_header_file(): else: warp_cu_path = os.path.join(build_path, "native/warp.cu") + if args.libmathdx_path is None: + print("Warning: libmathdx not found, building without MathDx support") + warp_dll_path = os.path.join(build_path, f"bin/{lib_name('warp')}") build_dll(args, dll_path=warp_dll_path, cpp_paths=warp_cpp_paths, cu_path=warp_cu_path) diff --git a/deps/cuda-toolkit-deps.packman.xml b/deps/cuda-toolkit-deps.packman.xml index 7b3b4e6d..0024ee86 100644 --- a/deps/cuda-toolkit-deps.packman.xml +++ b/deps/cuda-toolkit-deps.packman.xml @@ -7,9 +7,9 @@ - - - + + + diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 080a6e7d..bcd18cc9 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -800,6 +800,77 @@ Spatial Math +Tile Primitives +--------------- +.. py:function:: tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile + + Allocate a tile local block of zero'd memory + + +.. py:function:: tile_load(a: Array[Any], x: int32, y: int32, m: int32, n: int32) -> Tile + + Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n) + + +.. py:function:: tile_store(a: Array[Any], x: int32, y: int32, t: Any) -> None + + Store tile `t` to an array `a` at offset `(i=x*m, j=y*n)` + + +.. py:function:: tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile + + Atomically add a tile `t` worth of data to array `a` at offset `(i=x*m, j=y*n)` + + +.. py:function:: tile(x: Any) -> Tile + + Construct a Tile from a per-thread kernel value, returns a tile with dimensions of `(1, block_dim)` where block_dim is the number of threads specified in `wp.launch()` + + +.. py:function:: tile_extract(a: Tile, i: int32, j: int32) -> None + + Extract element at index (i, j) of the tile and return the native type + + +.. py:function:: tile_matmul(a: Tile, b: Tile, out: Tile) -> None + + Compute matrix product and accumulate out += a*b. + + +.. py:function:: tile_sum(a: Tile) -> None + + Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored + + +.. py:function:: tile_map(op: Callable, a: Any) -> None + + Map the operation onto each element of the tile + + +.. py:function:: tile_map(op: Callable, a: Any, b: Any) -> None + :noindex: + :nocontentsentry: + + Map the operation onto each element of the tile + + +.. py:function:: tile_matmul_dx(a: Tile, b: Tile, out: Tile) -> None + + Compute matrix product and accumulate out += a*b. + + +.. py:function:: tile_fft_dx(inout: Tile) -> None + + Compute the FFT along the second dimension of a 2D tile of data. + + +.. py:function:: tile_ifft_dx(inout: Tile) -> None + + Compute the inverse FFT along the second dimension of a 2D tile of data. 
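Editorial note: taken together, the primitives documented above express a cooperative load/compute/store pattern executed by all threads in a block. The following minimal sketch is not part of the patch; it assumes the signatures listed in this section, and the kernel name, tile sizes, and the use of ``wp.sin`` as the mapped operation are illustrative only::

    import warp as wp

    TILE_M = wp.constant(8)
    TILE_N = wp.constant(16)
    TILE_THREADS = 64

    @wp.kernel
    def tile_sin_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
        # tile indices; the trailing tid component spans the threads in the block
        i, j, _ = wp.tid()

        t = wp.tile_load(a, i, j, m=TILE_M, n=TILE_N)  # cooperative load from global memory
        s = wp.tile_map(wp.sin, t)                     # element-wise map over the tile
        wp.tile_store(b, i, j, s)                      # cooperative store back to global memory

    # launched with a trailing block dimension, matching the tests in this patch series:
    # wp.launch(tile_sin_kernel, dim=[m_tiles, n_tiles, TILE_THREADS],
    #           inputs=[a, b], block_dim=TILE_THREADS)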
+ + + + Utility --------------- .. py:function:: mlp(weights: Array[float32], bias: Array[float32], activation: Callable, index: int32, x: Array[float32], out: Array[float32]) -> None @@ -1961,6 +2032,13 @@ Operators :nocontentsentry: +.. py:function:: add(a: Tile, b: Tile) -> None + :noindex: + :nocontentsentry: + + Add each element of two tiles together + + .. py:function:: sub(a: Scalar, b: Scalar) -> Scalar @@ -2052,6 +2130,20 @@ Operators :nocontentsentry: +.. py:function:: mul(x: Tile, y: Scalar) -> Tile + :noindex: + :nocontentsentry: + + Multiply each element of a tile by a scalar + + +.. py:function:: mul(x: Scalar, y: Tile) -> Tile + :noindex: + :nocontentsentry: + + Multiply each element of a tile by a scalar + + .. py:function:: mod(a: Scalar, b: Scalar) -> Scalar Modulo operation using truncated division. @@ -2136,6 +2228,13 @@ Operators :nocontentsentry: +.. py:function:: neg(x: Tile) -> Tile + :noindex: + :nocontentsentry: + + Negate each element of a tile + + .. py:function:: unot(a: bool) -> bool diff --git a/examples/tile_fft.py b/examples/tile_fft.py index f6cf23f9..edc6c101 100644 --- a/examples/tile_fft.py +++ b/examples/tile_fft.py @@ -1,6 +1,6 @@ import numpy as np + import warp as wp -import numpy as np wp.init() wp.set_module_options({"enable_backward": False}) @@ -11,10 +11,9 @@ TILE_M = 1 TILE_N = 32 + @wp.kernel -def fft_tiled(x: wp.array2d(dtype=wp.vec2d), - y: wp.array2d(dtype=wp.vec2d)): - +def fft_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)): i, j, _ = wp.tid() a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N) wp.tile_fft_dx(a) @@ -23,12 +22,12 @@ def fft_tiled(x: wp.array2d(dtype=wp.vec2d), x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64) -x_h[:,:,1] = 0 +x_h[:, :, 1] = 0 y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64) x_wp = wp.array2d(x_h, dtype=wp.vec2d) y_wp = wp.array2d(y_h, dtype=wp.vec2d) wp.launch(fft_tiled, dim=[1, 1, BLOCK_DIM], inputs=[x_wp, y_wp], block_dim=BLOCK_DIM) -print("inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...] -print("output:\n", y_wp) # [32+0i, 0, 0, ...] +print("inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...] +print("output:\n", y_wp) # [32+0i, 0, 0, ...] 
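Editorial note: the vec2d entries above are an interleaved real/imaginary representation, so the result can be checked against NumPy by reinterpreting the buffers as complex128. This is a sketch, not part of the patch, and assumes the x_h/y_wp names and shapes from the example above:

    import numpy as np

    # view the (TILE_M, TILE_N, 2) float64 buffers as complex numbers
    x_c = x_h.view(np.complex128).reshape(TILE_M, TILE_N)
    y_c = y_wp.numpy().view(np.complex128).reshape(TILE_M, TILE_N)

    # the forward FFT of an all-ones row is [N, 0, 0, ...], matching the comment above
    assert np.allclose(np.fft.fft(x_c, axis=-1), y_c)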
diff --git a/examples/tile_matmul.py b/examples/tile_matmul.py index 3d980592..faedbee6 100644 --- a/examples/tile_matmul.py +++ b/examples/tile_matmul.py @@ -1,4 +1,5 @@ import numpy as np + import warp as wp wp.init() @@ -7,11 +8,9 @@ BLOCK_DIM = 32 M, N, K = 4, 8, 16 + @wp.kernel -def matmul_tiled(ga: wp.array2d(dtype=wp.float64), - gb: wp.array2d(dtype=wp.float64), - gc: wp.array2d(dtype=wp.float64)): - +def matmul_tiled(ga: wp.array2d(dtype=wp.float64), gb: wp.array2d(dtype=wp.float64), gc: wp.array2d(dtype=wp.float64)): i, j, _ = wp.tid() a = wp.tile_load(ga, i, j, m=M, n=K) b = wp.tile_load(gb, i, j, m=K, n=N) @@ -31,5 +30,5 @@ def matmul_tiled(ga: wp.array2d(dtype=wp.float64), wp.launch(matmul_tiled, dim=[1, 1, BLOCK_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=BLOCK_DIM) wp.synchronize() -print("inputs:\n", A, '\n', B) +print("inputs:\n", A, "\n", B) print("output (should be = 48 * np.ones(4, 8)):\n", C_wp) diff --git a/tools/ci/building/build-linux-x86_64/build.sh b/tools/ci/building/build-linux-x86_64/build.sh index 51940183..e9af605d 100755 --- a/tools/ci/building/build-linux-x86_64/build.sh +++ b/tools/ci/building/build-linux-x86_64/build.sh @@ -74,7 +74,7 @@ CUDA="$SCRIPT_DIR/../../../../_build/target-deps/cuda" # pip deps $PYTHON -m pip install --upgrade pip -$PYTHON -m pip install --upgrade numpy gitpython cmake ninja +$PYTHON -m pip install --upgrade numpy gitpython cmake ninja nvidia-mathdx==24.4.0 if [ "$GITLAB_CI" = "true" ]; then echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" diff --git a/warp/build.py b/warp/build.py index 024e5ebc..8655201c 100644 --- a/warp/build.py +++ b/warp/build.py @@ -5,40 +5,48 @@ # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. 
+import ctypes import os import warp.config +from warp.mathdx import get_cuda_include_dirs from warp.thirdparty import appdirs -import ctypes - -def get_mathdx_include_dirs(): - return (os.environ['MATHDX_HOME'] + '/include').encode("utf-8") -def get_cuda_include_dirs(): - cuda_inc_path = (os.environ['CUDA_HOME'] + '/include').encode("utf-8") - include_dirs = [cuda_inc_path] - arr_include_dirs = (ctypes.c_char_p * len(include_dirs))() - arr_include_dirs[:] = include_dirs - return arr_include_dirs # builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension) -def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, ltoirs=[]): +def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fast_math=False, ltoirs=None): with open(cu_path, "rb") as src_file: src = src_file.read() cu_path = cu_path.encode("utf-8") inc_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "native").encode("utf-8") output_path = output_path.encode("utf-8") - cuda_include_dirs = get_cuda_include_dirs() if warp.config.llvm_cuda: warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False) else: + cuda_include_dirs = get_cuda_include_dirs() + + if ltoirs is None: + ltoirs = [] + num_ltoirs = len(ltoirs) arr_lroirs = (ctypes.c_char_p * num_ltoirs)(*ltoirs) arr_lroir_sizes = (ctypes.c_size_t * num_ltoirs)(*[len(l) for l in ltoirs]) err = warp.context.runtime.core.cuda_compile_program( - src, arch, inc_path, len(cuda_include_dirs), cuda_include_dirs, config == "debug", warp.config.verbose, verify_fp, fast_math, output_path, num_ltoirs, arr_lroirs, arr_lroir_sizes + src, + arch, + inc_path, + len(cuda_include_dirs), + cuda_include_dirs, + config == "debug", + warp.config.verbose, + verify_fp, + fast_math, + output_path, + num_ltoirs, + arr_lroirs, + arr_lroir_sizes, ) if err != 0: raise Exception(f"CUDA kernel build failed with error code {err}") diff --git a/warp/build_dll.py b/warp/build_dll.py index b6dfc81d..b860c146 100644 --- a/warp/build_dll.py +++ b/warp/build_dll.py @@ -244,7 +244,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None iter_dbg = "_ITERATOR_DEBUG_LEVEL=2" debug = "_DEBUG" - cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} ' + cpp_flags = f'/nologo /std:c++17 /GR- {runtime} /D "{debug}" /D "{cuda_enabled}" /D "{cutlass_enabled}" /D "WP_ENABLE_MATHDX=0" /D "{cuda_compat_enabled}" /D "{iter_dbg}" /I"{native_dir}" {includes} ' if args.mode == "debug": cpp_flags += "/Zi /Od /D WP_ENABLE_DEBUG=1" @@ -282,7 +282,7 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None run_cmd(cuda_cmd) linkopts.append(quote(cu_out)) linkopts.append( - f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib /LIBPATH:"{cuda_home}/lib/x64"' + f'cudart_static.lib nvrtc_static.lib nvrtc-builtins_static.lib nvptxcompiler_static.lib ws2_32.lib user32.lib nvJitLink_static.lib /LIBPATH:"{cuda_home}/lib/x64"' ) with ScopedTimer("link", active=args.verbose): @@ -290,19 +290,24 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None run_cmd(link_cmd) else: - libmathdx_home = os.environ['LIBMATHDX_HOME'] - libmathdx_includes = f'-I{libmathdx_home}/include' cpp_includes = f' 
-I"{warp_home_path.parent}/external/llvm-project/out/install/{mode}-{arch}/include"' cpp_includes += f' -I"{warp_home_path.parent}/_build/host-deps/llvm-project/release-{arch}/include"' cuda_includes = f' -I"{cuda_home}/include"' if cu_path else "" includes = cpp_includes + cuda_includes + if args.libmathdx_path: + libmathdx_includes = f' -I"{args.libmathdx_path}/include"' + mathdx_enabled = "WP_ENABLE_MATHDX=1" + else: + libmathdx_includes = "" + mathdx_enabled = "WP_ENABLE_MATHDX=0" + if sys.platform == "darwin": version = f"--target={arch}-apple-macos11" else: version = "-fabi-version=13" # GCC 8.2+ - cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} ' + cpp_flags = f'{version} --std=c++17 -fno-rtti -D{cuda_enabled} -D{cutlass_enabled} -D{mathdx_enabled} -D{cuda_compat_enabled} -fPIC -fvisibility=hidden -D_GLIBCXX_USE_CXX11_ABI=0 -I"{native_dir}" {includes} ' if mode == "debug": cpp_flags += "-O0 -g -D_DEBUG -DWP_ENABLE_DEBUG=1 -fkeep-inline-functions" @@ -330,19 +335,22 @@ def build_dll_for_arch(args, dll_path, cpp_paths, cu_path, libs, arch, mode=None cu_out = cu_path + ".o" if mode == "debug": - cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -g -G -O0 --compiler-options -fPIC,-fvisibility=hidden -D_DEBUG -D_ITERATOR_DEBUG_LEVEL=0 -line-info {" ".join(nvcc_opts)} -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"' elif mode == "release": - cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"' + cuda_cmd = f'"{cuda_home}/bin/nvcc" --std=c++17 -O3 --compiler-options -fPIC,-fvisibility=hidden {" ".join(nvcc_opts)} -DNDEBUG -DWP_ENABLE_CUDA=1 -I"{native_dir}" -D{cutlass_enabled} {cutlass_includes} -D{mathdx_enabled} {libmathdx_includes} -o "{cu_out}" -c "{cu_path}"' with ScopedTimer("build_cuda", active=args.verbose): run_cmd(cuda_cmd) ld_inputs.append(quote(cu_out)) ld_inputs.append( - f'-L"{cuda_home}/lib64" -L{libmathdx_home}/lib -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lnvJitLink_static -lpthread -ldl -lrt -lmathdx_static' + f'-L"{cuda_home}/lib64" -lcudart_static -lnvrtc_static -lnvrtc-builtins_static -lnvptxcompiler_static -lnvJitLink_static -lpthread -ldl -lrt' ) + if args.libmathdx_path: + ld_inputs.append(f"-L{args.libmathdx_path}/lib -lmathdx_static") + if sys.platform == "darwin": opt_no_undefined = "-Wl,-undefined,error" opt_exclude_libs = "" diff --git a/warp/builtins.py b/warp/builtins.py index 3b74d18d..2286b7fc 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5,16 +5,15 @@ # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. 
import builtins -import tempfile import functools -import os +import tempfile from typing import Any, Callable, Mapping, Sequence from warp.codegen import Reference, Var, strip_reference +from warp.mathdx import get_cuda_include_dirs, get_mathdx_include_dirs from warp.types import * from .context import add_builtin -from .build import get_cuda_include_dirs, get_mathdx_include_dirs def seq_check_equal(seq_1, seq_2): @@ -1708,8 +1707,8 @@ def spatial_vector_dispatch_func(input_types: Mapping[str, type], return_type: A # Tile-based primitives shared_memory_id = 0 + def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): - # return generic type (for doc builds) if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) @@ -1731,8 +1730,8 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str return TileZeros(dtype=dtype, M=m, N=n) -def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): +def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): m, n, dtype = arg_values["m"], arg_values["n"], arg_values["dtype"] template_args = [] @@ -1743,7 +1742,6 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar return ([], template_args) - add_builtin( "tile_zeros", input_types={"m": int, "n": int, "dtype": Scalar}, @@ -1755,13 +1753,13 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar export=False, ) + def tile_load_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) - # if len(arg_types) != 3: + # if len(arg_types) != 3: # raise RuntimeError("tile_load() requires 3 positional args") if not is_array(arg_types["a"]): @@ -1786,7 +1784,6 @@ def tile_load_value_func(arg_types, arg_values): def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): - array = arg_values["a"] x, y = arg_values["x"], arg_values["y"] m, n = arg_values["m"].constant, arg_values["n"].constant @@ -1811,13 +1808,13 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg export=False, ) + def tile_store_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return None - if len(arg_types) != 4: + if len(arg_types) != 4: raise RuntimeError("tile_store() requires 4 positional args") if not is_array(arg_types["a"]): @@ -1835,7 +1832,6 @@ def tile_store_value_func(arg_types, arg_values): return None - add_builtin( "tile_store", input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, @@ -1846,13 +1842,13 @@ def tile_store_value_func(arg_types, arg_values): export=False, ) + def tile_atomic_add_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) - if len(arg_types) != 4: + if len(arg_types) != 4: raise RuntimeError("tile_atomic_add() requires 4 positional args") if not is_array(arg_types["a"]): @@ -1870,10 +1866,7 @@ def tile_atomic_add_value_func(arg_types, arg_values): if arg_types["a"].dtype != arg_types["t"].dtype: raise RuntimeError("tile_atomic_add() tile dtype and array dtype must match") - return Tile(dtype=arg_types["t"].dtype, - M=arg_types["t"].M, - N=arg_types["t"].N) - + return Tile(dtype=arg_types["t"].dtype, M=arg_types["t"].M, N=arg_types["t"].N) add_builtin( @@ -1888,12 +1881,11 @@ def 
tile_atomic_add_value_func(arg_types, arg_values): def tile_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return Tile - if len(arg_types) != 1: + if len(arg_types) != 1: raise RuntimeError("tile() requires 1 positional arg") # todo: we need a way to pass things like current compiler options @@ -1903,7 +1895,6 @@ def tile_value_func(arg_types, arg_values): return Tile(dtype=arg_types["x"], M=1, N=warp.codegen.options["block_dim"], op="Tile") - add_builtin( "tile", input_types={"x": Any}, @@ -1916,17 +1907,16 @@ def tile_value_func(arg_types, arg_values): def tile_extract_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: - return None - - if len(arg_types) != 3: + return None + + if len(arg_types) != 3: raise RuntimeError("tile_extract() requires 3 positional args") if not is_tile(arg_types["a"]): raise RuntimeError("tile_extract() argument 0 must be a tile") - + return arg_types["a"].dtype @@ -1942,12 +1932,11 @@ def tile_extract_value_func(arg_types, arg_values): def tile_matmul_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return None - if len(arg_types) != 3: + if len(arg_types) != 3: raise RuntimeError("tile_matmul() requires 4 positional args") if not is_tile(arg_types["a"]): @@ -1962,11 +1951,10 @@ def tile_matmul_value_func(arg_types, arg_values): if arg_types["out"].storage != "shared": raise RuntimeError("tile_matmul() output argument must have shared memory storage") - return None + def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): - a = arg_values["a"] b = arg_values["b"] out = arg_values["out"] @@ -1986,13 +1974,13 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a value_func=tile_matmul_value_func, dispatch_func=tile_matmul_dispatch_func, variadic=True, - doc="Compute matrix product and accumulate out += a*b.", + doc="Compute matrix product and accumulate out += a*b.", group="Tile Primitives", export=False, ) + def tile_sum_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return None @@ -2013,16 +2001,14 @@ def tile_sum_value_func(arg_types, arg_values): input_types={"a": Tile}, value_func=tile_sum_value_func, variadic=True, - doc="Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored", + doc="Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored", group="Tile Primitives", export=False, ) - # does type propagation for load() def tile_unary_map_value_func(arg_types, arg_values): - if arg_types is None: return None @@ -2045,16 +2031,16 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar "tile_map", input_types={"op": Callable, "a": Any}, value_func=tile_unary_map_value_func, - #dispatch_func=tile_map_dispatch_func, - #variadic=True, + # dispatch_func=tile_map_dispatch_func, + # variadic=True, native_func="tile_unary_map", - doc="Map the operation onto each element of the tile", + doc="Map the operation onto each element of the tile", group="Tile Primitives", export=False, ) -def tile_binary_map_value_func(arg_types, arg_values): +def tile_binary_map_value_func(arg_types, arg_values): if arg_types is None: return None @@ -2085,10 +2071,10 @@ def tile_binary_map_value_func(arg_types, arg_values): "tile_map", input_types={"op": Callable, "a": Any, "b": Any}, 
value_func=tile_binary_map_value_func, - #dispatch_func=tile_map_dispatch_func, - #variadic=True, + # dispatch_func=tile_map_dispatch_func, + # variadic=True, native_func="tile_binary_map", - doc="Map the operation onto each element of the tile", + doc="Map the operation onto each element of the tile", group="Tile Primitives", export=False, ) @@ -4705,7 +4691,6 @@ def matmat_mul_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str # Tile operators def tile_unary_value_func(arg_types, arg_values): - if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) @@ -4713,29 +4698,33 @@ def tile_unary_value_func(arg_types, arg_values): if not is_tile(t): raise RuntimeError("Expected tile for unary expression") - + return TileUnaryMap(t) -def tile_scalar_mul_value_func(arg_types, arg_values): +def tile_scalar_mul_value_func(arg_types, arg_values): if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) x = arg_types["x"] y = arg_types["y"] - + # tile*scalar if is_tile(x): if x.dtype != y: - raise RuntimeError("Scalar factor should have the same type as tile for tile*scalar, tile type: {x} scalar type: {y}") - + raise RuntimeError( + "Scalar factor should have the same type as tile for tile*scalar, tile type: {x} scalar type: {y}" + ) + return TileBinaryMap(x, TileConstant(y, x.M, x.N)) - + # scalar*tile if is_tile(y): if y.dtype != x: - raise RuntimeError("Scalar factor should have the same type as tile for scalar*tile, tile type: {x} scalar type: {y}") - + raise RuntimeError( + "Scalar factor should have the same type as tile for scalar*tile, tile type: {x} scalar type: {y}" + ) + return TileBinaryMap(TileConstant(x, y.M, y.N), y) @@ -4753,10 +4742,10 @@ def tile_scalar_mul_value_func(arg_types, arg_values): "add", input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_binary_map_value_func, - #dispatch_func=tile_map_dispatch_func, - #variadic=True, + # dispatch_func=tile_map_dispatch_func, + # variadic=True, native_func="tile_add", - doc="Add each element of two tiles together", + doc="Add each element of two tiles together", group="Tile Primitives", export=False, ) @@ -4785,17 +4774,17 @@ def tile_scalar_mul_value_func(arg_types, arg_values): ## ## MathDx, LTOIR-based, Tile functions ## - + + ## ## Matmul ## def tile_matmul_generic_value_func(arg_types, arg_values): - # return generic type (for doc builds) if arg_types is None: return None - if len(arg_types) != 3: + if len(arg_types) != 3: raise RuntimeError("tile_matmul() requires 4 positional args") if not is_tile(arg_types["a"]): @@ -4810,26 +4799,29 @@ def tile_matmul_generic_value_func(arg_types, arg_values): if arg_types["out"].storage != "shared": raise RuntimeError("tile_matmul() output argument must have shared memory storage") - return None -def tile_matmul_generic_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any]): - + +def tile_matmul_generic_dispatch_func( + arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any] +): a = arg_values["a"] b = arg_values["b"] out = arg_values["out"] if any(not is_tile(arg.type) for arg in [a, b, out]): - raise RuntimeError(f"tile_matmul() requires three Tile arguments") - + raise RuntimeError("tile_matmul() requires three Tile arguments") + if any(arg.type.dtype not in [float16, float32, float64, vec2h, vec2f, vec2d] for arg in [a, b, out]): - raise RuntimeError(f"tile_matmul() arguments must be tiles of float16, 
float32 or float64, vec2h, vec2f, vec2d entries") - + raise RuntimeError( + "tile_matmul() arguments must be tiles of float16, float32 or float64, vec2h, vec2f, vec2d entries" + ) + if any(arg.type.dtype != out.type.dtype for arg in [a, b]): - raise RuntimeError(f"tile_matmul() arguments must have the same type") + raise RuntimeError("tile_matmul() arguments must have the same type") if (a.type.N != b.type.M) or (a.type.M != out.type.M) or (b.type.N != out.type.N): - raise RuntimeError(f"tile_matmul(A, B, C) requires sizes of A, B and C to be consistent for a matmul") + raise RuntimeError("tile_matmul(A, B, C) requires sizes of A, B and C to be consistent for a matmul") # set the storage type to the inputs to shared a.type.storage = "shared" @@ -4840,75 +4832,93 @@ def tile_matmul_generic_dispatch_func(arg_types: Mapping[str, type], return_type # Real if out.type.dtype == float16: dtype = "wp::float16" - precision = 2 # COMMONDX_PRECISION_F16 - element_type = 0 # CUBLASDX_TYPE_REAL + precision = 2 # COMMONDX_PRECISION_F16 + element_type = 0 # CUBLASDX_TYPE_REAL elif out.type.dtype == float32: dtype = "wp::float32" - precision = 3 # COMMONDX_PRECISION_F32 - element_type = 0 # CUBLASDX_TYPE_REAL + precision = 3 # COMMONDX_PRECISION_F32 + element_type = 0 # CUBLASDX_TYPE_REAL elif out.type.dtype == float64: dtype = "wp::float64" - precision = 4 # COMMONDX_PRECISION_F64 - element_type = 0 # CUBLASDX_TYPE_REAL + precision = 4 # COMMONDX_PRECISION_F64 + element_type = 0 # CUBLASDX_TYPE_REAL # Complex elif out.type.dtype == vec2h: dtype = "wp::vec2h" - precision = 2 # COMMONDX_PRECISION_F16 - element_type = 1 # CUBLASDX_TYPE_COMPLEX + precision = 2 # COMMONDX_PRECISION_F16 + element_type = 1 # CUBLASDX_TYPE_COMPLEX elif out.type.dtype == vec2f: dtype = "wp::vec2f" - precision = 3 # COMMONDX_PRECISION_F32 - element_type = 1 # CUBLASDX_TYPE_COMPLEX + precision = 3 # COMMONDX_PRECISION_F32 + element_type = 1 # CUBLASDX_TYPE_COMPLEX elif out.type.dtype == vec2d: dtype = "wp::vec2d" - precision = 4 # COMMONDX_PRECISION_F64 - element_type = 1 # CUBLASDX_TYPE_COMPLEX + precision = 4 # COMMONDX_PRECISION_F64 + element_type = 1 # CUBLASDX_TYPE_COMPLEX else: raise RuntimeError("Unsupported datatype") # generate the LTO M, K = a.type.M, a.type.N _, N = b.type.M, b.type.N - num_threads = options['block_dim'] - arch = options['output_arch'] + num_threads = options["block_dim"] + arch = options["output_arch"] def make_function(M, N, K, tA, tB): # Warp follows Numpy: matrices are row-major # But cuBLASDx follows BLAS: matrices are col-major # So we have to flip M <-> N and A <-> B def make_transpose(t): - if t == 'N': - return 0 # CUBLASDX_TRANSPOSE_MODE_NON_TRANSPOSED - elif t == 'T': - return 1 # CUBLASDX_TRANSPOSE_MODE_TRANSPOSED + if t == "N": + return 0 # CUBLASDX_TRANSPOSE_MODE_NON_TRANSPOSED + elif t == "T": + return 1 # CUBLASDX_TRANSPOSE_MODE_TRANSPOSED raise RuntimeError("Invalid transpose mode") + lto_symbol = f"dot_{M}_{N}_{K}_{tA}_{tB}_{precision}_{element_type}" lto_code = tempfile.NamedTemporaryFile() include_dirs = get_cuda_include_dirs() result = warp.context.runtime.core.cuda_compile_dot( - lto_code.name.encode("utf-8"), lto_symbol.encode("utf-8"), - len(include_dirs), include_dirs, get_mathdx_include_dirs(), - arch, N, M, K, precision, element_type, make_transpose(tB), make_transpose(tA), num_threads) + lto_code.name.encode("utf-8"), + lto_symbol.encode("utf-8"), + len(include_dirs), + include_dirs, + get_mathdx_include_dirs(), + arch, + N, + M, + K, + precision, + element_type, + 
make_transpose(tB), + make_transpose(tA), + num_threads, + ) if not result: raise RuntimeError("Failed to compile tile_matmul") else: - with open(lto_code.name, 'rb') as f: + with open(lto_code.name, "rb") as f: lto_code = f.read() return lto_symbol, lto_code - (fun_forward, lto_forward) = make_function(M, N, K, 'N', 'N') # C += A * B - (fun_backward_A, lto_backward_A) = make_function(M, K, N, 'N', 'T') # adjA += adjC * B^T - (fun_backward_B, lto_backward_B) = make_function(K, N, M, 'T', 'N') # adjB += A^T * adjC + (fun_forward, lto_forward) = make_function(M, N, K, "N", "N") # C += A * B + (fun_backward_A, lto_backward_A) = make_function(M, K, N, "N", "T") # adjA += adjC * B^T + (fun_backward_B, lto_backward_B) = make_function(K, N, M, "T", "N") # adjB += A^T * adjC + + return ( + ( + Var(fun_forward, str, False, True, False), + Var(fun_backward_A, str, False, True, False), + Var(fun_backward_B, str, False, True, False), + Var(dtype, str, False, True, False), + a, + b, + out, + ), + template_args, + [lto_forward, lto_backward_A, lto_backward_B], + ) - return ((Var(fun_forward, str, False, True, False), - Var(fun_backward_A, str, False, True, False), - Var(fun_backward_B, str, False, True, False), - Var(dtype, str, False, True, False), - a, - b, - out), - template_args, - [lto_forward, lto_backward_A, lto_backward_B]) add_builtin( "tile_matmul_dx", @@ -4916,21 +4926,21 @@ def make_transpose(t): value_func=tile_matmul_generic_value_func, lto_dispatch_func=tile_matmul_generic_dispatch_func, variadic=True, - doc="Compute matrix product and accumulate out += a*b.", + doc="Compute matrix product and accumulate out += a*b.", group="Tile Primitives", export=False, namespace="", ) + ## ## FFT ## def tile_fft_generic_value_func(arg_types, arg_values): - if arg_types is None: return None - if len(arg_types) != 1: + if len(arg_types) != 1: raise RuntimeError("tile_fft() requires 1 positional args") if not is_tile(arg_types["inout"]): @@ -4941,38 +4951,44 @@ def tile_fft_generic_value_func(arg_types, arg_values): return None -def tile_fft_generic_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any], direction:str = None): - + +def tile_fft_generic_dispatch_func( + arg_types: Mapping[str, type], + return_type: Any, + arg_values: Mapping[str, Var], + options: Mapping[str, Any], + direction: str = None, +): inout = arg_values["inout"] inout.type.storage = "register" - if (not is_tile(inout.type)): - raise RuntimeError(f"tile_fft() arguments must be a single tile with register storage") + if not is_tile(inout.type): + raise RuntimeError("tile_fft() arguments must be a single tile with register storage") - if (inout.type.dtype not in [vec2f, vec2d]): - raise RuntimeError(f"tile_fft() argument must be a tile of vec2f or vec2d (interpreted as complex) entries") + if inout.type.dtype not in [vec2f, vec2d]: + raise RuntimeError("tile_fft() argument must be a tile of vec2f or vec2d (interpreted as complex) entries") # see libcufftdx.hpp - if direction == 'forward': - dir = 0 # CUFFTDX_DIRECTION_FORWARD - elif direction == 'inverse': - dir = 1 # CUFFTDX_DIRECTION_INVERSE + if direction == "forward": + dir = 0 # CUFFTDX_DIRECTION_FORWARD + elif direction == "inverse": + dir = 1 # CUFFTDX_DIRECTION_INVERSE else: raise RuntimeError("Invalid direction") - + if inout.type.dtype == vec2f: dtype = "wp::vec2f" - precision = 3 # COMMONDX_PRECISION_F32 + precision = 3 # COMMONDX_PRECISION_F32 elif inout.type.dtype == vec2d: dtype = "wp::vec2d" - precision 
= 4 # COMMONDX_PRECISION_F64 + precision = 4 # COMMONDX_PRECISION_F64 else: raise RuntimeError("Unsupported datatype") # M FFTs of size N each batch, size = inout.type.M, inout.type.N - num_threads = options['block_dim'] - arch = options['output_arch'] + num_threads = options["block_dim"] + arch = options["output_arch"] ept = size // num_threads lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}" @@ -4982,35 +4998,46 @@ def tile_fft_generic_dispatch_func(arg_types: Mapping[str, type], return_type: A include_dirs = get_cuda_include_dirs() result = warp.context.runtime.core.cuda_compile_fft( - lto_code.name.encode("utf-8"), + lto_code.name.encode("utf-8"), lto_symbol.encode("utf-8"), - len(include_dirs), include_dirs, + len(include_dirs), + include_dirs, get_mathdx_include_dirs(), - arch, size, ept, dir, precision, ctypes.byref(shared_memory_size) + arch, + size, + ept, + dir, + precision, + ctypes.byref(shared_memory_size), ) if not result: raise RuntimeError("Failed to compile tile_matmul") - with open(lto_code.name, 'rb') as f: + with open(lto_code.name, "rb") as f: lto_code = f.read() - return ((Var(lto_symbol, str, False, True, False), - Var(dtype, str, False, True, False), - Var(str(shared_memory_size.value), str, False, True, False), - Var(str(batch), str, False, True, False), - Var(str(ept), str, False, True, False), - inout), - [], - [lto_code]) + return ( + ( + Var(lto_symbol, str, False, True, False), + Var(dtype, str, False, True, False), + Var(str(shared_memory_size.value), str, False, True, False), + Var(str(batch), str, False, True, False), + Var(str(ept), str, False, True, False), + inout, + ), + [], + [lto_code], + ) + add_builtin( "tile_fft_dx", input_types={"inout": Tile}, value_func=tile_fft_generic_value_func, - lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction='forward'), + lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction="forward"), variadic=True, - doc="Compute the FFT along the second dimension of a 2D tile of data.", + doc="Compute the FFT along the second dimension of a 2D tile of data.", group="Tile Primitives", export=False, namespace="", @@ -5020,9 +5047,9 @@ def tile_fft_generic_dispatch_func(arg_types: Mapping[str, type], return_type: A "tile_ifft_dx", input_types={"inout": Tile}, value_func=tile_fft_generic_value_func, - lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction='inverse'), + lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction="inverse"), variadic=True, - doc="Compute the inverse FFT along the second dimension of a 2D tile of data.", + doc="Compute the inverse FFT along the second dimension of a 2D tile of data.", group="Tile Primitives", export=False, namespace="", diff --git a/warp/codegen.py b/warp/codegen.py index fc2da1f6..f9a47f25 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -27,6 +27,7 @@ # of current compile options (block_dim) etc options = {} + class WarpCodegenError(RuntimeError): def __init__(self, message): super().__init__(message) @@ -1278,7 +1279,9 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): # for example by checking whether an argument corresponds to # a literal value or references a variable. 
if func.lto_dispatch_func is not None: - func_args, template_args, ltoirs = func.lto_dispatch_func(func.input_types, return_type, bound_args, options=adj.builder_options) + func_args, template_args, ltoirs = func.lto_dispatch_func( + func.input_types, return_type, bound_args, options=adj.builder_options + ) adj.ltoirs.extend(ltoirs) elif func.dispatch_func is not None: func_args, template_args = func.dispatch_func(func.input_types, return_type, bound_args) @@ -1335,7 +1338,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): replay_call = forward_call if func.custom_replay_func is not None: replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" - + else: # handle multiple value functions @@ -1347,7 +1350,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): ) replay_call = forward_call - if func.skip_replay: adj.add_forward(forward_call, replay="// " + replay_call) else: @@ -1360,7 +1362,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): adj_args = tuple(strip_reference(x) for x in func_args) reverse_has_output_args = ( func.require_original_output_arg or len(output_list) > 1 - ) and func.custom_grad_func is None + ) and func.custom_grad_func is None arg_str = adj.format_reverse_call_args( fwd_args, adj_args, @@ -3094,7 +3096,6 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"): lines += ["// primal vars\n"] for var in adj.variables: - # do not predeclare vars with auto type if var.ctype() == "auto": continue @@ -3136,11 +3137,10 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): lines += ["// primal vars\n"] for var in adj.variables: - if is_tile(var.type): lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit()};\n"] elif var.constant is None: - lines += [f"{var.ctype()} {var.emit()};\n"] + lines += [f"{var.ctype()} {var.emit()};\n"] else: lines += [f"const {var.ctype()} {var.emit()} = {constant_str(var.constant)};\n"] @@ -3151,7 +3151,7 @@ def codegen_func_reverse(adj, func_type="kernel", device="cpu"): for var in adj.variables: name = var.emit_adj() ctype = var.ctype(value_type=True) - + if is_tile(var.type): lines += [f"{ctype} {name} = {var.type.cinit(adjoint=True)};\n"] else: diff --git a/warp/context.py b/warp/context.py index a28d3f5a..9f0617b1 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1751,7 +1751,7 @@ def __init__(self, name, loader): "fast_math": False, "cuda_output": None, # supported values: "ptx", "cubin", or None (automatic) "mode": warp.config.mode, - "block_dim": 0 + "block_dim": 0, } # Module dependencies are determined by scanning each function @@ -1888,7 +1888,7 @@ def load(self, device, block_dim=None) -> ModuleExec: # re-compile module if tile size (blockdim) changes # todo: it would be better to have a method such as `module.get_kernel(block_dim=N)` # that can return a single kernel instance with a given block size - if block_dim != None: + if block_dim is not None: if self.options["block_dim"] != block_dim: self.unload() self.options["block_dim"] = block_dim @@ -3220,6 +3220,8 @@ def __init__(self): self.core.is_cuda_compatibility_enabled.restype = ctypes.c_int self.core.is_cutlass_enabled.argtypes = None self.core.is_cutlass_enabled.restype = ctypes.c_int + self.core.is_mathdx_enabled.argtypes = None + self.core.is_mathdx_enabled.restype = ctypes.c_int self.core.cuda_driver_version.argtypes = None self.core.cuda_driver_version.restype = ctypes.c_int @@ -3344,52 +3346,52 @@ def 
__init__(self): self.core.cuda_graph_destroy.restype = ctypes.c_bool self.core.cuda_compile_program.argtypes = [ - ctypes.c_char_p, # cuda_src - ctypes.c_int, # arch - ctypes.c_char_p, # include_dir - ctypes.c_int, # num_cuda_include_dirs - ctypes.POINTER(ctypes.c_char_p), # cuda include dirs - ctypes.c_bool, # debug - ctypes.c_bool, # verbose - ctypes.c_bool, # verify_fp - ctypes.c_bool, # fast_math - ctypes.c_char_p, # output_path - ctypes.c_size_t, # num_ltoirs - ctypes.POINTER(ctypes.c_char_p), # ltoirs - ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes + ctypes.c_char_p, # cuda_src + ctypes.c_int, # arch + ctypes.c_char_p, # include_dir + ctypes.c_int, # num_cuda_include_dirs + ctypes.POINTER(ctypes.c_char_p), # cuda include dirs + ctypes.c_bool, # debug + ctypes.c_bool, # verbose + ctypes.c_bool, # verify_fp + ctypes.c_bool, # fast_math + ctypes.c_char_p, # output_path + ctypes.c_size_t, # num_ltoirs + ctypes.POINTER(ctypes.c_char_p), # ltoirs + ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes ] self.core.cuda_compile_program.restype = ctypes.c_size_t self.core.cuda_compile_fft.argtypes = [ - ctypes.c_char_p, # lto - ctypes.c_char_p, # function name - ctypes.c_int, # num include dirs - ctypes.POINTER(ctypes.c_char_p), # include dirs - ctypes.c_char_p, # mathdx include dir - ctypes.c_int, # arch - ctypes.c_int, # size - ctypes.c_int, # ept - ctypes.c_int, # direction - ctypes.c_int, # precision - ctypes.POINTER(ctypes.c_int) # smem (out) + ctypes.c_char_p, # lto + ctypes.c_char_p, # function name + ctypes.c_int, # num include dirs + ctypes.POINTER(ctypes.c_char_p), # include dirs + ctypes.c_char_p, # mathdx include dir + ctypes.c_int, # arch + ctypes.c_int, # size + ctypes.c_int, # ept + ctypes.c_int, # direction + ctypes.c_int, # precision + ctypes.POINTER(ctypes.c_int), # smem (out) ] self.core.cuda_compile_fft.restype = ctypes.c_bool self.core.cuda_compile_dot.argtypes = [ - ctypes.c_char_p, # lto - ctypes.c_char_p, # function name - ctypes.c_int, # num include dirs - ctypes.POINTER(ctypes.c_char_p), # include dirs - ctypes.c_char_p, # mathdx include dir - ctypes.c_int, # arch - ctypes.c_int, # M - ctypes.c_int, # N - ctypes.c_int, # K - ctypes.c_int, # precision - ctypes.c_int, # type - ctypes.c_int, # tA - ctypes.c_int, # tB - ctypes.c_int # num threads + ctypes.c_char_p, # lto + ctypes.c_char_p, # function name + ctypes.c_int, # num include dirs + ctypes.POINTER(ctypes.c_char_p), # include dirs + ctypes.c_char_p, # mathdx include dir + ctypes.c_int, # arch + ctypes.c_int, # M + ctypes.c_int, # N + ctypes.c_int, # K + ctypes.c_int, # precision + ctypes.c_int, # type + ctypes.c_int, # tA + ctypes.c_int, # tB + ctypes.c_int, # num threads ] self.core.cuda_compile_dot.restype = ctypes.c_bool @@ -4845,7 +4847,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False): # represents all data required for a kernel launch # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)` class Launch: - def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0): + def __init__( + self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0, block_dim=256 + ): # retain the module executable so it doesn't get unloaded self.module_exec = kernel.module.load(device) if not self.module_exec: @@ -4884,6 +4888,7 @@ def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bo self.device = device self.bounds = bounds self.max_blocks = max_blocks + self.block_dim = block_dim 
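
For orientation on how the new `block_dim` plumbing on `Launch` is meant to be driven from Python: per the comment above, `wp.launch(..., record_cmd=True)` returns a `Launch` object, and `Launch.launch()` passes the stored `block_dim` through to `cuda_launch_kernel` in the hunk just below. A minimal sketch, assuming a tile kernel `k` and device arrays `a`, `b` already exist, and assuming the `block_dim` keyword is captured by the recorded launch (the 64-thread value is illustrative, not part of this change):

TILE_DIM = 64  # assumed per-tile thread count

# record the launch once; record_cmd=True returns a Launch instead of dispatching
cmd = wp.launch(
    k,
    dim=(4, 4, TILE_DIM),   # trailing dimension supplies the per-tile threads
    inputs=[a, b],
    block_dim=TILE_DIM,     # assumed to land in Launch.block_dim (default is 256)
    record_cmd=True,
)

# replay repeatedly without re-packing kernel parameters
for _ in range(10):
    cmd.launch()
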
def set_dim(self, dim): self.bounds = warp.types.launch_bounds_t(dim) @@ -4965,6 +4970,7 @@ def launch(self, stream=None) -> Any: self.hooks.forward, self.bounds.size, self.max_blocks, + self.block_dim, self.params_addr, stream.cuda_stream, ) @@ -5113,7 +5119,13 @@ def pack_args(args, params, adjoint=False): ) runtime.core.cuda_launch_kernel( - device.context, hooks.backward, bounds.size, max_blocks, block_dim, kernel_params, stream.cuda_stream + device.context, + hooks.backward, + bounds.size, + max_blocks, + block_dim, + kernel_params, + stream.cuda_stream, ) else: @@ -5136,7 +5148,13 @@ def pack_args(args, params, adjoint=False): else: # launch runtime.core.cuda_launch_kernel( - device.context, hooks.forward, bounds.size, max_blocks, block_dim, kernel_params, stream.cuda_stream + device.context, + hooks.forward, + bounds.size, + max_blocks, + block_dim, + kernel_params, + stream.cuda_stream, ) try: @@ -5150,7 +5168,9 @@ def pack_args(args, params, adjoint=False): # record file, lineno, func as metadata frame = inspect.currentframe().f_back caller = {"file": frame.f_code.co_filename, "lineno": frame.f_lineno, "func": frame.f_code.co_name} - runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device, block_dim, metadata={"caller": caller}) + runtime.tape.record_launch( + kernel, dim, max_blocks, inputs, outputs, device, block_dim, metadata={"caller": caller} + ) # detect illegal inter-kernel read/write access patterns if verification flag is set if warp.config.verify_autograd_array_access: @@ -5724,7 +5744,7 @@ def type_str(t): return f"{t.__name__}[{args_repr}]" elif warp.types.is_tile(t): return "Tile" - + return t.__name__ diff --git a/warp/examples/benchmarks/benchmark_tile.py b/warp/examples/benchmarks/benchmark_tile.py index fc5900fe..54fec3f9 100644 --- a/warp/examples/benchmarks/benchmark_tile.py +++ b/warp/examples/benchmarks/benchmark_tile.py @@ -1,53 +1,47 @@ import numpy as np -import warp as wp - import torch +import warp as wp + wp.init() wp.set_module_options({"enable_backward": False, "fast_math": True}) wp.set_device("cuda:0") wp.build.clear_kernel_cache() -@wp.kernel -def gemm(A: wp.array2d(dtype=float), - B: wp.array2d(dtype=float), - C: wp.array2d(dtype=float)): +@wp.kernel +def gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): # output index i, j = wp.tid() sum = float(0.0) for k in range(0, A.shape[1]): - sum += A[i, k]*B[k, j] + sum += A[i, k] * B[k, j] C[i, j] = sum - TILE_M = wp.constant(64) TILE_N = wp.constant(64) TILE_K = wp.constant(8) -@wp.kernel -def gemm_tiled(A: wp.array2d(dtype=float), - B: wp.array2d(dtype=float), - C: wp.array2d(dtype=float)): +@wp.kernel +def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): # output tile index i, j = wp.tid() sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) - M = A.shape[0] - N = B.shape[1] + _M = A.shape[0] + _N = B.shape[1] K = A.shape[1] - count = int(K / 8) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) + count = int(K / 8) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K) for k in range(count): - a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) @@ -58,24 +52,21 @@ def gemm_tiled(A: wp.array2d(dtype=float), def benchmark_numpy(A, B, C): - timers = {} iters = 10 # warm up - for i in range(10): - C = A@B + for _i in range(10): + _C = A @ B with wp.ScopedTimer("NumPy", 
dict=timers): - - for i in range(iters): - C = A@B + for _i in range(iters): + _C = A @ B return min(timers["NumPy"]) def benchmark_warp_simt(A, B, C): - timers = {} iters = 10 @@ -84,19 +75,17 @@ def benchmark_warp_simt(A, B, C): C_wp = wp.array(C) # warm up - for i in range(10): + for _i in range(10): wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) with wp.ScopedTimer("Warp (SIMT)", dict=timers, print=False, synchronize=True): - - for i in range(iters): + for _i in range(iters): wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp]) return min(timers["Warp (SIMT)"]) def benchmark_warp_tiled(A, B, C): - timers = {} iters = 10 @@ -104,8 +93,7 @@ def benchmark_warp_tiled(A, B, C): SUB_TILE_M = 4 SUB_TILE_N = 4 - num_threads = int(TILE_M/SUB_TILE_M)*int(TILE_N/SUB_TILE_N); - + num_threads = int(TILE_M / SUB_TILE_M) * int(TILE_N / SUB_TILE_N) A_wp = wp.array(A) B_wp = wp.array(B) C_wp = wp.array(C) @@ -113,48 +101,42 @@ def benchmark_warp_tiled(A, B, C): # warm up wp.capture_begin() - for i in range(10): - wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) + for _i in range(iters): + wp.launch(gemm_tiled, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) graph = wp.capture_end() - with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True): - - #for i in range(iters): + # for i in range(iters): # wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads) wp.capture_launch(graph) - return min(timers["Warp (Tiled)"]) def benchmark_torch(A, B, C): - A_tc = torch.from_numpy(A).to("cuda:0") B_tc = torch.from_numpy(B).to("cuda:0") C_tc = torch.from_numpy(C).to("cuda:0") # warm-up - for i in range(10): + for _i in range(10): torch.matmul(A_tc, B_tc, out=C_tc) timers = {} iters = 10 - + torch.cuda.synchronize() with wp.ScopedTimer("Torch", dict=timers, print=False): - - for i in range(iters): - torch.matmul(A_tc, B_tc)#, out=C_tc) + for _i in range(iters): + torch.matmul(A_tc, B_tc) # , out=C_tc) torch.cuda.synchronize() return min(timers["Torch"]) - results_torch = [] results_warp_simt = [] results_warp_tiled = [] @@ -163,10 +145,9 @@ def benchmark_torch(A, B, C): print("--------------------------------------------------------") for i in range(2, 33): + # for i in range(8,9): -#for i in range(8,9): - - M = i*128 + M = i * 128 N = M K = N @@ -181,13 +162,11 @@ def benchmark_torch(A, B, C): C = np.zeros((M, N), dtype=np.float32) results_torch.append(benchmark_torch(A, B, C)) - results_warp_simt.append(0.0)#benchmark_warp_simt(A, B, C)) + results_warp_simt.append(0.0) # benchmark_warp_simt(A, B, C)) results_warp_tiled.append(benchmark_warp_tiled(A, B, C)) - print("{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}".format(M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1])) - - - - - - + print( + "{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}".format( + M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1] + ) + ) diff --git a/warp/mathdx.py b/warp/mathdx.py new file mode 100644 index 00000000..dab9fbc8 --- /dev/null +++ b/warp/mathdx.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import ctypes +import os +import platform +import re +import sys +import warnings +from importlib.metadata import PackageNotFoundError, files + +CUDA_HOME = None +MATHDX_HOME = None +CUTLASS_HOME = None + + +PLATFORM_LINUX = sys.platform.startswith("linux") +PLATFORM_WIN = sys.platform.startswith("win32") + + +def _conda_get_target_name(): + if PLATFORM_LINUX: + plat = platform.processor() + if plat == "aarch64": + return "sbsa-linux" + else: + return f"{plat}-linux" + elif PLATFORM_WIN: + return "x64" + else: + raise AssertionError + + +def _check_cuda_home(): + # We need some CUDA headers for compiling mathDx headers. + # We assume users properly managing their local envs (ex: no mix-n-match). + global CUDA_HOME + + # Try wheel + try: + # We need CUDA 12+ for device API support + cudart = files("nvidia-cuda-runtime-cu12") + cccl = files("nvidia-cuda-cccl-cu12") + # use cuda_fp16.h (which we need) as a proxy + cudart = [f for f in cudart if "cuda_fp16.h" in str(f)][0] + cudart = os.path.join(os.path.dirname(cudart.locate()), "..") + # use cuda/std/type_traits as a proxy + cccl = min([f for f in cccl if re.match(".*cuda\\/std\\/type_traits.*", str(f))], key=lambda x: len(str(x))) + cccl = os.path.join(os.path.dirname(cccl.locate()), "../../..") + except PackageNotFoundError: + pass + except ValueError: + # cccl wheel is buggy (headers missing), skip using wheels + pass + else: + CUDA_HOME = (cudart, cccl) + return + + # Try conda + if "CONDA_PREFIX" in os.environ: + if PLATFORM_LINUX: + conda_include = os.path.join( + os.environ["CONDA_PREFIX"], "targets", f"{_conda_get_target_name()}", "include" + ) + elif PLATFORM_WIN: + conda_include = os.path.join(os.environ["CONDA_PREFIX"], "Library", "include") + else: + assert AssertionError + if os.path.isfile(os.path.join(conda_include, "cuda_fp16.h")) and os.path.isfile( + os.path.join(conda_include, "cuda/std/type_traits") + ): + CUDA_HOME = (os.path.join(conda_include, ".."),) + return + + # Try local + CUDA_PATH = os.environ.get("CUDA_PATH", None) + CUDA_HOME = os.environ.get("CUDA_HOME", None) + if CUDA_PATH is None and CUDA_HOME is None: + raise RuntimeError( + "cudart headers not found. Depending on how you install nvmath-python and other CUDA packages,\n" + "you may need to perform one of the steps below:\n" + " - conda install -c conda-forge cuda-cudart-dev cuda-cccl cuda-version=12\n" + " - export CUDA_HOME=/path/to/CUDA/Toolkit" + ) + elif CUDA_PATH is not None and CUDA_HOME is None: + CUDA_HOME = CUDA_PATH + elif CUDA_PATH is not None and CUDA_HOME is not None: + if CUDA_HOME != CUDA_PATH: + warnings.warn( + "Both CUDA_HOME and CUDA_PATH are set but not consistent. 
" "Ignoring CUDA_PATH...", stacklevel=2 + ) + CUDA_HOME = (CUDA_HOME,) + + +def _check_mathdx_home(): + # Find mathDx headers + global MATHDX_HOME + + # Try wheel + try: + MATHDX_HOME = files("nvidia-mathdx") + except PackageNotFoundError: + pass + else: + # use cufftdx.hpp as a proxy + MATHDX_HOME = [f for f in MATHDX_HOME if "cufftdx.hpp" in str(f)][0] + MATHDX_HOME = os.path.join(os.path.dirname(MATHDX_HOME.locate()), "..") + return + + # Try conda + if "CONDA_PREFIX" in os.environ: + if PLATFORM_LINUX: + conda_include = os.path.join(os.environ["CONDA_PREFIX"], "include") + elif PLATFORM_WIN: + conda_include = os.path.join(os.environ["CONDA_PREFIX"], "Library", "include") + if os.path.isfile(os.path.join(conda_include, "cufftdx.hpp")): + MATHDX_HOME = os.path.join(conda_include, "..") + return + + # Try local + if "MATHDX_HOME" not in os.environ: + raise RuntimeError( + "mathDx headers not found. Depending on how you install nvmath-python and other CUDA packages, " + "you may need to perform one of the steps below:\n" + " - pip install nvidia-mathdx\n" + " - conda install -c conda-forge mathdx\n" + " - export MATHDX_HOME=/path/to/mathdx" + ) + else: + MATHDX_HOME = os.environ["MATHDX_HOME"] + + +def get_mathdx_include_dirs(): + _check_mathdx_home() + + global MATHDX_HOME + return (MATHDX_HOME + "/include").encode("utf-8") + + +def get_cuda_include_dirs(): + _check_cuda_home() + + global CUDA_HOME + include_dirs = [(f"{h}" + "/include").encode("utf-8") for h in CUDA_HOME] + arr_include_dirs = (ctypes.c_char_p * len(include_dirs))() + arr_include_dirs[:] = include_dirs + return arr_include_dirs diff --git a/warp/native/mathdx.cpp b/warp/native/mathdx.cpp new file mode 100644 index 00000000..1dca0afa --- /dev/null +++ b/warp/native/mathdx.cpp @@ -0,0 +1,56 @@ +/** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + +#include "builtin.h" + +// stubs for platforms where there is no CUDA +#if !WP_ENABLE_CUDA || !WP_ENABLE_MATHDX + +extern "C" +{ + +WP_API +bool cuda_compile_fft( + const char* ltoir_output_path, + const char* symbol_name, int num_include_dirs, + const char** include_dirs, + const char* mathdx_include_dir, + int arch, + int size, + int elements_per_thread, + int direction, + int precision, + int* shared_memory_size) +{ + printf("CUDA is disabled and/or Warp was not compiled with MathDx support.\n"); + return false; +} + +WP_API bool cuda_compile_dot( + const char* ltoir_output_path, + const char* symbol_name, + int num_include_dirs, + const char** include_dirs, + const char* mathdx_include_dir, + int arch, + int M, + int N, + int K, + int precision, + int type, + int tA, + int tB, + int num_threads) +{ + printf("CUDA is disabled and/or Warp was not compiled with MathDx support.\n"); + return false; +} + +} // extern "C" + +#endif // !WP_ENABLE_CUDA || !WP_ENABLE_MATHDX diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index 1ca668d3..3aa3dbe7 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -2,9 +2,6 @@ #include "builtin.h" -// todo: requires CTK, replace with inline ptx -#include "cuda_pipeline_primitives.h" - #define USE_CUTE 0 #if USE_CUTE @@ -332,4 +329,4 @@ void adj_tile_matmul(TileA& a, TileB& b, TileC& c, -} // namespace wp \ No newline at end of file +} // namespace wp diff --git a/warp/native/warp.cpp b/warp/native/warp.cpp index ed3efbc4..2fd64562 100644 --- a/warp/native/warp.cpp +++ b/warp/native/warp.cpp @@ -147,6 +147,11 @@ int is_cutlass_enabled() return int(WP_ENABLE_CUTLASS); } +int is_mathdx_enabled() +{ + return int(WP_ENABLE_MATHDX); +} + int is_debug_enabled() { return int(WP_ENABLE_DEBUG); @@ -1038,7 +1043,7 @@ WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* i WP_API void* cuda_load_module(void* context, const char* ptx) { return NULL; } WP_API void cuda_unload_module(void* context, void* module) {} WP_API void* cuda_get_kernel(void* context, void* module, const char* name) { return NULL; } -WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args, void* stream) { return 0; } +WP_API size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, int tile_size, void** args, void* stream) { return 0; } WP_API void cuda_set_context_restore_policy(bool always_restore) {} WP_API int cuda_get_context_restore_policy() { return false; } diff --git a/warp/native/warp.cu b/warp/native/warp.cu index b2554ed4..76f7b97f 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -16,7 +16,9 @@ #include #include #include -#include +#if WP_ENABLE_MATHDX + #include +#endif #include #include @@ -129,26 +131,6 @@ bool check_nvjitlink_result(nvJitLinkHandle handle, nvJitLinkResult result, cons } } -bool check_cufftdx_result(commonDxStatusType result, const char* file, int line) -{ - if (result != commonDxStatusType::COMMONDX_SUCCESS) { - fprintf(stderr, "libmathdx cuFFTDx error: %d on %s:%d\n", (int)result, file, line); - return false; - } else { - return true; - } -} - -bool check_cublasdx_result(commonDxStatusType result, const char* file, int line) -{ - if (result != commonDxStatusType::COMMONDX_SUCCESS) { - fprintf(stderr, "libmathdx cuBLASDx error: %d on %s:%d\n", (int)result, file, line); - return false; - } else { - return true; - } -} - bool check_generic(int result, const char* file, int line) { if (!result) { @@ -2628,104 +2610,6 @@ bool 
write_file(const char* data, size_t size, std::string filename, const char* } } -bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size) -{ - - CHECK_ANY(ltoir_output_path != nullptr); - CHECK_ANY(symbol_name != nullptr); - CHECK_ANY(mathdx_include_dir != nullptr); - CHECK_ANY(shared_memory_size != nullptr); - CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); - - bool res = true; - cufftdxHandle h; - CHECK_CUFFTDX(cufftDxCreate(&h)); - - // CUFFTDX_API_BLOCK_LMEM means each thread starts with a subset of the data - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_API, cufftDxApi::CUFFTDX_API_BLOCK_LMEM)); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SIZE, (long long)size)); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_DIRECTION, (cufftDxDirection)direction)); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SM, (long long)(arch * 10))); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_ELEMENTS_PER_THREAD, (long long)(elements_per_thread))); - CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_FFTS_PER_BLOCK, 1)); - - CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); - for(int dir = 0; dir < num_include_dirs; dir++) - { - CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); - } - CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); - - size_t lto_size = 0; - CHECK_CUFFTDX(cufftDxGetLTOIRSize(h, <o_size)); - - std::vector lto(lto_size); - CHECK_CUFFTDX(cufftDxGetLTOIR(h, lto.size(), lto.data())); - - long long int smem = 0; - CHECK_CUFFTDX(cufftDxGetTraitInt64(h, cufftDxTraitType::CUFFTDX_TRAIT_SHARED_MEMORY_SIZE, &smem)); - *shared_memory_size = (int)smem; - - if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) { - res = false; - } - - CHECK_CUFFTDX(cufftDxDestroy(h)); - - return res; -} - -bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads) -{ - - CHECK_ANY(ltoir_output_path != nullptr); - CHECK_ANY(symbol_name != nullptr); - CHECK_ANY(mathdx_include_dir != nullptr); - CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); - - bool res = true; - cublasdxHandle h; - CHECK_CUBLASDX(cublasDxCreate(&h)); - - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_FUNCTION, cublasDxFunction::CUBLASDX_FUNCTION_MM)); - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_API, cublasDxApi::CUBLASDX_API_BLOCK_SMEM)); - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, 
cublasDxOperatorType::CUBLASDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SM, (long long)(arch * 10))); - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TYPE, (cublasDxType)type)); - std::array block_dim = {num_threads, 1, 1}; - CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data())); - std::array size = {M, N, K}; - CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SIZE, size.size(), size.data())); - std::array transpose_mode = {(cublasDxTransposeMode_t)tA, (cublasDxTransposeMode_t)tB}; - CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TRANSPOSE_MODE, transpose_mode.size(), transpose_mode.data())); - - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); - for(int dir = 0; dir < num_include_dirs; dir++) - { - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); - } - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/cublasdx/include").c_str())); - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/../external/cutlass/include").c_str())); - - size_t lto_size = 0; - CHECK_CUBLASDX(cublasDxGetLTOIRSize(h, <o_size)); - - std::vector lto(lto_size); - CHECK_CUBLASDX(cublasDxGetLTOIR(h, lto.size(), lto.data())); - - if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) { - res = false; - } - - CHECK_CUBLASDX(cublasDxDestroy(h)); - - return res; -} - size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes) { // use file extension to determine whether to output PTX or CUBIN @@ -2971,6 +2855,126 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ return res; } +#if WP_ENABLE_MATHDX + bool check_cufftdx_result(commonDxStatusType result, const char* file, int line) + { + if (result != commonDxStatusType::COMMONDX_SUCCESS) { + fprintf(stderr, "libmathdx cuFFTDx error: %d on %s:%d\n", (int)result, file, line); + return false; + } else { + return true; + } + } + + bool check_cublasdx_result(commonDxStatusType result, const char* file, int line) + { + if (result != commonDxStatusType::COMMONDX_SUCCESS) { + fprintf(stderr, "libmathdx cuBLASDx error: %d on %s:%d\n", (int)result, file, line); + return false; + } else { + return true; + } + } + + bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size) + { + + CHECK_ANY(ltoir_output_path != nullptr); + CHECK_ANY(symbol_name != nullptr); + CHECK_ANY(mathdx_include_dir != nullptr); + CHECK_ANY(shared_memory_size != nullptr); + CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); + + bool res = true; + cufftdxHandle h; + CHECK_CUFFTDX(cufftDxCreate(&h)); + + // CUFFTDX_API_BLOCK_LMEM 
means each thread starts with a subset of the data + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_API, cufftDxApi::CUFFTDX_API_BLOCK_LMEM)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SIZE, (long long)size)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_DIRECTION, (cufftDxDirection)direction)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_SM, (long long)(arch * 10))); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_ELEMENTS_PER_THREAD, (long long)(elements_per_thread))); + CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_FFTS_PER_BLOCK, 1)); + + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); + for(int dir = 0; dir < num_include_dirs; dir++) + { + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); + } + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); + + size_t lto_size = 0; + CHECK_CUFFTDX(cufftDxGetLTOIRSize(h, <o_size)); + + std::vector lto(lto_size); + CHECK_CUFFTDX(cufftDxGetLTOIR(h, lto.size(), lto.data())); + + long long int smem = 0; + CHECK_CUFFTDX(cufftDxGetTraitInt64(h, cufftDxTraitType::CUFFTDX_TRAIT_SHARED_MEMORY_SIZE, &smem)); + *shared_memory_size = (int)smem; + + if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) { + res = false; + } + + CHECK_CUFFTDX(cufftDxDestroy(h)); + + return res; + } + + bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads) + { + + CHECK_ANY(ltoir_output_path != nullptr); + CHECK_ANY(symbol_name != nullptr); + CHECK_ANY(mathdx_include_dir != nullptr); + CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); + + bool res = true; + cublasdxHandle h; + CHECK_CUBLASDX(cublasDxCreate(&h)); + + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_FUNCTION, cublasDxFunction::CUBLASDX_FUNCTION_MM)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_API, cublasDxApi::CUBLASDX_API_BLOCK_SMEM)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SM, (long long)(arch * 10))); + CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TYPE, (cublasDxType)type)); + std::array block_dim = {num_threads, 1, 1}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data())); + std::array size = {M, N, K}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SIZE, size.size(), size.data())); + std::array transpose_mode = {(cublasDxTransposeMode_t)tA, 
(cublasDxTransposeMode_t)tB}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TRANSPOSE_MODE, transpose_mode.size(), transpose_mode.data())); + + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); + for(int dir = 0; dir < num_include_dirs; dir++) + { + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); + } + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/cublasdx/include").c_str())); + CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/../external/cutlass/include").c_str())); + + size_t lto_size = 0; + CHECK_CUBLASDX(cublasDxGetLTOIRSize(h, <o_size)); + + std::vector lto(lto_size); + CHECK_CUBLASDX(cublasDxGetLTOIR(h, lto.size(), lto.data())); + + if(!write_file(lto.data(), lto.size(), ltoir_output_path, "wb")) { + res = false; + } + + CHECK_CUBLASDX(cublasDxDestroy(h)); + + return res; + } +#endif + void* cuda_load_module(void* context, const char* path) { ContextGuard guard(context); @@ -3253,7 +3257,6 @@ void cuda_timing_end(timing_result_t* results, int size) g_cuda_timing_state = parent_state; } - // impl. files #include "bvh.cu" #include "mesh.cu" diff --git a/warp/native/warp.h b/warp/native/warp.h index 1a90e0d6..045d5f0a 100644 --- a/warp/native/warp.h +++ b/warp/native/warp.h @@ -34,6 +34,8 @@ extern "C" WP_API int is_cuda_compatibility_enabled(); // whether Warp was compiled with CUTLASS support WP_API int is_cutlass_enabled(); + // whether Warp was compiled with MathDx support + WP_API int is_mathdx_enabled(); // whether Warp was compiled with debug support WP_API int is_debug_enabled(); @@ -315,9 +317,9 @@ extern "C" WP_API bool cuda_graph_launch(void* graph, void* stream); WP_API bool cuda_graph_destroy(void* context, void* graph); + WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size); WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads); - WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); WP_API void* cuda_load_module(void* context, const char* ptx); WP_API void cuda_unload_module(void* context, void* module); diff --git a/warp/stubs.py b/warp/stubs.py index f9d7be6b..1a41fd5f 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -886,6 +886,66 @@ def spatial_mass( ... 
+@over +def tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile: + """Allocate a tile local block of zero'd memory""" + ... + + +@over +def tile_load(a: Array[Any], x: int32, y: int32, m: int32, n: int32) -> Tile: + """Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)""" + ... + + +@over +def tile_store(a: Array[Any], x: int32, y: int32, t: Any): + """Store tile `t` to an array `a` at offset `(i=x*m, j=y*n)`""" + ... + + +@over +def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile: + """Atomically add a tile `t` worth of data to array `a` at offset `(i=x*m, j=y*n)`""" + ... + + +@over +def tile(x: Any) -> Tile: + """Construct a Tile from a per-thread kernel value, returns a tile with dimensions of `(1, block_dim)` where block_dim is the number of threads specified in `wp.launch()`""" + ... + + +@over +def tile_extract(a: Tile, i: int32, j: int32): + """Extract element at index (i, j) of the tile and return the native type""" + ... + + +@over +def tile_matmul(a: Tile, b: Tile, out: Tile): + """Compute matrix product and accumulate out += a*b.""" + ... + + +@over +def tile_sum(a: Tile): + """Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored""" + ... + + +@over +def tile_map(op: Callable, a: Any): + """Map the operation onto each element of the tile""" + ... + + +@over +def tile_map(op: Callable, a: Any, b: Any): + """Map the operation onto each element of the tile""" + ... + + @over def mlp( weights: Array[float32], @@ -2083,6 +2143,12 @@ def add(a: Transformation[Scalar], b: Transformation[Scalar]) -> Transformation[ ... +@over +def add(a: Tile, b: Tile): + """Add each element of two tiles together""" + ... + + @over def sub(a: Scalar, b: Scalar) -> Scalar: """ """ @@ -2233,6 +2299,18 @@ def mul(a: Transformation[Scalar], b: Scalar) -> Transformation[Scalar]: ... +@over +def mul(x: Tile, y: Scalar) -> Tile: + """Multiply each element of a tile by a scalar""" + ... + + +@over +def mul(x: Scalar, y: Tile) -> Tile: + """Multiply each element of a tile by a scalar""" + ... + + @over def mod(a: Scalar, b: Scalar) -> Scalar: """Modulo operation using truncated division.""" @@ -2341,6 +2419,12 @@ def neg(x: Matrix[Any, Any, Scalar]) -> Matrix[Any, Any, Scalar]: ... +@over +def neg(x: Tile) -> Tile: + """Negate each element of a tile""" + ... + + @over def unot(a: bool) -> bool: """ """ @@ -2399,3 +2483,21 @@ def unot(a: uint64) -> bool: def unot(a: Array[Any]) -> bool: """ """ ... + + +@over +def tile_matmul_dx(a: Tile, b: Tile, out: Tile): + """Compute matrix product and accumulate out += a*b.""" + ... + + +@over +def tile_fft_dx(inout: Tile): + """Compute the FFT along the second dimension of a 2D tile of data.""" + ... + + +@over +def tile_ifft_dx(inout: Tile): + """Compute the inverse FFT along the second dimension of a 2D tile of data.""" + ... 
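
The stubs above document the per-kernel tile API added in this series; for reference, they compose as in the sketch below, which condenses the GEMM pattern used by the tests later in the series (the tile sizes, array shapes, and 64-thread block are illustrative assumptions):

import numpy as np
import warp as wp

TILE_M = wp.constant(8)
TILE_N = wp.constant(4)
TILE_K = wp.constant(8)
TILE_DIM = 64  # threads cooperating on each output tile


@wp.kernel
def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
    # output tile index; the third launch dimension carries the tile threads
    i, j, _ = wp.tid()

    acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)

    count = int(A.shape[1] / TILE_K)
    for k in range(count):
        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)  # block at offset (i*TILE_M, k*TILE_K)
        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
        wp.tile_matmul(a, b, acc)  # acc += a*b

    wp.tile_store(C, i, j, acc)


rng = np.random.default_rng(42)
A = wp.array(rng.random((16, 16), dtype=np.float32))
B = wp.array(rng.random((16, 8), dtype=np.float32))
C = wp.zeros((16, 8), dtype=float)

wp.launch(gemm_tiled, dim=(2, 2, TILE_DIM), inputs=[A, B, C], block_dim=TILE_DIM)
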
diff --git a/warp/tape.py b/warp/tape.py index 2aef1b0e..6df7c21b 100644 --- a/warp/tape.py +++ b/warp/tape.py @@ -130,7 +130,7 @@ def backward(self, loss: wp.array = None, grads: dict = None): outputs = launch[4] device = launch[5] block_dim = launch[6] - + adj_inputs = [] adj_outputs = [] @@ -152,7 +152,7 @@ def backward(self, loss: wp.array = None, grads: dict = None): device=device, adjoint=True, max_blocks=max_blocks, - block_dim=block_dim + block_dim=block_dim, ) # record a kernel launch on the tape @@ -614,7 +614,9 @@ def emit_kernel_launch_node( self.array_grad_stats.insert(0, grad_stats) -Launch = namedtuple("Launch", ["id", "kernel", "dim", "max_blocks", "inputs", "outputs", "device", "block_dim", "metadata"]) +Launch = namedtuple( + "Launch", ["id", "kernel", "dim", "max_blocks", "inputs", "outputs", "device", "block_dim", "metadata"] +) RepeatedSequence = namedtuple("RepeatedSequence", ["start", "end", "repetitions"]) @@ -645,8 +647,8 @@ def visit_tape( def get_launch_id(launch): kernel = launch[0] suffix = "" - if len(launch) > 6: - metadata = launch[6] + if len(launch) > 7: + metadata = launch[7] # calling function helps to identify unique launches if "caller" in metadata: caller = metadata["caller"] @@ -680,7 +682,8 @@ def get_launch_id(launch): inputs=launch[3], outputs=launch[4], device=launch[5], - metadata=launch[6] if len(launch) > 6 else {}, + block_dim=launch[6], + metadata=launch[7] if len(launch) > 7 else {}, ) for launch in kernel_launches ] diff --git a/warp/tests/test_mat_scalar_ops.py b/warp/tests/test_mat_scalar_ops.py index 67b6c0c7..61df6c38 100644 --- a/warp/tests/test_mat_scalar_ops.py +++ b/warp/tests/test_mat_scalar_ops.py @@ -1501,7 +1501,7 @@ def test_matmat_multiplication(test, device, dtype, register_kernels=False): tol = { np.float16: 2.0e-2, np.float32: 5.0e-6, - np.float64: 1.0e-8, + np.float64: 5.0e-7, }.get(dtype, 0) wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)] diff --git a/warp/tests/test_spatial.py b/warp/tests/test_spatial.py index 4eb21a8c..7449d71c 100644 --- a/warp/tests/test_spatial.py +++ b/warp/tests/test_spatial.py @@ -1611,7 +1611,7 @@ def test_spatial_matmat_multiplication(test, device, dtype, register_kernels=Fal tol = { np.float16: 2.0e-2, np.float32: 5.0e-6, - np.float64: 1.0e-8, + np.float64: 5.0e-7, }.get(dtype, 0) wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)] diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index d95b1f6d..ed47b4a3 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -1,14 +1,16 @@ -import numpy as np -import warp as wp +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +import unittest -wp.init() -wp.set_module_options({"enable_backward": True}) -wp.set_device("cuda:0") -wp.set_module_options({"fast_math": True}) -#wp.config.mode = "debug" -#wp.config.verify_cuda = True +import numpy as np -wp.build.clear_kernel_cache() +import warp as wp +from warp.tests.unittest_utils import * TILE_M = wp.constant(8) TILE_N = wp.constant(4) @@ -17,118 +19,122 @@ # num threads per-tile TILE_DIM = 64 + @wp.kernel -def tile_copy(A: wp.array2d(dtype=float), - B: wp.array2d(dtype=float)): - +def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)): # tile index - i, j, _ = wp.tid() - + i, j, _ = wp.tid() + a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N) wp.tile_store(B, i, j, a) -def test_tile_copy(): - +def test_tile_copy(test, device): rng = np.random.default_rng(42) - M = TILE_M*7 - N = TILE_N*5 + M = TILE_M * 7 + N = TILE_N * 5 A = rng.random((M, N), dtype=np.float32) B = rng.random((M, N), dtype=np.float32) - A_wp = wp.array(A, requires_grad=True) - B_wp = wp.array(B, requires_grad=True) + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.array(B, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_copy, dim=[int(M/TILE_M), int(N/TILE_N), TILE_DIM], inputs=[A_wp, B_wp], block_dim=TILE_DIM) + wp.launch( + tile_copy, + dim=[int(M / TILE_M), int(N / TILE_N), TILE_DIM], + inputs=[A_wp, B_wp], + block_dim=TILE_DIM, + device=device, + ) # verify forward pass - assert(np.allclose(A, B_wp.numpy(), rtol=1.e-4)) - print("Copy forward passed") + assert_array_equal(B_wp, A_wp) # verify backward pass - B_wp.grad = wp.ones_like(B_wp) + B_wp.grad = wp.ones_like(B_wp, device=device) tape.backward() - assert(np.allclose(A_wp.grad.numpy(), B_wp.grad.numpy())) - print("Copy backward passed") + assert_array_equal(B_wp.grad, A_wp.grad) + @wp.func def unary_func(x: float): return wp.sin(x) + @wp.kernel -def tile_unary_map(input: wp.array2d(dtype=float), - output: wp.array2d(dtype=float)): - +def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # tile index - i, j, _ = wp.tid() - + i, j, _ = wp.tid() + a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N) - + sa = wp.tile_map(wp.sin, a) - - wp.tile_store(output, i, j, sa) + wp.tile_store(output, i, j, sa) -def test_tile_unary_map(): +def test_tile_unary_map(test, device): rng = np.random.default_rng(42) - M = TILE_M*7 - N = TILE_N*5 + M = TILE_M * 7 + N = TILE_N * 5 A = rng.random((M, N), dtype=np.float32) B = np.sin(A) A_grad = np.cos(A) - A_wp = wp.array(A, requires_grad=True) - B_wp = wp.zeros_like(A_wp, requires_grad=True) + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.zeros_like(A_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_unary_map, dim=[int(M/TILE_M), int(N/TILE_N), TILE_DIM], inputs=[A_wp, B_wp], block_dim=TILE_DIM) + wp.launch( + tile_unary_map, + dim=[int(M / TILE_M), int(N / TILE_N), TILE_DIM], + inputs=[A_wp, B_wp], + block_dim=TILE_DIM, + device=device, + ) # verify forward pass - assert(np.allclose(B, B_wp.numpy(), atol=1.e-4)) - print("Unary map forward passed") + assert_np_equal(B_wp.numpy(), B, tol=1.0e-4) # verify backward pass - B_wp.grad = wp.ones_like(B_wp) + B_wp.grad = wp.ones_like(B_wp, device=device) tape.backward() - assert(np.allclose(A_wp.grad.numpy(), A_grad)) - print("Unary map backward passed") + assert_np_equal(A_wp.grad.numpy(), A_grad, tol=1.0e-6) @wp.func def binary_func(x: float, y: float): return wp.sin(x) + y + @wp.kernel -def tile_binary_map(input_a: 
wp.array2d(dtype=float), - input_b: wp.array2d(dtype=float), - output: wp.array2d(dtype=float)): - +def tile_binary_map( + input_a: wp.array2d(dtype=float), input_b: wp.array2d(dtype=float), output: wp.array2d(dtype=float) +): # tile index - i, j, _= wp.tid() - + i, j, _ = wp.tid() + a = wp.tile_load(input_a, i, j, m=TILE_M, n=TILE_N) b = wp.tile_load(input_b, i, j, m=TILE_M, n=TILE_N) - + sa = wp.tile_map(binary_func, a, b) - - wp.tile_store(output, i, j, sa) + wp.tile_store(output, i, j, sa) -def test_tile_binary_map(): +def test_tile_binary_map(test, device): rng = np.random.default_rng(42) - M = TILE_M*7 - N = TILE_N*5 + M = TILE_M * 7 + N = TILE_N * 5 A = rng.random((M, N), dtype=np.float32) B = rng.random((M, N), dtype=np.float32) @@ -137,32 +143,32 @@ def test_tile_binary_map(): A_grad = np.cos(A) B_grad = np.ones_like(B) - A_wp = wp.array(A, requires_grad=True) - B_wp = wp.array(B, requires_grad=True) - C_wp = wp.zeros_like(A_wp, requires_grad=True) + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.array(B, requires_grad=True, device=device) + C_wp = wp.zeros_like(A_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_binary_map, dim=[int(M/TILE_M), int(N/TILE_N), TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) + wp.launch( + tile_binary_map, + dim=[int(M / TILE_M), int(N / TILE_N), TILE_DIM], + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_DIM, + device=device, + ) # verify forward pass - assert(np.allclose(C, C_wp.numpy(), rtol=1.e-4)) - print("Binary map forward passed") + assert_np_equal(C_wp.numpy(), C, tol=1.0e-6) # verify backward pass - C_wp.grad = wp.ones_like(C_wp) + C_wp.grad = wp.ones_like(C_wp, device=device) tape.backward() - assert(np.allclose(A_wp.grad.numpy(), A_grad, rtol=1.e-2)) - assert(np.allclose(B_wp.grad.numpy(), B_grad, rtol=1.e-2)) - - print("Binary map backward passed") + assert_np_equal(A_wp.grad.numpy(), A_grad, tol=1.0e-6) + assert_np_equal(B_wp.grad.numpy(), B_grad) @wp.kernel -def tile_grouped_gemm(A: wp.array3d(dtype=float), - B: wp.array3d(dtype=float), - C: wp.array3d(dtype=float)): - +def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)): # output tile index i = wp.tid() @@ -176,8 +182,8 @@ def tile_grouped_gemm(A: wp.array3d(dtype=float), wp.tile_store(C[i], 0, 0, sum) -def test_tile_grouped_gemm(): - +@unittest.expectedFailure +def test_tile_grouped_gemm(test, device): batch_count = 56 M = TILE_M @@ -187,29 +193,25 @@ def test_tile_grouped_gemm(): rng = np.random.default_rng(42) A = rng.random((batch_count, M, K), dtype=np.float32) B = rng.random((batch_count, K, N), dtype=np.float32) - C = np.zeros((batch_count, M, N), dtype=np.float32) + C = A @ B - A_wp = wp.array(A, requires_grad=True) - B_wp = wp.array(B, requires_grad=True) - C_wp = wp.array(C, requires_grad=True) + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.array(B, requires_grad=True, device=device) + C_wp = wp.array(C, requires_grad=True, device=device) - with wp.Tape() as tape: - wp.launch(tile_grouped_gemm, dim=[batch_count, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) - - # bring back to host - C_host = C_wp.numpy() + with wp.Tape() as tape: + wp.launch( + tile_grouped_gemm, dim=[batch_count, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device + ) - # GEMM forward passed - print("Batched matmul forward passed") + # TODO: 32 mismatched elements + assert_np_equal(C_wp.numpy(), C) @wp.kernel -def tile_gemm(A: 
wp.array2d(dtype=float), - B: wp.array2d(dtype=float), - C: wp.array2d(dtype=float)): - +def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): # output tile index - i, j, _= wp.tid() + i, j, _ = wp.tid() sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) @@ -217,10 +219,9 @@ def tile_gemm(A: wp.array2d(dtype=float), N = B.shape[1] K = A.shape[1] - count = int(K / TILE_K) - - for k in range(0, count): + count = int(K / TILE_K) + for k in range(0, count): a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) @@ -230,66 +231,62 @@ def tile_gemm(A: wp.array2d(dtype=float), wp.tile_store(C, i, j, sum) -def test_tile_gemm(): - - M = TILE_M*7 - K = TILE_K*6 - N = TILE_N*5 +def test_tile_gemm(test, device): + M = TILE_M * 7 + K = TILE_K * 6 + N = TILE_N * 5 rng = np.random.default_rng(42) A = rng.random((M, K), dtype=np.float32) B = rng.random((K, N), dtype=np.float32) C = np.zeros((M, N), dtype=np.float32) - A_wp = wp.array(A, requires_grad=True) - B_wp = wp.array(B, requires_grad=True) - C_wp = wp.array(C, requires_grad=True) - - with wp.Tape() as tape: - wp.launch(tile_gemm, dim=(int(M/TILE_M), int(N/TILE_N), TILE_DIM), inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.array(B, requires_grad=True, device=device) + C_wp = wp.array(C, requires_grad=True, device=device) - assert(np.allclose(A@B, C_wp.numpy(), rtol=1.e-4)) + with wp.Tape() as tape: + wp.launch( + tile_gemm, + dim=(int(M / TILE_M), int(N / TILE_N), TILE_DIM), + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_DIM, + device=device, + ) - # GEMM forward passed - print("Tiled matmul forward passed") + assert_np_equal(C_wp.numpy(), A @ B, tol=1.0e-5) adj_C = np.ones_like(C) - tape.backward(grads={C_wp: wp.array(adj_C)}) - - assert(np.allclose(adj_C@B.T, A_wp.grad.numpy(), rtol=1.e-4)) - assert(np.allclose(A.T@adj_C, B_wp.grad.numpy(), rtol=1.e-4)) - - print("Tiled matmul backward passed") + tape.backward(grads={C_wp: wp.array(adj_C, device=device)}) + assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T, tol=1.0e-5) + assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C, 1.0e-5) @wp.kernel -def tile_operators(input: wp.array3d(dtype=float), - output: wp.array3d(dtype=float)): - +def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)): # output tile index i, _ = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) - + # neg b = -a # right scalar multiply - c = b*0.5 + c = b * 0.5 # left scalar multiply - d = 0.5*c + d = 0.5 * c # add tiles e = a + d - - wp.tile_store(output[i], 0, 0, e) + wp.tile_store(output[i], 0, 0, e) -def test_tile_operators(): +def test_tile_operators(test, device): batch_count = 56 M = TILE_M @@ -297,41 +294,37 @@ def test_tile_operators(): rng = np.random.default_rng(42) input = rng.random((batch_count, M, N), dtype=np.float32) - output = input*0.75 + output = input * 0.75 - input_wp = wp.array(input, requires_grad=True) - output_wp = wp.zeros_like(input_wp, requires_grad=True) + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_operators, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) + wp.launch( + tile_operators, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) - assert(np.allclose(output, output_wp.numpy(), rtol=1.e-4)) - - 
print("Operators forward passed") + assert_np_equal(output_wp.numpy(), output) output_wp.grad.fill_(1.0) tape.backward() - assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.75, rtol=1.e-4)) - - print("Operators backward passed") + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.75) @wp.kernel -def tile_sum_kernel(input: wp.array3d(dtype=float), - output: wp.array(dtype=float)): - +def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index i, _ = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) - s = wp.tile_sum(a)*0.5 + s = wp.tile_sum(a) * 0.5 wp.tile_store(output, i, 0, s) -def test_tile_sum(): +def test_tile_sum(test, device): batch_count = 56 M = TILE_M @@ -340,34 +333,33 @@ def test_tile_sum(): rng = np.random.default_rng(42) input = rng.random((batch_count, M, N), dtype=np.float32) - input_wp = wp.array(input, requires_grad=True) - output_wp = wp.zeros(batch_count, requires_grad=True) + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros(batch_count, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_sum_kernel, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) + wp.launch( + tile_sum_kernel, + dim=[batch_count, TILE_DIM], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device, + ) + sum_wp = output_wp.numpy() for i in range(batch_count): - sum_np = np.sum(input[i])*0.5 - sum_wp = output_wp.numpy()[i] - - assert(np.allclose(sum_np, sum_wp, rtol=1.e-4)) - - print("Sum forward passed") + sum_np = np.sum(input[i]) * 0.5 + test.assertAlmostEqual(sum_wp[i], sum_np, places=5) output_wp.grad.fill_(1.0) tape.backward() - assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.5, rtol=1.e-4)) - - print("Sum backward passed") + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5) @wp.kernel -def tile_extract_kernel(input: wp.array2d(dtype=float), - output: wp.array2d(dtype=float)): - +def tile_extract_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # output tile index i, _ = wp.tid() @@ -377,126 +369,38 @@ def tile_extract_kernel(input: wp.array2d(dtype=float), # tile element individually for i in range(TILE_M): for j in range(TILE_N): - output[i,j] = t[i,j] + output[i, j] = t[i, j] -def test_tile_extract(): +def test_tile_extract(test, device): M = TILE_M N = TILE_N rng = np.random.default_rng(42) input = rng.random((M, N), dtype=np.float32) - input_wp = wp.array(input, requires_grad=True) - output_wp = wp.zeros_like(input_wp, requires_grad=True) + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_extract_kernel, dim=[1, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) - - assert(np.allclose(input_wp.numpy(), output_wp.numpy(), rtol=1.e-4)) + wp.launch( + tile_extract_kernel, dim=[1, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) - print("Extract forward passed") + assert_array_equal(output_wp, input_wp) output_wp.grad.fill_(1.0) tape.backward() - assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input), rtol=1.e-4)) + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input)) - print("Extract backward passed") - -@wp.kernel() -def tile_matmul_dx_kernel(ga: wp.array2d(dtype=wp.float64), - gb: wp.array2d(dtype=wp.float64), - gc: wp.array2d(dtype=wp.float64)): - i, j, _ = wp.tid() - a = 
wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K) - b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N) - c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64) - wp.tile_matmul_dx(a, b, c) - wp.tile_store(gc, i, j, c) - -def test_tile_matmul_dx(): - - rng = np.random.default_rng(42) - - A = rng.random((TILE_M, TILE_K), dtype=np.float64) - B = rng.random((TILE_K, TILE_N), dtype=np.float64) - C = np.zeros((TILE_M, TILE_N), dtype=np.float64) - - A_wp = wp.array(A, requires_grad=True) - B_wp = wp.array(B, requires_grad=True) - C_wp = wp.array(C, requires_grad=True) - - with wp.Tape() as tape: - wp.launch(tile_matmul_dx_kernel, dim=[1, 1, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM) - - # verify forward pass - assert(np.allclose(A @ B, C_wp.numpy(), rtol=1.e-4)) - - print("Matmul (Dx) forward passed") - - adj_C = np.ones_like(C) - - tape.backward(grads={C_wp: wp.array(adj_C)}) - - assert(np.allclose(adj_C@B.T, A_wp.grad.numpy(), rtol=1.e-4)) - assert(np.allclose(A.T@adj_C, B_wp.grad.numpy(), rtol=1.e-4)) - - print("Matmul (Dx) backward passed") - -N_FFT = 128 - -@wp.kernel() -def tile_fft_dx_kernel(gx: wp.array2d(dtype=wp.vec2f), - gy: wp.array2d(dtype=wp.vec2f)): - i, j, _ = wp.tid() - xy = wp.tile_load(gx, i, j, m=N_FFT, n=N_FFT) - wp.tile_fft_dx(xy) - wp.tile_store(gy, i, j, xy) - -def test_tile_fft_dx(): - - rng = np.random.default_rng(42) - - # Warp doesn't really have a complex64 type, - # so we use 2 float32 to represent a single complex64 number and then convert it to vec2f - - X = rng.random((N_FFT, 2*N_FFT), dtype=np.float32) - Y = np.zeros_like(X) - - X_wp = wp.array2d(X, requires_grad=True, dtype=wp.vec2f) - Y_wp = wp.array2d(Y, requires_grad=True, dtype=wp.vec2f) - - X_c64 = X.view(np.complex64).reshape(N_FFT, N_FFT) - Y_c64 = np.fft.fft(X_c64, axis=-1) - - with wp.Tape() as tape: - wp.launch(tile_fft_dx_kernel, dim=[1, 1, TILE_DIM], inputs=[X_wp, Y_wp], block_dim=TILE_DIM) - - Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) - assert(np.allclose(Y_c64, Y_wp_c64, rtol=1.e-4)) - - print("FFT (Dx) forward passed") - - # TODO: implement and test backward pass - -test_tile_copy() -test_tile_unary_map() -test_tile_binary_map() -test_tile_grouped_gemm() -test_tile_gemm() -test_tile_operators() -test_tile_sum() -test_tile_extract() -test_tile_matmul_dx() -test_tile_fft_dx() # #----------------------------------------- # # center of mass computation -# start = offset[i] -# end = offset[i+1] +# start = offset[i] +# end = offset[i+1] # com = wp.tile_zeros(dtype=wp.vec3, M=1) @@ -504,7 +408,7 @@ def test_tile_fft_dx(): # for i in range(start, end, N): # count = wp.min(N, end-i) - + # idx = wp.tile_load(indices, i, N, max_col=count) # p = wp.tile_load(points, idx, max_col=count) @@ -514,13 +418,12 @@ def test_tile_fft_dx(): # wp.tile_store(out[i], com) - # #------------------------------------------- # # compute deformation gradient -# i = +# i = # j = -# k = +# k = # l = # f = wp.tile(F) # generate a block size tile of feature vectors @@ -545,7 +448,7 @@ def test_tile_fft_dx(): # #---------------------------------- # # MLP with helper function for linear layers # # where shape is only partially known -# # at compile time, and the other dims +# # at compile time, and the other dims # # are inferred from the input vector # f = wp.tile(F) @@ -562,32 +465,33 @@ def test_tile_fft_dx(): # o = wp.untile(z) - # #---------------------------------- # # softmax # def softmax(z: Any): - + # e = wp.tile_map(wp.exp, z) # s = wp.tile_sum(e, dim=0) # return z/s[0] +devices = 
get_cuda_test_devices() +class TestTile(unittest.TestCase): + pass +add_function_test(TestTile, "test_tile_copy", test_tile_copy, devices=devices) +add_function_test(TestTile, "test_tile_unary_map", test_tile_unary_map, devices=devices) +add_function_test(TestTile, "test_tile_binary_map", test_tile_binary_map, devices=devices) +add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices) # FAILS +add_function_test(TestTile, "test_tile_gemm", test_tile_gemm, devices=devices) +add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices) +add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices) +add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices) - - - - - - - - - - - - +if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py new file mode 100644 index 00000000..6cf4b7c1 --- /dev/null +++ b/warp/tests/test_tile_mathdx.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import unittest + +import numpy as np + +import warp as wp +from warp.tests.unittest_utils import * + +wp.init() # For wp.context.runtime.core.is_mathdx_enabled() + +TILE_M = wp.constant(8) +TILE_N = wp.constant(4) +TILE_K = wp.constant(8) + +N_FFT = wp.constant(128) + +# num threads per-tile +TILE_DIM = 64 + + +@wp.kernel() +def tile_math_dx_matmul_kernel( + ga: wp.array2d(dtype=wp.float64), gb: wp.array2d(dtype=wp.float64), gc: wp.array2d(dtype=wp.float64) +): + i, j, _ = wp.tid() + a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K) + b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N) + c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64) + wp.tile_matmul_dx(a, b, c) + wp.tile_store(gc, i, j, c) + + +def test_tile_math_dx_matmul(test, device): + rng = np.random.default_rng(42) + + A = rng.random((TILE_M, TILE_K), dtype=np.float64) + B = rng.random((TILE_K, TILE_N), dtype=np.float64) + C = np.zeros((TILE_M, TILE_N), dtype=np.float64) + + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.array(B, requires_grad=True, device=device) + C_wp = wp.array(C, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch( + tile_math_dx_matmul_kernel, + dim=[1, 1, TILE_DIM], + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_DIM, + device=device, + ) + + # verify forward pass + assert_np_equal(C_wp.numpy(), A @ B) + + adj_C = np.ones_like(C) + + tape.backward(grads={C_wp: wp.array(adj_C, device=device)}) + + assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T) + assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C) + + +@wp.kernel() +def tile_math_dx_fft_kernel(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)): + i, j, _ = wp.tid() + xy = wp.tile_load(gx, i, j, m=N_FFT, n=N_FFT) + wp.tile_fft_dx(xy) + wp.tile_store(gy, i, j, xy) + + +def test_tile_math_dx_fft(test, device): + rng = np.random.default_rng(42) + + # Warp doesn't really have a complex64 type, + # so we use 2 float32 to represent a single complex64 number and then convert it to vec2f + + X = rng.random((N_FFT, 2 * N_FFT), dtype=np.float32) + Y = 
np.zeros_like(X) + + X_wp = wp.array2d(X, requires_grad=True, dtype=wp.vec2f, device=device) + Y_wp = wp.array2d(Y, requires_grad=True, dtype=wp.vec2f, device=device) + + X_c64 = X.view(np.complex64).reshape(N_FFT, N_FFT) + Y_c64 = np.fft.fft(X_c64, axis=-1) + + with wp.Tape() as tape: + wp.launch(tile_math_dx_fft_kernel, dim=[1, 1, TILE_DIM], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device) + + Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) + + assert_np_equal(Y_wp_c64, Y_c64, tol=1.0e-4) + + # TODO: implement and test backward pass + + +devices = get_cuda_test_devices() + + +@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") +class TestTileMathDx(unittest.TestCase): + pass + + +add_function_test(TestTileMathDx, "test_tile_math_dx_matmul", test_tile_math_dx_matmul, devices=devices) +add_function_test(TestTileMathDx, "test_tile_math_dx_fft", test_tile_math_dx_fft, devices=devices) + +if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index a71e08d3..81491878 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -1,14 +1,16 @@ -import numpy as np -import warp as wp +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. -wp.init() -wp.set_module_options({"enable_backward": True}) -wp.set_device("cuda:0") -wp.set_module_options({"fast_math": True}) -#wp.config.mode = "debug" -#wp.config.verify_cuda = True +import unittest -wp.build.clear_kernel_cache() +import numpy as np + +import warp as wp +from warp.tests.unittest_utils import * TILE_M = wp.constant(8) TILE_N = wp.constant(4) @@ -19,19 +21,17 @@ @wp.kernel -def tile_sum_kernel(input: wp.array3d(dtype=float), - output: wp.array(dtype=float)): - +def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index i, _ = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) - s = wp.tile_sum(a)*0.5 + s = wp.tile_sum(a) * 0.5 wp.tile_store(output, i, 0, s) -def test_tile_sum(): +def test_tile_reduce_sum(test, device): batch_count = 56 M = TILE_M @@ -40,83 +40,64 @@ def test_tile_sum(): rng = np.random.default_rng(42) input = rng.random((batch_count, M, N), dtype=np.float32) - input_wp = wp.array(input, requires_grad=True) - output_wp = wp.zeros(batch_count, requires_grad=True) + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros(batch_count, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_sum_kernel, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM) - - + wp.launch( + tile_sum_kernel, + dim=[batch_count, TILE_DIM], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device, + ) + + sum_wp = output_wp.numpy() for i in range(batch_count): - sum_np = np.sum(input[i])*0.5 - sum_wp = output_wp.numpy()[i] - - assert(np.allclose(sum_np, sum_wp, rtol=1.e-4)) - - print("Sum forward passed") + sum_np = np.sum(input[i]) * 0.5 + test.assertAlmostEqual(sum_wp[i], sum_np, places=5) output_wp.grad.fill_(1.0) tape.backward() - 
assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.5, rtol=1.e-4)) - - print("Sum backward passed") - + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5) @wp.kernel def tile_reduce_1d_kernel(output: wp.array(dtype=int)): - # output tile index i = wp.tid() - - t = wp.tile(i) # convert to block wide tile + + t = wp.tile(i) # convert to block wide tile s = wp.tile_sum(t) # sum over block # update global sum wp.tile_atomic_add(output, i, 0, s) -def test_tile_reduce_1d(): - N = int(TILE_DIM*3/2) +@unittest.expectedFailure +def test_tile_reduce_1d(test, device): + N = int(TILE_DIM * 3 / 2) - output = wp.zeros(shape=1, dtype=int, requires_grad=True) + output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_reduce_1d_kernel, dim=[N], inputs=[output], block_dim=TILE_DIM) - - assert(np.sum(np.arange(N)), output.numpy()) - - print("Sum 1D forward passed") - - # output_wp.grad.fill_(1.0) - - # tape.backward() - - # assert(np.allclose(input_wp.grad.numpy(), np.ones_like(input)*0.5, rtol=1.e-4)) - - # print("Sum backward passed") - - -test_tile_sum() -test_tile_reduce_1d() - - - - - - - - - - - + wp.launch(tile_reduce_1d_kernel, dim=[N], inputs=[output], block_dim=TILE_DIM, device=device) + test.assertAlmostEqual(output.numpy()[0], np.sum(np.arange(N))) +devices = get_cuda_test_devices() +class TestTileReduce(unittest.TestCase): + pass +add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_1d", test_tile_reduce_1d, devices=devices) # FAILS +if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) diff --git a/warp/tests/unittest_utils.py b/warp/tests/unittest_utils.py index 83e6ab2f..a94e6a36 100644 --- a/warp/tests/unittest_utils.py +++ b/warp/tests/unittest_utils.py @@ -232,6 +232,10 @@ def test_func(self): else: func(self, device, **kwargs) + # Copy the __unittest_expecting_failure__ attribute from func to test_func + if hasattr(func, "__unittest_expecting_failure__"): + test_func.__unittest_expecting_failure__ = func.__unittest_expecting_failure__ + return test_func diff --git a/warp/types.py b/warp/types.py index c346a044..9dbab2a4 100644 --- a/warp/types.py +++ b/warp/types.py @@ -1492,10 +1492,10 @@ def types_equal(a, b, match_generic=False): if is_array(a) and type(a) is type(b): return True - + if is_tile(a) and is_tile(b): return True - + return scalars_equal(a, b, match_generic) @@ -2957,7 +2957,6 @@ def array_type_id(a): # tile expression objects class Tile: - allocation = 0 def __init__(self, dtype, M, N, op=None, storage="register"): @@ -2973,7 +2972,7 @@ def ctype(self): if self.storage == "register": return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" - elif self.storage == "shared": + elif self.storage == "shared": return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" # generates C-initializer string @@ -2983,15 +2982,12 @@ def cinit(self, adjoint=False): if self.storage == "register": return self.ctype() + "(0.0)" elif self.storage == "shared": - if adjoint: # backward pass requires zeroed memory return f"wp::tile_alloc_zeros<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" else: # forward mode can be uninitialized until first used by the kernel return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" - - # generate a unique allocation index for shared memory 
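    # Note on the allocation scheme below: each shared-storage tile appears to
    # consume one allocation index, which the generated C++ above
    # (wp::tile_alloc_empty / wp::tile_alloc_zeros) receives as a template
    # parameter, presumably so that every shared tile gets its own slice of the
    # block's shared memory. Register tiles never call alloc().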
@classmethod @@ -3000,26 +2996,23 @@ def alloc(cls): Tile.allocation += 1 return index -class TileZeros(Tile): +class TileZeros(Tile): def __init__(self, dtype, M, N): Tile.__init__(self, dtype, M, N, op="zeros", storage="shared") - -class TileConstant(Tile): +class TileConstant(Tile): def __init__(self, dtype, M, N): Tile.__init__(self, dtype, M, N, op="constant", storage="register") - -class TileLoad(Tile): +class TileLoad(Tile): def __init__(self, array, M, N): Tile.__init__(self, array.dtype, M, N, op="load", storage="register") - -class TileUnaryMap(Tile): +class TileUnaryMap(Tile): def __init__(self, t): Tile.__init__(self, t.dtype, t.M, t.N, op="unary_map", storage="register") @@ -3027,7 +3020,6 @@ def __init__(self, t): class TileBinaryMap(Tile): - def __init__(self, a, b): Tile.__init__(self, a.dtype, a.M, a.N, op="binary_map", storage="register") @@ -3036,7 +3028,6 @@ def __init__(self, a, b): class TileShared(Tile): - def __init__(self, t): Tile.__init__(self, t.dtype, t.M, t.N, "shared", storage="shared") From 1c415a4bb6bc691409c0ad40305ccfca570a8547 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 26 Sep 2024 01:40:38 +0000 Subject: [PATCH 034/102] Update some docstrings --- warp/builtins.py | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 3b74d18d..e24e0d76 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1750,7 +1750,7 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar value_func=tile_zeros_value_func, dispatch_func=tile_zeros_dispatch_func, variadic=True, - doc="Allocate a tile local block of zero'd memory", + doc="Allocate a tile of zero initialized items", group="Tile Primitives", export=False, ) @@ -1909,8 +1909,27 @@ def tile_value_func(arg_types, arg_values): input_types={"x": Any}, value_func=tile_value_func, variadic=True, - doc="Construct a Tile from a per-thread kernel value, returns a tile with dimensions of `(1, block_dim)` where block_dim is the number of threads specified in `wp.launch()`", - group="Tile Primitives", + doc="""Construct a Tile from a per-thread kernel value. + + Args: + x (Any): A per-thread local value, e.g.: scalar, vector, or matrix. + + Returns: + Tile: A tile with dimensions of ``(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch().`` + + Examples: + This example shows how to create a linear sequence from thread variables: + + .. 
code-block:: python + + # get thread id + i = wp.tid() + + # convert to block wide tile + t = wp.tile(i*2) + """, + + group="Tile Primitives""", export=False, ) @@ -1919,7 +1938,7 @@ def tile_extract_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: - return None + return Scalar if len(arg_types) != 3: raise RuntimeError("tile_extract() requires 3 positional args") @@ -1945,7 +1964,7 @@ def tile_matmul_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: - return None + return Tile(dtype=Any, M=Any, N=Any) if len(arg_types) != 3: raise RuntimeError("tile_matmul() requires 4 positional args") @@ -1971,7 +1990,7 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a b = arg_values["b"] out = arg_values["out"] - # set the storage type to the inputs to shared + # force the storage type of the input variables to shared memory a.type.storage = "shared" b.type.storage = "shared" out.type.storage = "shared" @@ -1995,7 +2014,7 @@ def tile_sum_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: - return None + return Tile(dtype=Any, M=1, N=1) if len(arg_types) != 1: raise RuntimeError("tile_sum() requires 1 positional args") @@ -2024,7 +2043,7 @@ def tile_sum_value_func(arg_types, arg_values): def tile_unary_map_value_func(arg_types, arg_values): if arg_types is None: - return None + return Tile(dtype=Any, M=Any, N=Any) a = arg_types["a"] @@ -2048,7 +2067,7 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar #dispatch_func=tile_map_dispatch_func, #variadic=True, native_func="tile_unary_map", - doc="Map the operation onto each element of the tile", + doc="Unary map the operation onto each element of the tile.", group="Tile Primitives", export=False, ) @@ -2056,7 +2075,7 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar def tile_binary_map_value_func(arg_types, arg_values): if arg_types is None: - return None + return Tile(dtype=Any, M=Any, N=Any) a = arg_types["a"] b = arg_types["b"] @@ -2088,7 +2107,7 @@ def tile_binary_map_value_func(arg_types, arg_values): #dispatch_func=tile_map_dispatch_func, #variadic=True, native_func="tile_binary_map", - doc="Map the operation onto each element of the tile", + doc="Apply the binary map operation onto each corresponding pair of elements from each the tile.", group="Tile Primitives", export=False, ) @@ -4793,7 +4812,7 @@ def tile_matmul_generic_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: - return None + return Tile(dtype=Any, M=Any, N=Any) if len(arg_types) != 3: raise RuntimeError("tile_matmul() requires 4 positional args") @@ -4928,7 +4947,7 @@ def make_transpose(t): def tile_fft_generic_value_func(arg_types, arg_values): if arg_types is None: - return None + return Tile(dtype=Any, M=Any, N=Any) if len(arg_types) != 1: raise RuntimeError("tile_fft() requires 1 positional args") From 53968c6d86747d37c6e70af718eb6f709d524682 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 26 Sep 2024 10:15:32 +0000 Subject: [PATCH 035/102] Add wp.tile_ones() Add wp.tile_arange() Add detailed docstrings for most tile methods --- warp/builtins.py | 292 +++++++++++++++++++++++++++------ warp/native/tile.h | 39 ++++- warp/tests/test_tile_reduce.py | 60 ++++++- warp/types.py | 11 ++ 4 files changed, 347 insertions(+), 55 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 
cbad03b5..e400c364 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1713,9 +1713,6 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) - # if len(arg_types) > 0: - # raise RuntimeError("tile_zero() args must be passed by keyword") - if "m" not in arg_values: raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") @@ -1748,12 +1745,138 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar value_func=tile_zeros_value_func, dispatch_func=tile_zeros_dispatch_func, variadic=True, - doc="Allocate a tile of zero initialized items", + doc="""Allocates a tile of zero initialized items. + + :param m: Size of the first dimension of the output tile + :param n: Size of the second dimension of the output tile + :param dtype: Datatype of output tile's elements + :returns: A zero initialized tile with ``shape=(m,n)`` and the specified datatype""", + group="Tile Primitives", + export=False, +) + +def tile_ones_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): + # return generic type (for doc builds) + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) + + if "m" not in arg_values: + raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") + + if "n" not in arg_values: + raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") + + if "dtype" not in arg_values: + raise RuntimeError("'dtype' keyword argument must be specified when calling tile_zeros() function") + + m, n = arg_values["m"], arg_values["n"] + dtype = arg_values["dtype"] + + return TileZeros(dtype=dtype, M=m, N=n) + + +def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): + m, n, dtype = arg_values["m"], arg_values["n"], arg_values["dtype"] + + template_args = [] + template_args.append(dtype) + template_args.append(m.constant) + template_args.append(n.constant) + + return ([], template_args) + + +add_builtin( + "tile_ones", + input_types={"m": int, "n": int, "dtype": Scalar}, + value_func=tile_ones_value_func, + dispatch_func=tile_ones_dispatch_func, + variadic=True, + doc="""Allocates a tile of one initialized items. 
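
    A minimal sketch of the intended usage, mirroring the ``tile_ones_kernel`` test added in this patch
    (``TILE_DIM`` is assumed to be the per-tile thread count used at launch):

    .. code-block:: python

        @wp.kernel
        def ones_sum(out: wp.array(dtype=float)):
            t = wp.tile_ones(dtype=float, m=16, n=16)
            s = wp.tile_sum(t)       # 1x1 tile holding 256.0
            wp.tile_store(out, 0, 0, s)

        # wp.launch(ones_sum, dim=[1, TILE_DIM], inputs=[out], block_dim=TILE_DIM)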
+ + :param m: Size of the first dimension of the output tile + :param n: Size of the second dimension of the output tile + :param dtype: Datatype of output tile's elements + :returns: A one initialized tile with ``shape=(m,n)`` and the specified dtype""", + group="Tile Primitives", + export=False, +) + +def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): + # return generic type (for doc builds) + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) + + start = 0 + stop = 0 + step = 1 + dtype = int + + args = arg_values["args"] + + if len(args) == 1: + start = 0 + stop = args[0] + + elif len(args) == 2: + start = args[0] + stop = args[1] + + elif len(args) == 3: + start = args[0] + stop = args[1] + step = args[2] + + if start == None or stop == None or step == None: + raise RuntimeError("wp.tile_arange() arguments must be compile time constants") + + if arg_values["dtype"] is not None: + dtype = arg_values["dtype"] + + return TileRange(dtype=dtype, start=start, stop=stop, step=step) + + +def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): + m, n, dtype = return_type.M, return_type.N, return_type.dtype + + template_args = [] + template_args.append(dtype) + template_args.append(m) + template_args.append(n) + + # take dtype from stop value + t = return_type.dtype + + start = warp.codegen.Var(label=None, type=t, constant=return_type.start) + stop = warp.codegen.Var(label=None, type=t, constant=return_type.stop) + step = warp.codegen.Var(label=None, type=t, constant=return_type.step) + + return ([start, stop, step], template_args) + + +add_builtin( + "tile_arange", + input_types={"*args": Scalar, "dtype": Scalar}, + defaults={"dtype": None}, + value_func=tile_arange_value_func, + dispatch_func=tile_arange_dispatch_func, + variadic=True, + doc="""Generates a tile of linearly spaced elements. + + :param args: Variable length positional arguments, interpreted as: + + - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` + - ``(start, stop)``: Generates values from ``start`` to ``stop - 1`` + - ``(start, stop, step)``: Generates values from ``start`` to ``stop - 1`` with a step size + + :param dtype: Datatype of output tile's elements (optional, default: int) + :returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype""", group="Tile Primitives", export=False, ) + def tile_load_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -1803,7 +1926,16 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg value_func=tile_load_value_func, dispatch_func=tile_load_dispatch_func, variadic=True, - doc="Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)", + doc="""Loads a tile from a global memory array. + + This method will cooperatively load a tile from global memory using all threads in the block. 
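
    Tiles may also be loaded from an array view, e.g. one entry of a batched 3D array. A short sketch
    following the batched tile tests in this series (``TILE_M``/``TILE_N`` are assumed ``wp.constant`` values):

    .. code-block:: python

        @wp.kernel
        def batch_scale(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)):
            # one tile of threads per batch entry
            i, _ = wp.tid()
            a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
            b = a * 0.5
            wp.tile_store(output[i], 0, 0, b)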
+ + :param a: The source array in global memory + :param x: Offset in the source array measured in multiples of ``m``, i.e.: ``i=x*m`` + :param y: Offset in the source array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param m: The size of the tile's first dimension + :param n: The size of the tile's second dimensions + :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array""", group="Tile Primitives", export=False, ) @@ -1829,6 +1961,9 @@ def tile_store_value_func(arg_types, arg_values): if not is_tile(arg_types["t"]): raise RuntimeError("tile_store() argument 3 must be a tile") + if not types_equal(arg_types["a"].dtype, arg_types["t"].dtype): + raise RuntimeError("tile_store() destination array must have same type as source tile") + return None @@ -1837,7 +1972,14 @@ def tile_store_value_func(arg_types, arg_values): input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, value_func=tile_store_value_func, variadic=True, - doc="Store tile `t` to an array `a` at offset `(i=x*m, j=y*n)`", + doc="""Stores a tile to a global memory array. + + This method will cooperatively store a tile to global memory using all threads in the block. + + :param a: The destination array in global memory + :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*m`` + :param y: Offset in the destination array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param t: The source tile to store data from, must have the same dtype as the destination array""", group="Tile Primitives", export=False, ) @@ -1874,7 +2016,13 @@ def tile_atomic_add_value_func(arg_types, arg_values): input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, value_func=tile_atomic_add_value_func, variadic=True, - doc="Atomically add a tile `t` worth of data to array `a` at offset `(i=x*m, j=y*n)`", + doc="""Atomically add a tile to the array `a`, each element will be updated atomically. + + :param a: Array in global memory, should have the same ``dtype`` as the input tile + :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*M`` where ``M`` is the first tile dimension + :param y: Offset in the destination array measured in multiples of ``n``, i.e.: ``j=y*N`` where ``N`` is the second tile dimension + :param t: Source tile to add to the desination array + :returns: A tile with the same dimensions and type as the source tile, holding the original value of the destination elements""", group="Tile Primitives", export=False, ) @@ -1900,24 +2048,30 @@ def tile_value_func(arg_types, arg_values): input_types={"x": Any}, value_func=tile_value_func, variadic=True, - doc="""Construct a Tile from a per-thread kernel value. + doc="""Constructs a new Tile from a per-thread kernel values. - Args: - x (Any): A per-thread local value, e.g.: scalar, vector, or matrix. + This function converts values computed using scalar kernel code to a tile representation for input into collective operations. - Returns: - Tile: A tile with dimensions of ``(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch().`` + :param x: A per-thread local value, e.g.: scalar, vector, or matrix. + :returns: A tile with ``shape=(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch()``. - Examples: - This example shows how to create a linear sequence from thread variables: + This example shows how to create a linear sequence from thread variables: - .. code-block:: python - - # get thread id + .. 
code-block:: python + + @wp.kernel + def compute(): i = wp.tid() - - # convert to block wide tile t = wp.tile(i*2) + print(t) + + wp.launch(compute, dim=16, inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] """, group="Tile Primitives""", @@ -1944,7 +2098,14 @@ def tile_extract_value_func(arg_types, arg_values): input_types={"a": Tile(dtype=Any, M=Any, N=Any), "i": int, "j": int}, value_func=tile_extract_value_func, variadic=True, - doc="Extract element at index (i, j) of the tile and return the native type", + doc="""Extracts a single element from the tile and returns it as a scalar type. + + This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. + + :param a: Tile to extract the element from + :param i: Coordinate of element on first dimension + :param j: Coordinate of element on the second dimension + :returns: The value of the element at the specified tile location, with the same type as the input tile's per-element dtype""", group="Tile Primitives", export=False, ) @@ -1988,13 +2149,14 @@ def tile_matmul_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a add_builtin( - "tile_matmul", + "tile_matmul_scalar", input_types={"a": Tile, "b": Tile, "out": Tile}, value_func=tile_matmul_value_func, dispatch_func=tile_matmul_dispatch_func, variadic=True, doc="Compute matrix product and accumulate out += a*b.", group="Tile Primitives", + hidden=True, export=False, ) @@ -2020,7 +2182,32 @@ def tile_sum_value_func(arg_types, arg_values): input_types={"a": Tile}, value_func=tile_sum_value_func, variadic=True, - doc="Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored", + doc="""Cooperatively compute the sum the tile elements using all threads in the block. + + :param a: The tile to compute the sum of + :returns: A single element tile with dimensions of (1,1) holding the sum + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_ones(dtype=float, m=16, n=16) + s = wp.tile_sum(t) + + print(t) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[256]] + + """, group="Tile Primitives", export=False, ) @@ -2053,7 +2240,34 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar # dispatch_func=tile_map_dispatch_func, # variadic=True, native_func="tile_unary_map", - doc="Unary map the operation onto each element of the tile.", + doc="""Apply a unary function onto the tile. + + This function cooperatively applies a unary function to each element of the tile using all threads in the block. + + :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin + :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(0.0, 1.0, 0.1, dtype=float) + s = wp.tile_map(wp.sin, t) + + print(s) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. 
code-block:: text + + tile(m=1, n=10, storage=register) = [[0 0.0998334 0.198669 0.29552 ...]] + """, group="Tile Primitives", export=False, ) @@ -3871,9 +4085,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3882,9 +4094,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3893,9 +4103,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3904,9 +4112,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3916,9 +4122,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3927,9 +4131,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3938,9 +4140,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) @@ -3949,9 +4149,7 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + doc="""Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""", group="Utility", skip_replay=True, ) diff --git a/warp/native/tile.h b/warp/native/tile.h index 7563e0d9..4f562e15 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -471,7 +471,7 @@ struct tile_shared_t if (threadIdx.x == 0) { - printf("Tile(M=%d, N=%d, storage=shared) = [\n", M, N); + printf("tile(m=%d, n=%d, storage=shared) = [", M, N); for (int i=0; i < M; ++i) { printf("%*s[", i>0, ""); @@ -570,7 +570,7 @@ void tile_register_t::print() if (threadIdx.x == 0) { - printf("Tile(M=%d, N=%d, storage=register) = [\n", M, N); + printf("tile(m=%d, n=%d, storage=register) = [", M, N); for (int i=0; i < M; ++i) { printf("%*s[", i>0, ""); @@ -666,9 +666,42 @@ template inline CUDA_CALLABLE auto tile_zeros() { // tile variable assignment operator will handle initialization (since lhs could be shared/register tile) - return T(0.0); + return T(0); } +// zero initialized tile +template +inline CUDA_CALLABLE auto tile_ones() +{ + // tile variable assignment operator will handle initialization (since lhs could be shared/register tile) + return T(1); +} + +// zero initialized tile +template +inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step) +{ + tile_register_t out; + + WP_PRAGMA_UNROLL + for (int i=0; i < out.NumRegs; ++i) + { + const int linear = out.index(i); + + // handle case where tile size is not + // aligned to block dimensions + if (!out.Aligned && linear >= out.Size) + break; + + out.data[i] = start + linear*step; + } + + return out; +} + +template +inline CUDA_CALLABLE void adj_tile_arange(int start, int stop, int step, + int adj_start, int adj_stop, int adj_step, const tile_register_t& adj_ret) {} // entry point for load template diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 81491878..00b8b301 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -71,23 +71,71 @@ def tile_reduce_1d_kernel(output: wp.array(dtype=int)): t = wp.tile(i) # convert to block wide tile s = wp.tile_sum(t) # sum over block - + # update global sum - wp.tile_atomic_add(output, i, 0, s) + wp.tile_atomic_add(output, 
0, 0, s) -@unittest.expectedFailure def test_tile_reduce_1d(test, device): + + # use an unaligned grid dimension N = int(TILE_DIM * 3 / 2) output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device) with wp.Tape() as tape: wp.launch(tile_reduce_1d_kernel, dim=[N], inputs=[output], block_dim=TILE_DIM, device=device) - + test.assertAlmostEqual(output.numpy()[0], np.sum(np.arange(N))) + +@wp.kernel +def tile_ones_kernel(out: wp.array(dtype=float)): + i = wp.tid() + + t = wp.tile_ones(dtype=float, m=16, n=16) + s = wp.tile_sum(t) + + wp.tile_store(out, 0, 0, s) + +def test_tile_ones(test, device): + + output = wp.zeros(shape=1, dtype=float, device=device) + + with wp.Tape() as tape: + wp.launch(tile_ones_kernel, dim=[1, TILE_DIM], inputs=[output], block_dim=TILE_DIM, device=device) + wp.synchronize() + + test.assertAlmostEqual(output.numpy()[0], 256.0) + + +@wp.kernel +def tile_arange_kernel(out: wp.array2d(dtype=int)): + i = wp.tid() + + a = wp.tile_arange(17, dtype=int) + b = wp.tile_arange(5, 23, dtype=int) + c = wp.tile_arange(0, 34, 2, dtype=int) + + wp.tile_store(out, 0, 0, a) + wp.tile_store(out, 1, 0, b) + wp.tile_store(out, 2, 0, c) + +def test_tile_arange(test, device): + + N = 17 + + output = wp.zeros(shape=(3, N), dtype=int, device=device) + + with wp.Tape() as tape: + wp.launch(tile_arange_kernel, dim=[1, N], inputs=[output], block_dim=TILE_DIM, device=device) + + assert_np_equal(output.numpy()[0], np.arange(17)) + assert_np_equal(output.numpy()[1], np.arange(5, 22)) + assert_np_equal(output.numpy()[2], np.arange(0, 34, 2)) + + devices = get_cuda_test_devices() @@ -96,7 +144,9 @@ class TestTileReduce(unittest.TestCase): add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices) -add_function_test(TestTileReduce, "test_tile_reduce_1d", test_tile_reduce_1d, devices=devices) # FAILS +add_function_test(TestTileReduce, "test_tile_reduce_1d", test_tile_reduce_1d, devices=devices) +add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices) +add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices) if __name__ == "__main__": wp.clear_kernel_cache() diff --git a/warp/types.py b/warp/types.py index 9dbab2a4..e099119d 100644 --- a/warp/types.py +++ b/warp/types.py @@ -3001,6 +3001,17 @@ class TileZeros(Tile): def __init__(self, dtype, M, N): Tile.__init__(self, dtype, M, N, op="zeros", storage="shared") +class TileRange(Tile): + def __init__(self, dtype, start, stop, step): + + self.start = start + self.stop = stop + self.step = step + + M = 1 + N = int((stop-start)/step) + + Tile.__init__(self, dtype, M, N, op="arange", storage="register") class TileConstant(Tile): def __init__(self, dtype, M, N): From 83b1ed3d48925e311d2be39baf2ab039657659ce Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Thu, 26 Sep 2024 13:48:06 -0700 Subject: [PATCH 036/102] Fix various issues with tile branch tests --- .gitlab-ci.yml | 3 +- .gitlab/ci/additional-tests.yml | 2 +- .gitlab/ci/cuda-11-build-and-test.yml | 2 +- .gitlab/ci/debug-build-and-test.yml | 2 +- .gitlab/ci/mathdx-support.yml | 3 +- docs/modules/functions.rst | 215 ++++++++++++------ warp/autograd.py | 51 ++++- warp/builtins.py | 56 ++--- warp/context.py | 4 +- warp/jax_experimental.py | 4 +- warp/native/builtin.h | 5 +- warp/native/tile.h | 3 +- warp/native/warp.cu | 11 +- warp/stubs.py | 303 ++++++++++++++++---------- warp/tests/test_tile.py | 1 - warp/tests/test_tile_reduce.py | 19 +- warp/types.py | 5 +- 17 files changed, 443 insertions(+), 
246 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5eb130f6..ea8ae21c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -207,6 +207,7 @@ linux-aarch64 test jetson: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - !reference [.snippets, install-python+warp-aarch64] - python -m pip install coverage[toml] + - python -m pip install -U "jax[cuda12]" - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - python -m warp.tests --junit-report-xml rspec.xml --coverage --coverage-xml coverage.xml -s autodetect --failfast @@ -231,7 +232,7 @@ linux-x86_64 test: - python -m pip install --upgrade pip - python -m pip install --upgrade usd-core coverage[toml] - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - - python -m pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - python -m pip install -U "jax[cuda12]" - python -m pip install -e . - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents diff --git a/.gitlab/ci/additional-tests.yml b/.gitlab/ci/additional-tests.yml index 10c19889..aba4a45d 100644 --- a/.gitlab/ci/additional-tests.yml +++ b/.gitlab/ci/additional-tests.yml @@ -43,7 +43,7 @@ linux-x86_64 test: - python -m pip install --upgrade pip - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - - python -m pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - python -m pip install -U "jax[cuda12]" - python -m pip install -e . - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents diff --git a/.gitlab/ci/cuda-11-build-and-test.yml b/.gitlab/ci/cuda-11-build-and-test.yml index 735104ea..7282d9e8 100644 --- a/.gitlab/ci/cuda-11-build-and-test.yml +++ b/.gitlab/ci/cuda-11-build-and-test.yml @@ -122,7 +122,7 @@ linux-x86_64 test: - python -m pip install --upgrade pip - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - - python -m pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - python -m pip install -U "jax[cuda12]" - python -m pip install -e . - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents diff --git a/.gitlab/ci/debug-build-and-test.yml b/.gitlab/ci/debug-build-and-test.yml index 3ebeeade..e041739a 100644 --- a/.gitlab/ci/debug-build-and-test.yml +++ b/.gitlab/ci/debug-build-and-test.yml @@ -114,7 +114,7 @@ linux-x86_64 test: - python -m pip install --upgrade pip - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - - python -m pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - python -m pip install -U "jax[cuda12]" - python -m pip install -e . 
- echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index 5bea3383..b6fff5b3 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -100,7 +100,7 @@ linux-x86_64 test: - python -m pip install --upgrade pip - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - - python -m pip install --upgrade "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - python -m pip install -U "jax[cuda12]" - python -m pip install --upgrade nvidia-mathdx==24.4.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - python -m pip install -e . - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" @@ -117,6 +117,7 @@ linux-aarch64 test jetson: before_script: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - !reference [.snippets, install-python+warp-aarch64] + - python -m pip install -U "jax[cuda12]" - python -m pip install --upgrade nvidia-mathdx==24.4.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index bcd18cc9..5d2bc605 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -804,67 +804,196 @@ Tile Primitives --------------- .. py:function:: tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile - Allocate a tile local block of zero'd memory + Allocates a tile of zero initialized items. + + :param m: Size of the first dimension of the output tile + :param n: Size of the second dimension of the output tile + :param dtype: Datatype of output tile's elements + :returns: A zero initialized tile with ``shape=(m,n)`` and the specified datatype + + +.. py:function:: tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile + + Allocates a tile of one initialized items. + + :param m: Size of the first dimension of the output tile + :param n: Size of the second dimension of the output tile + :param dtype: Datatype of output tile's elements + :returns: A one initialized tile with ``shape=(m,n)`` and the specified dtype + + +.. py:function:: tile_arange(*args: Scalar, dtype: Scalar) -> Tile + + Generates a tile of linearly spaced elements. + + :param args: Variable length positional arguments, interpreted as: + + - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` + - ``(start, stop)``: Generates values from ``start`` to ``stop - 1`` + - ``(start, stop, step)``: Generates values from ``start`` to ``stop - 1`` with a step size + + :param dtype: Datatype of output tile's elements (optional, default: int) + :returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype .. py:function:: tile_load(a: Array[Any], x: int32, y: int32, m: int32, n: int32) -> Tile - Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n) + Loads a tile from a global memory array. + + This method will cooperatively load a tile from global memory using all threads in the block. 
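
    A minimal usage sketch, modeled on the tiled GEMM test in this patch series
    (``TILE_M``, ``TILE_N`` and ``TILE_DIM`` are assumed to be ``wp.constant``/launch constants defined elsewhere):

    .. code-block:: python

        @wp.kernel
        def scale_tile(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
            # one output tile per (i, j) block index
            i, j, _ = wp.tid()
            t = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N)   # reads A starting at (i*TILE_M, j*TILE_N)
            s = t * 0.5
            wp.tile_store(B, i, j, s)

        # wp.launch(scale_tile, dim=(M // TILE_M, N // TILE_N, TILE_DIM), inputs=[A, B], block_dim=TILE_DIM)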
+ + :param a: The source array in global memory + :param x: Offset in the source array measured in multiples of ``m``, i.e.: ``i=x*m`` + :param y: Offset in the source array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param m: The size of the tile's first dimension + :param n: The size of the tile's second dimensions + :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array .. py:function:: tile_store(a: Array[Any], x: int32, y: int32, t: Any) -> None - Store tile `t` to an array `a` at offset `(i=x*m, j=y*n)` + Stores a tile to a global memory array. + + This method will cooperatively store a tile to global memory using all threads in the block. + + :param a: The destination array in global memory + :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*m`` + :param y: Offset in the destination array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param t: The source tile to store data from, must have the same dtype as the destination array .. py:function:: tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile - Atomically add a tile `t` worth of data to array `a` at offset `(i=x*m, j=y*n)` + Atomically add a tile to the array `a`, each element will be updated atomically. + + :param a: Array in global memory, should have the same ``dtype`` as the input tile + :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*M`` where ``M`` is the first tile dimension + :param y: Offset in the destination array measured in multiples of ``n``, i.e.: ``j=y*N`` where ``N`` is the second tile dimension + :param t: Source tile to add to the destination array + :returns: A tile with the same dimensions and type as the source tile, holding the original value of the destination elements .. py:function:: tile(x: Any) -> Tile - Construct a Tile from a per-thread kernel value, returns a tile with dimensions of `(1, block_dim)` where block_dim is the number of threads specified in `wp.launch()` + Constructs a new Tile from a per-thread kernel values. + This function converts values computed using scalar kernel code to a tile representation for input into collective operations. -.. py:function:: tile_extract(a: Tile, i: int32, j: int32) -> None + :param x: A per-thread local value, e.g.: scalar, vector, or matrix. + :returns: A tile with ``shape=(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch()``. - Extract element at index (i, j) of the tile and return the native type + This example shows how to create a linear sequence from thread variables: + .. code-block:: python -.. py:function:: tile_matmul(a: Tile, b: Tile, out: Tile) -> None + @wp.kernel + def compute(): + i = wp.tid() + t = wp.tile(i*2) + print(t) - Compute matrix product and accumulate out += a*b. + wp.launch(compute, dim=16, inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] + + + +.. py:function:: tile_extract(a: Tile, i: int32, j: int32) -> Scalar + + Extracts a single element from the tile and returns it as a scalar type. + + This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. 
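
    An illustrative sketch (``read_corner`` and the ``TILE_M``/``TILE_N`` constants are assumed names,
    following the tile tests in this series):

    .. code-block:: python

        @wp.kernel
        def read_corner(x: wp.array2d(dtype=float), out: wp.array(dtype=float)):
            t = wp.tile_load(x, 0, 0, m=TILE_M, n=TILE_N)
            # the extracted value is broadcast, so every thread in the block sees the same scalar
            v = wp.tile_extract(t, 0, 0)
            out[0] = v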
+ + :param a: Tile to extract the element from + :param i: Coordinate of element on first dimension + :param j: Coordinate of element on the second dimension + :returns: The value of the element at the specified tile location, with the same type as the input tile's per-element dtype + + +.. py:function:: tile_sum(a: Tile) -> Tile + + Cooperatively compute the sum the tile elements using all threads in the block. + + :param a: The tile to compute the sum of + :returns: A single element tile with dimensions of (1,1) holding the sum + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_ones(dtype=float, m=16, n=16) + s = wp.tile_sum(t) + + print(t) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + .. code-block:: text -.. py:function:: tile_sum(a: Tile) -> None + tile(m=1, n=1, storage=register) = [[256]] - Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored + + + +.. py:function:: tile_map(op: Callable, a: Any) -> Tile + + Apply a unary function onto the tile. + + This function cooperatively applies a unary function to each element of the tile using all threads in the block. + + :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin + :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(0.0, 1.0, 0.1, dtype=float) + s = wp.tile_map(wp.sin, t) + print(s) -.. py:function:: tile_map(op: Callable, a: Any) -> None + wp.launch(compute, dim=[64], inputs=[]) - Map the operation onto each element of the tile + Prints: + .. code-block:: text -.. py:function:: tile_map(op: Callable, a: Any, b: Any) -> None + tile(m=1, n=10, storage=register) = [[0 0.0998334 0.198669 0.29552 ...]] + + + +.. py:function:: tile_map(op: Callable, a: Any, b: Any) -> Tile :noindex: :nocontentsentry: - Map the operation onto each element of the tile + Apply the binary map operation onto each corresponding pair of elements from each the tile. -.. py:function:: tile_matmul_dx(a: Tile, b: Tile, out: Tile) -> None +.. py:function:: tile_matmul_dx(a: Tile, b: Tile, out: Tile) -> Tile Compute matrix product and accumulate out += a*b. -.. py:function:: tile_fft_dx(inout: Tile) -> None +.. py:function:: tile_fft_dx(inout: Tile) -> Tile Compute the FFT along the second dimension of a 2D tile of data. -.. py:function:: tile_ifft_dx(inout: Tile) -> None +.. py:function:: tile_ifft_dx(inout: Tile) -> Tile Compute the inverse FFT along the second dimension of a 2D tile of data. @@ -1183,8 +1312,6 @@ Utility Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1192,8 +1319,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. 
py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1201,8 +1326,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1210,8 +1333,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1219,8 +1340,6 @@ Utility Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1228,8 +1347,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1237,8 +1354,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1246,8 +1361,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1255,8 +1368,6 @@ Utility Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1264,8 +1375,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1273,8 +1382,6 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1282,15 +1389,11 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. 
py:function:: atomic_max(arr: Array[Any], i: int32, value: Any) -> Any Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1298,8 +1401,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1307,8 +1408,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1316,8 +1415,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1325,8 +1422,6 @@ Utility Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1334,8 +1429,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1343,8 +1436,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1352,8 +1443,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1361,8 +1450,6 @@ Utility Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1370,8 +1457,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1379,8 +1464,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1388,8 +1471,6 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - .. py:function:: lerp(a: Float, b: Float, t: Float) -> Float @@ -2032,7 +2113,7 @@ Operators :nocontentsentry: -.. py:function:: add(a: Tile, b: Tile) -> None +.. py:function:: add(a: Tile, b: Tile) -> Tile :noindex: :nocontentsentry: diff --git a/warp/autograd.py b/warp/autograd.py index 8f884f04..9b2eea47 100644 --- a/warp/autograd.py +++ b/warp/autograd.py @@ -34,6 +34,7 @@ def gradcheck( input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None, device: wp.context.Devicelike = None, max_blocks=0, + block_dim=256, max_inputs_per_var=-1, max_outputs_per_var=-1, plot_relative_error=False, @@ -44,7 +45,8 @@ def gradcheck( Checks whether the autodiff gradient of a Warp kernel matches finite differences. Fails if the relative or absolute errors between the autodiff and finite difference gradients exceed the specified tolerance, or if the autodiff gradients contain NaN values. - The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details). + The kernel function and its adjoint version are launched with the given inputs and outputs, as well as the provided + ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details). Note: This function only supports Warp kernels whose input arguments precede the output arguments. @@ -65,6 +67,7 @@ def gradcheck( input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs. device: The device to launch on (optional) max_blocks: The maximum number of CUDA thread blocks to use. + block_dim: The number of threads per block. max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0. max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0. plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``). @@ -85,6 +88,7 @@ def gradcheck( input_output_mask=input_output_mask, device=device, max_blocks=max_blocks, + block_dim=block_dim, max_inputs_per_var=max_inputs_per_var, eps=eps, plot_jacobians=False, @@ -98,6 +102,7 @@ def gradcheck( input_output_mask=input_output_mask, device=device, max_blocks=max_blocks, + block_dim=block_dim, max_outputs_per_var=max_outputs_per_var, plot_jacobians=False, ) @@ -237,7 +242,6 @@ def gradcheck_tape( input_output_masks: Dictionary of input-output masks for each kernel in the tape, mapping from kernel keys to input-output masks. 
Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs. blacklist_kernels: List of kernel keys to exclude from the gradient check. whitelist_kernels: List of kernel keys to include in the gradient check. If not empty or None, only kernels in this list are checked. - max_blocks: The maximum number of CUDA thread blocks to use. max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0. max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0. plot_relative_error: If True, visualizes the relative error of the Jacobians in a plot (requires ``matplotlib``). @@ -262,7 +266,7 @@ def gradcheck_tape( for launch in tape.launches: if not isinstance(launch[0], wp.Kernel): continue - kernel, dim, max_blocks, inputs, outputs, device = launch[:6] + kernel, dim, max_blocks, inputs, outputs, device, block_dim = launch[:7] if len(whitelist_kernels) > 0 and kernel.key not in whitelist_kernels: continue if kernel.key in blacklist_kernels: @@ -280,6 +284,7 @@ def gradcheck_tape( input_output_mask=input_output_mask, device=device, max_blocks=max_blocks, + block_dim=block_dim, max_inputs_per_var=max_inputs_per_var, max_outputs_per_var=max_outputs_per_var, plot_relative_error=plot_relative_error, @@ -611,13 +616,15 @@ def jacobian( input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None, device: wp.context.Devicelike = None, max_blocks=0, + block_dim=256, max_outputs_per_var=-1, plot_jacobians=False, ) -> Dict[Tuple[int, int], wp.array]: """ Computes the Jacobians of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs. - The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details). + The kernel adjoint function is launched with the given inputs and outputs, as well as the provided ``dim``, + ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details). Note: This function only supports Warp kernels whose input arguments precede the output arguments. @@ -634,6 +641,7 @@ def jacobian( input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs. device: The device to launch on (optional) max_blocks: The maximum number of CUDA thread blocks to use. + block_dim: The number of threads per block. max_outputs_per_var: Maximum number of output dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all output dimensions if value <= 0. plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``). 
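A minimal usage sketch of the ``block_dim`` argument threaded through ``gradcheck()`` and ``jacobian()`` above, assuming the ``gradcheck(kernel, dim, inputs, outputs, ...)`` calling convention described in the docstring; the kernel and data below are illustrative only:

.. code-block:: python

    import warp as wp
    import warp.autograd

    @wp.kernel
    def scale(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        i = wp.tid()
        y[i] = 2.0 * x[i]

    x = wp.array([1.0, 2.0, 3.0], dtype=float, requires_grad=True)
    y = wp.zeros(3, dtype=float, requires_grad=True)

    # block_dim is forwarded to the forward and adjoint wp.launch() calls
    wp.autograd.gradcheck(scale, dim=3, inputs=[x], outputs=[y], block_dim=256)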
@@ -661,7 +669,15 @@ def resolve_arg(name, offset: int = 0): device = infer_device(inputs + outputs) tape = wp.Tape() - tape.record_launch(kernel=kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=outputs, device=device) + tape.record_launch( + kernel=kernel, + dim=dim, + inputs=inputs, + outputs=outputs, + device=device, + max_blocks=max_blocks, + block_dim=block_dim, + ) jacobians = {} @@ -709,6 +725,7 @@ def jacobian_fd( input_output_mask: List[Tuple[Union[str, int], Union[str, int]]] = None, device: wp.context.Devicelike = None, max_blocks=0, + block_dim=256, max_inputs_per_var=-1, eps=1e-4, plot_jacobians=False, @@ -717,7 +734,8 @@ def jacobian_fd( Computes the finite-difference Jacobian of a Warp kernel launch for the provided selection of differentiable inputs to differentiable outputs. The method uses a central difference scheme to approximate the Jacobian. - The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the provided ``dim`` and ``max_blocks`` arguments (see :func:`warp.launch` for more details). + The kernel is launched multiple times in forward-only mode with the given inputs and outputs, as well as the + provided ``dim``, ``max_blocks``, and ``block_dim`` arguments (see :func:`warp.launch` for more details). Note: This function only supports Warp kernels whose input arguments precede the output arguments. @@ -734,6 +752,7 @@ def jacobian_fd( input_output_mask: List of tuples specifying the input-output pairs to compute the Jacobian for. Inputs and outputs can be identified either by their integer indices of where they appear in the kernel input/output arguments, or by the respective argument names as strings. If None, computes the Jacobian for all input-output pairs. device: The device to launch on (optional) max_blocks: The maximum number of CUDA thread blocks to use. + block_dim: The number of threads per block. max_inputs_per_var: Maximum number of input dimensions over which to evaluate the Jacobians for the input-output pairs. Evaluates all input dimensions if value <= 0. eps: The finite-difference step size. plot_jacobians: If True, visualizes the computed Jacobians in a plot (requires ``matplotlib``). @@ -793,10 +812,26 @@ def resolve_arg(name, offset: int = 0): input_num = min(input_num, max_inputs_per_var) for i in range(input_num): set_element(flat_input, i, -eps, relative=True) - wp.launch(kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=left_outputs, device=device) + wp.launch( + kernel, + dim=dim, + inputs=inputs, + outputs=left_outputs, + device=device, + max_blocks=max_blocks, + block_dim=block_dim, + ) set_element(flat_input, i, 2 * eps, relative=True) - wp.launch(kernel, dim=dim, max_blocks=max_blocks, inputs=inputs, outputs=right_outputs, device=device) + wp.launch( + kernel, + dim=dim, + inputs=inputs, + outputs=right_outputs, + device=device, + max_blocks=max_blocks, + block_dim=block_dim, + ) set_element(flat_input, i, -eps, relative=True) diff --git a/warp/builtins.py b/warp/builtins.py index e400c364..da5463ec 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1746,7 +1746,7 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar dispatch_func=tile_zeros_dispatch_func, variadic=True, doc="""Allocates a tile of zero initialized items. 
- + :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements @@ -1755,6 +1755,7 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar export=False, ) + def tile_ones_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): # return generic type (for doc builds) if arg_types is None: @@ -1793,7 +1794,7 @@ def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg dispatch_func=tile_ones_dispatch_func, variadic=True, doc="""Allocates a tile of one initialized items. - + :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements @@ -1802,6 +1803,7 @@ def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg export=False, ) + def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): # return generic type (for doc builds) if arg_types is None: @@ -1827,7 +1829,7 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st stop = args[1] step = args[2] - if start == None or stop == None or step == None: + if start is None or stop is None or step is None: raise RuntimeError("wp.tile_arange() arguments must be compile time constants") if arg_values["dtype"] is not None: @@ -1862,7 +1864,7 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a dispatch_func=tile_arange_dispatch_func, variadic=True, doc="""Generates a tile of linearly spaced elements. - + :param args: Variable length positional arguments, interpreted as: - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` @@ -1876,7 +1878,6 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a ) - def tile_load_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -1927,7 +1928,7 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg dispatch_func=tile_load_dispatch_func, variadic=True, doc="""Loads a tile from a global memory array. - + This method will cooperatively load a tile from global memory using all threads in the block. :param a: The source array in global memory @@ -1973,7 +1974,7 @@ def tile_store_value_func(arg_types, arg_values): value_func=tile_store_value_func, variadic=True, doc="""Stores a tile to a global memory array. - + This method will cooperatively store a tile to global memory using all threads in the block. :param a: The destination array in global memory @@ -2017,11 +2018,11 @@ def tile_atomic_add_value_func(arg_types, arg_values): value_func=tile_atomic_add_value_func, variadic=True, doc="""Atomically add a tile to the array `a`, each element will be updated atomically. 
- + :param a: Array in global memory, should have the same ``dtype`` as the input tile :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*M`` where ``M`` is the first tile dimension :param y: Offset in the destination array measured in multiples of ``n``, i.e.: ``j=y*N`` where ``N`` is the second tile dimension - :param t: Source tile to add to the desination array + :param t: Source tile to add to the destination array :returns: A tile with the same dimensions and type as the source tile, holding the original value of the destination elements""", group="Tile Primitives", export=False, @@ -2049,7 +2050,7 @@ def tile_value_func(arg_types, arg_values): value_func=tile_value_func, variadic=True, doc="""Constructs a new Tile from a per-thread kernel values. - + This function converts values computed using scalar kernel code to a tile representation for input into collective operations. :param x: A per-thread local value, e.g.: scalar, vector, or matrix. @@ -2068,13 +2069,12 @@ def compute(): wp.launch(compute, dim=16, inputs=[], block_dim=16) Prints: - + .. code-block:: text tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] """, - - group="Tile Primitives""", + group="Tile Primitives" "", export=False, ) @@ -2082,9 +2082,9 @@ def compute(): def tile_extract_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: - return Scalar - - if len(arg_types) != 3: + return Scalar + + if len(arg_types) != 3: raise RuntimeError("tile_extract() requires 3 positional args") if not is_tile(arg_types["a"]): @@ -2099,7 +2099,7 @@ def tile_extract_value_func(arg_types, arg_values): value_func=tile_extract_value_func, variadic=True, doc="""Extracts a single element from the tile and returns it as a scalar type. - + This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. :param a: Tile to extract the element from @@ -2183,17 +2183,17 @@ def tile_sum_value_func(arg_types, arg_values): value_func=tile_sum_value_func, variadic=True, doc="""Cooperatively compute the sum the tile elements using all threads in the block. - + :param a: The tile to compute the sum of :returns: A single element tile with dimensions of (1,1) holding the sum - + Example: .. code-block:: python @wp.kernel def compute(): - + t = wp.tile_ones(dtype=float, m=16, n=16) s = wp.tile_sum(t) @@ -2202,11 +2202,11 @@ def compute(): wp.launch(compute, dim=[64], inputs=[]) Prints: - + .. code-block:: text tile(m=1, n=1, storage=register) = [[256]] - + """, group="Tile Primitives", export=False, @@ -2241,13 +2241,13 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar # variadic=True, native_func="tile_unary_map", doc="""Apply a unary function onto the tile. - + This function cooperatively applies a unary function to each element of the tile using all threads in the block. - + :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. - + Example: .. code-block:: python @@ -2263,11 +2263,11 @@ def compute(): wp.launch(compute, dim=[64], inputs=[]) Prints: - + .. 
code-block:: text tile(m=1, n=10, storage=register) = [[0 0.0998334 0.198669 0.29552 ...]] - """, + """, group="Tile Primitives", export=False, ) @@ -2307,7 +2307,7 @@ def tile_binary_map_value_func(arg_types, arg_values): # dispatch_func=tile_map_dispatch_func, # variadic=True, native_func="tile_binary_map", - doc="Apply the binary map operation onto each corresponding pair of elements from each the tile.", + doc="Apply the binary map operation onto each corresponding pair of elements from each the tile.", group="Tile Primitives", export=False, ) diff --git a/warp/context.py b/warp/context.py index 9f0617b1..55b42f3d 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1751,7 +1751,7 @@ def __init__(self, name, loader): "fast_math": False, "cuda_output": None, # supported values: "ptx", "cubin", or None (automatic) "mode": warp.config.mode, - "block_dim": 0, + "block_dim": 256, } # Module dependencies are determined by scanning each function @@ -5009,7 +5009,7 @@ def launch( record_cmd: When True the launch will be returned as a ``Launch`` command object, the launch will not occur until the user calls ``cmd.launch()`` max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches. If negative or zero, the maximum hardware value will be used. - block_dim: The number of threads per-block + block_dim: The number of threads per block. """ init() diff --git a/warp/jax_experimental.py b/warp/jax_experimental.py index 8e78ab26..5f62f953 100644 --- a/warp/jax_experimental.py +++ b/warp/jax_experimental.py @@ -102,7 +102,9 @@ def _warp_custom_callback(stream, buffers, opaque, opaque_len): assert hooks.forward, "Failed to find kernel entry point" # Launch the kernel. - wp.context.runtime.core.cuda_launch_kernel(device.context, hooks.forward, bounds.size, 0, kernel_params, stream) + wp.context.runtime.core.cuda_launch_kernel( + device.context, hooks.forward, bounds.size, 0, 256, kernel_params, stream + ) # TODO: is there a simpler way of getting the Jax "current" device? 
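With the module default now ``block_dim=256`` and ``wp.launch()`` accepting ``block_dim`` directly, a per-launch override looks like the following sketch (kernel and sizes are illustrative):

.. code-block:: python

    import warp as wp

    @wp.kernel
    def saxpy(a: float, x: wp.array(dtype=float), y: wp.array(dtype=float)):
        i = wp.tid()
        y[i] = a * x[i] + y[i]

    x = wp.ones(1024, dtype=float)
    y = wp.zeros(1024, dtype=float)

    # threads per CUDA block; when omitted the module default (256) is used
    wp.launch(saxpy, dim=1024, inputs=[2.0, x, y], block_dim=128)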
diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 91701a89..7d1ac8d9 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1575,11 +1575,14 @@ inline CUDA_CALLABLE void print(transform_t t) printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w)); } +inline CUDA_CALLABLE void adj_print(bool i, bool adj_i) { printf("%d adj: %d\n", i, adj_i); } +inline CUDA_CALLABLE void adj_print(int8 i, int8 adj_i) { printf("%hhd adj: %hhd\n", i, adj_i); } inline CUDA_CALLABLE void adj_print(int i, int adj_i) { printf("%d adj: %d\n", i, adj_i); } inline CUDA_CALLABLE void adj_print(float f, float adj_f) { printf("%g adj: %g\n", f, adj_f); } inline CUDA_CALLABLE void adj_print(short f, short adj_f) { printf("%hd adj: %hd\n", f, adj_f); } inline CUDA_CALLABLE void adj_print(long f, long adj_f) { printf("%ld adj: %ld\n", f, adj_f); } inline CUDA_CALLABLE void adj_print(long long f, long long adj_f) { printf("%lld adj: %lld\n", f, adj_f); } +inline CUDA_CALLABLE void adj_print(uint8 i, uint8 adj_i) { printf("%hhu adj: %hhu\n", i, adj_i); } inline CUDA_CALLABLE void adj_print(unsigned f, unsigned adj_f) { printf("%u adj: %u\n", f, adj_f); } inline CUDA_CALLABLE void adj_print(unsigned short f, unsigned short adj_f) { printf("%hu adj: %hu\n", f, adj_f); } inline CUDA_CALLABLE void adj_print(unsigned long f, unsigned long adj_f) { printf("%lu adj: %lu\n", f, adj_f); } @@ -1689,4 +1692,4 @@ inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expect #include "tile.h" #include "tile_gemm.h" #include "tile_reduce.h" -#endif \ No newline at end of file +#endif diff --git a/warp/native/tile.h b/warp/native/tile.h index 4f562e15..4252bc97 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -238,7 +238,7 @@ struct tile_register_t WP_TILE_SHARED Type scratch; - // ensure any prevoiusly scheduled threads have finished reading from scratch + // ensure any previously scheduled threads have finished reading from scratch WP_TILE_SYNC(); if (threadIdx.x == thread) @@ -1063,4 +1063,3 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ } while (0) } // namespace wp - diff --git a/warp/native/warp.cu b/warp/native/warp.cu index 76f7b97f..7ae7b634 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -3121,10 +3121,19 @@ size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_block { ContextGuard guard(context); + if (tile_size <= 0) + { +#if defined(_DEBUG) + fprintf(stderr, "Warp warning: Got tile_size %d. Setting to 256.\n", dim, tile_size); +#endif + tile_size = 256; + } + const int block_dim = tile_size; + // CUDA specs up to compute capability 9.0 says the max x-dim grid is 2**31-1, so // grid_dim is fine as an int for the near future - int grid_dim = dim; + int grid_dim = (dim + block_dim - 1)/block_dim; if (max_blocks <= 0) { max_blocks = 2147483647; diff --git a/warp/stubs.py b/warp/stubs.py index 1a41fd5f..b2511920 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -888,61 +888,202 @@ def spatial_mass( @over def tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile: - """Allocate a tile local block of zero'd memory""" + """Allocates a tile of zero initialized items. + + :param m: Size of the first dimension of the output tile + :param n: Size of the second dimension of the output tile + :param dtype: Datatype of output tile's elements + :returns: A zero initialized tile with ``shape=(m,n)`` and the specified datatype + """ + ... 
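As an illustration of the allocation and store builtins documented in these stubs, the following hypothetical kernel clears a block of an output array with a zero-initialized tile; the tile size, launch shape, and ``TILE_THREADS`` constant are assumptions, not taken from the patch:

.. code-block:: python

    import warp as wp

    TILE_THREADS = 32  # illustrative number of cooperating threads per block

    @wp.kernel
    def clear_block(out: wp.array2d(dtype=float)):
        t = wp.tile_zeros(m=16, n=16, dtype=float)
        # cooperative store of the (16, 16) tile at tile offset (0, 0)
        wp.tile_store(out, 0, 0, t)

    out = wp.ones(shape=(16, 16), dtype=float)
    wp.launch(clear_block, dim=[1, TILE_THREADS], inputs=[out], block_dim=TILE_THREADS)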
+ + +@over +def tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile: + """Allocates a tile of one initialized items. + + :param m: Size of the first dimension of the output tile + :param n: Size of the second dimension of the output tile + :param dtype: Datatype of output tile's elements + :returns: A one initialized tile with ``shape=(m,n)`` and the specified dtype + """ + ... + + +@over +def tile_arange(*args: Scalar, dtype: Scalar) -> Tile: + """Generates a tile of linearly spaced elements. + + :param args: Variable length positional arguments, interpreted as: + + - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` + - ``(start, stop)``: Generates values from ``start`` to ``stop - 1`` + - ``(start, stop, step)``: Generates values from ``start`` to ``stop - 1`` with a step size + + :param dtype: Datatype of output tile's elements (optional, default: int) + :returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype + """ ... @over def tile_load(a: Array[Any], x: int32, y: int32, m: int32, n: int32) -> Tile: - """Load a tile of size (m, n) worth of data from array a from offset (i=x*m, j=y*n)""" + """Loads a tile from a global memory array. + + This method will cooperatively load a tile from global memory using all threads in the block. + + :param a: The source array in global memory + :param x: Offset in the source array measured in multiples of ``m``, i.e.: ``i=x*m`` + :param y: Offset in the source array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param m: The size of the tile's first dimension + :param n: The size of the tile's second dimensions + :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array + """ ... @over def tile_store(a: Array[Any], x: int32, y: int32, t: Any): - """Store tile `t` to an array `a` at offset `(i=x*m, j=y*n)`""" + """Stores a tile to a global memory array. + + This method will cooperatively store a tile to global memory using all threads in the block. + + :param a: The destination array in global memory + :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*m`` + :param y: Offset in the destination array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param t: The source tile to store data from, must have the same dtype as the destination array + """ ... @over def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile: - """Atomically add a tile `t` worth of data to array `a` at offset `(i=x*m, j=y*n)`""" + """Atomically add a tile to the array `a`, each element will be updated atomically. + + :param a: Array in global memory, should have the same ``dtype`` as the input tile + :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*M`` where ``M`` is the first tile dimension + :param y: Offset in the destination array measured in multiples of ``n``, i.e.: ``j=y*N`` where ``N`` is the second tile dimension + :param t: Source tile to add to the destination array + :returns: A tile with the same dimensions and type as the source tile, holding the original value of the destination elements + """ ... @over def tile(x: Any) -> Tile: - """Construct a Tile from a per-thread kernel value, returns a tile with dimensions of `(1, block_dim)` where block_dim is the number of threads specified in `wp.launch()`""" - ... + """Constructs a new Tile from a per-thread kernel values. + This function converts values computed using scalar kernel code to a tile representation for input into collective operations. 
-@over -def tile_extract(a: Tile, i: int32, j: int32): - """Extract element at index (i, j) of the tile and return the native type""" + :param x: A per-thread local value, e.g.: scalar, vector, or matrix. + :returns: A tile with ``shape=(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch()``. + + This example shows how to create a linear sequence from thread variables: + + .. code-block:: python + + @wp.kernel + def compute(): + i = wp.tid() + t = wp.tile(i * 2) + print(t) + + + wp.launch(compute, dim=16, inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] + + """ ... @over -def tile_matmul(a: Tile, b: Tile, out: Tile): - """Compute matrix product and accumulate out += a*b.""" +def tile_extract(a: Tile, i: int32, j: int32) -> Scalar: + """Extracts a single element from the tile and returns it as a scalar type. + + This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. + + :param a: Tile to extract the element from + :param i: Coordinate of element on first dimension + :param j: Coordinate of element on the second dimension + :returns: The value of the element at the specified tile location, with the same type as the input tile's per-element dtype + """ ... @over -def tile_sum(a: Tile): - """Computes the sum of all elements in the tile, returns a 1x1 tile, axis is currently ignored""" +def tile_sum(a: Tile) -> Tile: + """Cooperatively compute the sum the tile elements using all threads in the block. + + :param a: The tile to compute the sum of + :returns: A single element tile with dimensions of (1,1) holding the sum + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + t = wp.tile_ones(dtype=float, m=16, n=16) + s = wp.tile_sum(t) + + print(t) + + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[256]] + + + """ ... @over -def tile_map(op: Callable, a: Any): - """Map the operation onto each element of the tile""" +def tile_map(op: Callable, a: Any) -> Tile: + """Apply a unary function onto the tile. + + This function cooperatively applies a unary function to each element of the tile using all threads in the block. + + :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin + :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + t = wp.tile_arange(0.0, 1.0, 0.1, dtype=float) + s = wp.tile_map(wp.sin, t) + + print(s) + + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=10, storage=register) = [[0 0.0998334 0.198669 0.29552 ...]] + + """ ... @over -def tile_map(op: Callable, a: Any, b: Any): - """Map the operation onto each element of the tile""" +def tile_map(op: Callable, a: Any, b: Any) -> Tile: + """Apply the binary map operation onto each corresponding pair of elements from each the tile.""" ... 
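Putting the stubs above together, a hypothetical kernel can chain ``tile_arange()``, ``tile_map()``, ``tile_sum()``, and ``tile_store()``; the kernel name, block size, and output shape below are illustrative:

.. code-block:: python

    import warp as wp

    TILE_DIM = 32  # illustrative block size

    @wp.kernel
    def sum_of_sines(out: wp.array(dtype=float)):
        t = wp.tile_arange(0.0, 1.0, 0.1, dtype=float)  # (1, 10) tile of linearly spaced values
        s = wp.tile_map(wp.sin, t)                       # cooperative elementwise map
        total = wp.tile_sum(s)                           # (1, 1) tile holding the sum
        wp.tile_store(out, 0, 0, total)

    out = wp.zeros(1, dtype=float)
    wp.launch(sum_of_sines, dim=[1, TILE_DIM], inputs=[out], block_dim=TILE_DIM)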
@@ -1837,217 +1978,145 @@ def atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: in @over def atomic_min(arr: Array[Any], i: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: FabricArray[Any], i: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" ... 
@over def atomic_min(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: Array[Any], i: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: FabricArray[Any], i: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. - """ + """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" ... @@ -2144,7 +2213,7 @@ def add(a: Transformation[Scalar], b: Transformation[Scalar]) -> Transformation[ @over -def add(a: Tile, b: Tile): +def add(a: Tile, b: Tile) -> Tile: """Add each element of two tiles together""" ... 
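A short sketch of the ``atomic_min()``/``atomic_max()`` overloads documented above; both return the previous element value, which is simply discarded here, and the array contents are illustrative:

.. code-block:: python

    import warp as wp

    @wp.kernel
    def min_max(values: wp.array(dtype=float), bounds: wp.array(dtype=float)):
        i = wp.tid()
        # bounds[0] tracks the running minimum, bounds[1] the running maximum
        wp.atomic_min(bounds, 0, values[i])
        wp.atomic_max(bounds, 1, values[i])

    values = wp.array([3.0, -1.0, 7.0, 2.0], dtype=float)
    bounds = wp.array([1.0e30, -1.0e30], dtype=float)  # seed with +/- "infinity"
    wp.launch(min_max, dim=4, inputs=[values, bounds])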
@@ -2486,18 +2555,18 @@ def unot(a: Array[Any]) -> bool: @over -def tile_matmul_dx(a: Tile, b: Tile, out: Tile): +def tile_matmul_dx(a: Tile, b: Tile, out: Tile) -> Tile: """Compute matrix product and accumulate out += a*b.""" ... @over -def tile_fft_dx(inout: Tile): +def tile_fft_dx(inout: Tile) -> Tile: """Compute the FFT along the second dimension of a 2D tile of data.""" ... @over -def tile_ifft_dx(inout: Tile): +def tile_ifft_dx(inout: Tile) -> Tile: """Compute the inverse FFT along the second dimension of a 2D tile of data.""" ... diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index ed47b4a3..bc991c77 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -182,7 +182,6 @@ def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.tile_store(C[i], 0, 0, sum) -@unittest.expectedFailure def test_tile_grouped_gemm(test, device): batch_count = 56 diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 00b8b301..c343e353 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -71,13 +71,12 @@ def tile_reduce_1d_kernel(output: wp.array(dtype=int)): t = wp.tile(i) # convert to block wide tile s = wp.tile_sum(t) # sum over block - + # update global sum wp.tile_atomic_add(output, 0, 0, s) def test_tile_reduce_1d(test, device): - # use an unaligned grid dimension N = int(TILE_DIM * 3 / 2) @@ -85,27 +84,25 @@ def test_tile_reduce_1d(test, device): with wp.Tape() as tape: wp.launch(tile_reduce_1d_kernel, dim=[N], inputs=[output], block_dim=TILE_DIM, device=device) - - test.assertAlmostEqual(output.numpy()[0], np.sum(np.arange(N))) + test.assertEqual(output.numpy()[0], np.sum(np.arange(N))) @wp.kernel def tile_ones_kernel(out: wp.array(dtype=float)): i = wp.tid() - + t = wp.tile_ones(dtype=float, m=16, n=16) s = wp.tile_sum(t) wp.tile_store(out, 0, 0, s) + def test_tile_ones(test, device): - output = wp.zeros(shape=1, dtype=float, device=device) with wp.Tape() as tape: wp.launch(tile_ones_kernel, dim=[1, TILE_DIM], inputs=[output], block_dim=TILE_DIM, device=device) - wp.synchronize() test.assertAlmostEqual(output.numpy()[0], 256.0) @@ -113,7 +110,7 @@ def test_tile_ones(test, device): @wp.kernel def tile_arange_kernel(out: wp.array2d(dtype=int)): i = wp.tid() - + a = wp.tile_arange(17, dtype=int) b = wp.tile_arange(5, 23, dtype=int) c = wp.tile_arange(0, 34, 2, dtype=int) @@ -122,15 +119,15 @@ def tile_arange_kernel(out: wp.array2d(dtype=int)): wp.tile_store(out, 1, 0, b) wp.tile_store(out, 2, 0, c) + def test_tile_arange(test, device): - N = 17 output = wp.zeros(shape=(3, N), dtype=int, device=device) with wp.Tape() as tape: wp.launch(tile_arange_kernel, dim=[1, N], inputs=[output], block_dim=TILE_DIM, device=device) - + assert_np_equal(output.numpy()[0], np.arange(17)) assert_np_equal(output.numpy()[1], np.arange(5, 22)) assert_np_equal(output.numpy()[2], np.arange(0, 34, 2)) @@ -144,7 +141,7 @@ class TestTileReduce(unittest.TestCase): add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices) -add_function_test(TestTileReduce, "test_tile_reduce_1d", test_tile_reduce_1d, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_1d", test_tile_reduce_1d, devices=devices) add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices) add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices) diff --git a/warp/types.py b/warp/types.py index e099119d..7dc725fb 100644 --- a/warp/types.py 
+++ b/warp/types.py @@ -3001,18 +3001,19 @@ class TileZeros(Tile): def __init__(self, dtype, M, N): Tile.__init__(self, dtype, M, N, op="zeros", storage="shared") + class TileRange(Tile): def __init__(self, dtype, start, stop, step): - self.start = start self.stop = stop self.step = step M = 1 - N = int((stop-start)/step) + N = int((stop - start) / step) Tile.__init__(self, dtype, M, N, op="arange", storage="register") + class TileConstant(Tile): def __init__(self, dtype, M, N): Tile.__init__(self, dtype, M, N, op="constant", storage="register") From c51349af6ebab9eeb9e6dc05d575a1e2c7b23e87 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 27 Sep 2024 01:13:49 +0000 Subject: [PATCH 037/102] Cosmetic change to tile_arange() --- warp/builtins.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index e400c364..1caccac6 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1844,14 +1844,18 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a template_args.append(m) template_args.append(n) - # take dtype from stop value - t = return_type.dtype - - start = warp.codegen.Var(label=None, type=t, constant=return_type.start) - stop = warp.codegen.Var(label=None, type=t, constant=return_type.stop) - step = warp.codegen.Var(label=None, type=t, constant=return_type.step) - - return ([start, stop, step], template_args) + # todo: it is somewhat redundant to create new vars here since some of start,stop,step + # already exist depending on which form the function was called by the user + start = warp.codegen.Var(label=None, type=return_type.dtype, constant=return_type.start) + stop = warp.codegen.Var(label=None, type=return_type.dtype, constant=return_type.stop) + step = warp.codegen.Var(label=None, type=return_type.dtype, constant=return_type.step) + + function_args = [] + function_args.append(start) + function_args.append(stop) + function_args.append(step) + + return (function_args, template_args) add_builtin( From 3f31e2c5073d17b48ddc2e926ff7e80ff881b834 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 27 Sep 2024 03:49:42 +0000 Subject: [PATCH 038/102] Fix for test_tile_grouped_gemm Fix for duplicate LTO symbols when using the same GEMM multiple times in a module Remove *_dx() suffix and make them the default, disable scalar tile_matmul() Update docstrings for all tile functions --- docs/modules/functions.rst | 71 ++++++++++++++++++++--- warp/builtins.py | 102 ++++++++++++++++++++++++++++----- warp/codegen.py | 12 +--- warp/context.py | 4 +- warp/native/builtin.h | 85 ++++++++++++++++++--------- warp/native/tile.h | 34 +++++++---- warp/native/tile_gemm.h | 3 +- warp/stubs.py | 77 ++++++++++++++++++++++--- warp/tests/test_tile.py | 10 ++-- warp/tests/test_tile_mathdx.py | 20 +++---- 10 files changed, 322 insertions(+), 96 deletions(-) diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 5d2bc605..061fb5f6 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -952,7 +952,7 @@ Tile Primitives :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype - :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. + :returns: A tile with the same dimensions and datatype as the input tile. 
Example: @@ -966,7 +966,7 @@ Tile Primitives print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch(compute, dim=[16], inputs=[]) Prints: @@ -980,22 +980,77 @@ Tile Primitives :noindex: :nocontentsentry: - Apply the binary map operation onto each corresponding pair of elements from each the tile. + Apply a binary function onto the tile. + + This function cooperatively applies a binary function to each element of the tiles using all threads in the block. + Both input tiles must have the same dimensions and datatype. + + :param op: A callable function that accepts two arguments and returns one argument, all of the same type, may be a user function or builtin + :param a: The first input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :param b: The second input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A tile with the same dimensions and datatype as the input tiles. + + Example: + .. code-block:: python -.. py:function:: tile_matmul_dx(a: Tile, b: Tile, out: Tile) -> Tile + @wp.kernel + def compute(): - Compute matrix product and accumulate out += a*b. + a = wp.tile_arange(0.0, 1.0, 0.1, dtype=float) + b = wp.tile_ones(m=1, n=10, dtype=float) + s = wp.tile_map(wp.add, a, b) + + print(s) + + wp.launch(compute, dim=[16], inputs=[]) + + Prints: + + .. code-block:: text -.. py:function:: tile_fft_dx(inout: Tile) -> Tile + tile(m=1, n=10, storage=register) = [[1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9]] - Compute the FFT along the second dimension of a 2D tile of data. +.. py:function:: tile_matmul(a: Tile, b: Tile, out: Tile) -> Tile -.. py:function:: tile_ifft_dx(inout: Tile) -> Tile + Computes the matrix product and accumulates ``out += a*b``. + + Supported datatypes are: + * fp16, fp32, fp64 (real) + * vec2h, vec2f, vec2d (complex) + + All input and output tiles must have the same datatype, and will be automatically be migrated to shared memory if necessary. + + :param a: A tile with ``shape=(M, K)`` + :param b: A tile with ``shape=(K, N)`` + :param out: A tile with ``shape=(M, N)`` + + + +.. py:function:: tile_fft(inout: Tile) -> Tile + + Compute the forward FFT along the second dimension of a 2D tile of data. + + This function cooperatively computes the forward FFT on a tile of data inplace, treating each row individually. + + Supported datatypes are: + * vec2f, vec2d + + :param inout: The input/output tile + + +.. py:function:: tile_ifft(inout: Tile) -> Tile Compute the inverse FFT along the second dimension of a 2D tile of data. + + This function cooperatively computes the inverse FFT on a tile of data inplace, treating each row individually. + + Supported datatypes are: + * vec2f, vec2d + + :param inout: The input/output tile diff --git a/warp/builtins.py b/warp/builtins.py index 8fe51981..bcc3a573 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2250,7 +2250,7 @@ def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, ar :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype - :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. + :returns: A tile with the same dimensions and datatype as the input tile. 
Example: @@ -2264,7 +2264,7 @@ def compute(): print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch(compute, dim=[16], inputs=[]) Prints: @@ -2311,7 +2311,37 @@ def tile_binary_map_value_func(arg_types, arg_values): # dispatch_func=tile_map_dispatch_func, # variadic=True, native_func="tile_binary_map", - doc="Apply the binary map operation onto each corresponding pair of elements from each the tile.", + doc="""Apply a binary function onto the tile. + + This function cooperatively applies a binary function to each element of the tiles using all threads in the block. + Both input tiles must have the same dimensions and datatype. + + :param op: A callable function that accepts two arguments and returns one argument, all of the same type, may be a user function or builtin + :param a: The first input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :param b: The second input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A tile with the same dimensions and datatype as the input tiles. + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + a = wp.tile_arange(0.0, 1.0, 0.1, dtype=float) + b = wp.tile_ones(m=1, n=10, dtype=float) + + s = wp.tile_map(wp.add, a, b) + + print(s) + + wp.launch(compute, dim=[16], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=10, storage=register) = [[1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9]]""", group="Tile Primitives", export=False, ) @@ -5023,8 +5053,8 @@ def tile_matmul_generic_value_func(arg_types, arg_values): return None -def tile_matmul_generic_dispatch_func( - arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any] +def tile_matmul_generic_lto_dispatch_func( + arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any], builder: warp.context.ModuleBuilder ): a = arg_values["a"] b = arg_values["b"] @@ -5097,6 +5127,12 @@ def make_transpose(t): raise RuntimeError("Invalid transpose mode") lto_symbol = f"dot_{M}_{N}_{K}_{tA}_{tB}_{precision}_{element_type}" + + # early out if LTO for this combination already exists for this module + if lto_symbol in builder.ltoirs: + return lto_symbol, builder.ltoirs[lto_symbol] + + # otherwise compile LTO lto_code = tempfile.NamedTemporaryFile() include_dirs = get_cuda_include_dirs() result = warp.context.runtime.core.cuda_compile_dot( @@ -5120,6 +5156,8 @@ def make_transpose(t): else: with open(lto_code.name, "rb") as f: lto_code = f.read() + + builder.ltoirs[lto_symbol] = lto_code return lto_symbol, lto_code (fun_forward, lto_forward) = make_function(M, N, K, "N", "N") # C += A * B @@ -5142,12 +5180,24 @@ def make_transpose(t): add_builtin( - "tile_matmul_dx", + "tile_matmul", input_types={"a": Tile, "b": Tile, "out": Tile}, value_func=tile_matmul_generic_value_func, - lto_dispatch_func=tile_matmul_generic_dispatch_func, + lto_dispatch_func=tile_matmul_generic_lto_dispatch_func, variadic=True, - doc="Compute matrix product and accumulate out += a*b.", + doc="""Computes the matrix product and accumulates ``out += a*b``. + + Supported datatypes are: + * fp16, fp32, fp64 (real) + * vec2h, vec2f, vec2d (complex) + + All input and output tiles must have the same datatype. Tile data will be automatically be migrated + to shared memory if necessary and will use TensoreCore operations when available. 
+ + :param a: A tile with ``shape=(M, K)`` + :param b: A tile with ``shape=(K, N)`` + :param out: A tile with ``shape=(M, N)`` + """, group="Tile Primitives", export=False, namespace="", @@ -5173,11 +5223,12 @@ def tile_fft_generic_value_func(arg_types, arg_values): return None -def tile_fft_generic_dispatch_func( +def tile_fft_generic_lto_dispatch_func( arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any], + builder: warp.context.ModuleBuilder, direction: str = None, ): inout = arg_values["inout"] @@ -5213,6 +5264,11 @@ def tile_fft_generic_dispatch_func( ept = size // num_threads lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}" + # early out if LTO for this combination already exists for this module + if lto_symbol in builder.ltoirs: + return lto_symbol, builder.ltoirs[lto_symbol] + + # otherwise compile LTO lto_code = tempfile.NamedTemporaryFile() shared_memory_size = ctypes.c_int(0) @@ -5238,6 +5294,8 @@ def tile_fft_generic_dispatch_func( with open(lto_code.name, "rb") as f: lto_code = f.read() + builder.ltoirs[lto_symbol] = lto_code + return ( ( Var(lto_symbol, str, False, True, False), @@ -5253,24 +5311,38 @@ def tile_fft_generic_dispatch_func( add_builtin( - "tile_fft_dx", + "tile_fft", input_types={"inout": Tile}, value_func=tile_fft_generic_value_func, - lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction="forward"), + lto_dispatch_func=functools.partial(tile_fft_generic_lto_dispatch_func, direction="forward"), variadic=True, - doc="Compute the FFT along the second dimension of a 2D tile of data.", + doc="""Compute the forward FFT along the second dimension of a 2D tile of data. + + This function cooperatively computes the forward FFT on a tile of data inplace, treating each row individually. + + Supported datatypes are: + * vec2f, vec2d + + :param inout: The input/output tile""", group="Tile Primitives", export=False, namespace="", ) add_builtin( - "tile_ifft_dx", + "tile_ifft", input_types={"inout": Tile}, value_func=tile_fft_generic_value_func, - lto_dispatch_func=functools.partial(tile_fft_generic_dispatch_func, direction="inverse"), + lto_dispatch_func=functools.partial(tile_fft_generic_lto_dispatch_func, direction="inverse"), variadic=True, - doc="Compute the inverse FFT along the second dimension of a 2D tile of data.", + doc="""Compute the inverse FFT along the second dimension of a 2D tile of data. + + This function cooperatively computes the inverse FFT on a tile of data inplace, treating each row individually. 
+ + Supported datatypes are: + * vec2f, vec2d + + :param inout: The input/output tile""", group="Tile Primitives", export=False, namespace="", diff --git a/warp/codegen.py b/warp/codegen.py index f9a47f25..0eb26c1b 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -940,9 +940,6 @@ def build(adj, builder, default_builder_options=None): # used to generate new label indices adj.label_count = 0 - # collect ltoirs - adj.ltoirs = [] - # update symbol map for each argument for a in adj.args: adj.symbols[a.label] = a @@ -968,8 +965,6 @@ def build(adj, builder, default_builder_options=None): elif isinstance(a.type, warp.types.array) and isinstance(a.type.dtype, Struct): builder.build_struct_recursive(a.type.dtype) - builder.ltoirs.extend(adj.ltoirs) - # code generation methods def format_template(adj, template, input_vars, output_var): # output var is always the 0th index @@ -1280,9 +1275,8 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): # a literal value or references a variable. if func.lto_dispatch_func is not None: func_args, template_args, ltoirs = func.lto_dispatch_func( - func.input_types, return_type, bound_args, options=adj.builder_options + func.input_types, return_type, bound_args, options=adj.builder_options, builder=adj.builder ) - adj.ltoirs.extend(ltoirs) elif func.dispatch_func is not None: func_args, template_args = func.dispatch_func(func.input_types, return_type, bound_args) else: @@ -2759,7 +2753,7 @@ def get_references(adj) -> Dict[str, Any]: #define int(x) cast_int(x) #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret) -#define builtin_tid1d() wp::tid(task_index) +#define builtin_tid1d() wp::tid(task_index, dim) #define builtin_tid2d(x, y) wp::tid(x, y, task_index, dim) #define builtin_tid3d(x, y, z) wp::tid(x, y, z, task_index, dim) #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, task_index, dim) @@ -2778,7 +2772,7 @@ def get_references(adj) -> Dict[str, Any]: #define int(x) cast_int(x) #define adj_int(x, adj_x, adj_ret) adj_cast_int(x, adj_x, adj_ret) -#define builtin_tid1d() wp::tid(_idx) +#define builtin_tid1d() wp::tid(_idx, dim) #define builtin_tid2d(x, y) wp::tid(x, y, _idx, dim) #define builtin_tid3d(x, y, z) wp::tid(x, y, z, _idx, dim) #define builtin_tid4d(x, y, z, w) wp::tid(x, y, z, w, _idx, dim) diff --git a/warp/context.py b/warp/context.py index 55b42f3d..ad18d270 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1541,7 +1541,7 @@ def __init__(self, module, options, hasher=None): self.options = options self.module = module self.deferred_functions = [] - self.ltoirs = [] + self.ltoirs = {} # map from lto symbol to lto binary if hasher is None: hasher = ModuleHasher(module) @@ -2024,7 +2024,7 @@ def load(self, device, block_dim=None) -> ModuleExec: config=self.options["mode"], fast_math=self.options["fast_math"], verify_fp=warp.config.verify_fp, - ltoirs=builder.ltoirs, + ltoirs=builder.ltoirs.values(), ) except Exception as e: diff --git a/warp/native/builtin.h b/warp/native/builtin.h index 7d1ac8d9..bf12b765 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1145,7 +1145,47 @@ struct launch_bounds_t size_t size; // total number of threads }; -inline CUDA_CALLABLE int tid(size_t index) +// represents coordinate in the launch grid +struct launch_coord_t +{ + int i; + int j; + int k; + int l; +}; + +// unravels a linear thread index to the corresponding launch grid coord (up to 4d) +inline CUDA_CALLABLE launch_coord_t launch_coord(size_t linear, const launch_bounds_t& bounds) +{ + launch_coord_t 
coord = {0, 0, 0, 0}; + + if (bounds.ndim > 3) + { + coord.l = linear%bounds.shape[3]; + linear /= bounds.shape[3]; + } + + if (bounds.ndim > 2) + { + coord.k = linear%bounds.shape[2]; + linear /= bounds.shape[2]; + } + + if (bounds.ndim > 1) + { + coord.j = linear%bounds.shape[1]; + linear /= bounds.shape[1]; + } + + if (bounds.ndim > 0) + { + coord.i = linear; + } + + return coord; +} + +inline CUDA_CALLABLE int tid(size_t index, const launch_bounds_t& bounds) { // For the 1-D tid() we need to warn the user if we're about to provide a truncated index // Only do this in _DEBUG when called from device to avoid excessive register allocation @@ -1154,40 +1194,33 @@ inline CUDA_CALLABLE int tid(size_t index) printf("Warp warning: tid() is returning an overflowed int\n"); } #endif - return static_cast(index); + + launch_coord_t c = launch_coord(index, bounds); + return static_cast(c.i); } -inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& launch_bounds) +inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& bounds) { - const size_t n = launch_bounds.shape[1]; - - // convert to work item - i = index/n; - j = index%n; + launch_coord_t c = launch_coord(index, bounds); + i = c.i; + j = c.j; } -inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& launch_bounds) +inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& bounds) { - const size_t n = launch_bounds.shape[1]; - const size_t o = launch_bounds.shape[2]; - - // convert to work item - i = index/(n*o); - j = index%(n*o)/o; - k = index%o; + launch_coord_t c = launch_coord(index, bounds); + i = c.i; + j = c.j; + k = c.k; } -inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& launch_bounds) +inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& bounds) { - const size_t n = launch_bounds.shape[1]; - const size_t o = launch_bounds.shape[2]; - const size_t p = launch_bounds.shape[3]; - - // convert to work item - i = index/(n*o*p); - j = index%(n*o*p)/(o*p); - k = index%(o*p)/p; - l = index%p; + launch_coord_t c = launch_coord(index, bounds); + i = c.i; + j = c.j; + k = c.k; + l = c.l; } template diff --git a/warp/native/tile.h b/warp/native/tile.h index 4252bc97..f3b1eea5 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -36,13 +36,20 @@ [ ] Layouts [x] Simple [ ] Cute - [ ] Remove Alloc type from tile_shared_t - + [x] Remove Alloc type from tile_shared_t + [ ] wp.launch_tiled() helper +[ ] Creation + [x] zeros + [x] ones + [x] arange + [x] tile() + [ ] untile() + [ ] explicit storage [ ] Load/Store [ ] 1D load/store variants [ ] max_coord option for non-aligned loads [ ] Indexed load - [ ] wp.tile_atomic_add() + [x] wp.tile_atomic_add() [ ] Maps [x] Support user functions [x] Support built-in functions @@ -58,6 +65,9 @@ [x] MatMul [x] Forward [x] Reverse +[ ] Operators + [ ] +, -, *, /, @? 
+ [ ] += for matmul, e.g.: c += a@b, or c = a@b [ ] Reshape [ ] Broadcasting [ ] Transpose @@ -66,7 +76,7 @@ [ ] Slice [ ] Runtime [x] Compile-time block dimensions - [ ] Switch between SIMT / Tile based execution if `tile_dim` not provided to wp.launch() + [x] Switch between SIMT / Tile based execution if `tile_dim` not provided to wp.launch() [ ] Examples [ ] GEMM [ ] Batched MLP @@ -1011,7 +1021,7 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ // But cuBLASDx follows the BLAS convention: matrices are col-major, so we swap A & B in the code below -#define tile_matmul_dx(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C) \ +#define tile_matmul(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C) \ do { \ void fun_forward(dtype, dtype*, dtype*, dtype, dtype*); \ WP_TILE_SYNC(); \ @@ -1021,7 +1031,7 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ // adj_fun_forward, adj_fun_backward_A, adj_fun_backward_B, adj_dtype are in practice ignored // but are here because builtins.py creates them even though those are effectively compile time constants -#define adj_tile_matmul_dx(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C, \ +#define adj_tile_matmul(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C, \ adj_fun_forward, adj_fun_backward_A, adj_fun_backward_B, adj_dtype, \ adjA, adjB, adjC) \ do { \ @@ -1033,7 +1043,7 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ WP_TILE_SYNC(); \ } while (0) -#define tile_fft_dx(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \ +#define tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \ do { \ void function_name(dtype*, dtype*); \ WP_TILE_SHARED __align__(16) char buffer[shared_memory_size]; \ @@ -1044,22 +1054,22 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ } \ } while (0) -#define tile_ifft_dx tile_fft_dx +#define tile_ifft tile_fft // adj_function_name, adj_dtype, adj_shared_memory_size, adj_batch_size, adj_ept are all ignored -#define adj_tile_fft_dx(function_name, dtype, shared_memory_size, batch_size, ept, Xinout, \ +#define adj_tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout, \ adj_function_name, adj_dtype, adj_shared_memory_size, adj_batch_size, adj_ept, \ adj_Xinout) \ do { \ - tile_ifft_dx(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ + tile_ifft(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ } while (0) -#define adj_tile_ifft_dx(function_name, dtype, shared_memory_size, batch_size, ept, Xinout, \ +#define adj_tile_ifft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout, \ adj_function_name, adj_dtype, adj_shared_memory_size, adj_batch_size, adj_ept, \ adj_Xinout) \ do { \ - tile_fft_dx(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ + tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ } while (0) } // namespace wp diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index 3aa3dbe7..c033330a 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -305,6 +305,7 @@ inline CUDA_CALLABLE void tile_matmul(const array_t& A, const array_t& B, #endif // USE_CUTE +#if 0 template void tile_matmul(TileA& a, TileB& b, TileC& c) @@ -327,6 +328,6 @@ void adj_tile_matmul(TileA& a, TileB& b, TileC& c, tile_matmul_scalar(wp::tile_transpose(a), adj_c, adj_b); } - 
+#endif // 0 } // namespace wp diff --git a/warp/stubs.py b/warp/stubs.py index b2511920..c5b6fbf3 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -1055,7 +1055,7 @@ def tile_map(op: Callable, a: Any) -> Tile: :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype - :returns: A tile with the same dimensions as the input tile, currently output tiles must have the same dtype as the input. + :returns: A tile with the same dimensions and datatype as the input tile. Example: @@ -1069,7 +1069,7 @@ def compute(): print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch(compute, dim=[16], inputs=[]) Prints: @@ -1083,7 +1083,38 @@ def compute(): @over def tile_map(op: Callable, a: Any, b: Any) -> Tile: - """Apply the binary map operation onto each corresponding pair of elements from each the tile.""" + """Apply a binary function onto the tile. + + This function cooperatively applies a binary function to each element of the tiles using all threads in the block. + Both input tiles must have the same dimensions and datatype. + + :param op: A callable function that accepts two arguments and returns one argument, all of the same type, may be a user function or builtin + :param a: The first input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :param b: The second input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A tile with the same dimensions and datatype as the input tiles. + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + a = wp.tile_arange(0.0, 1.0, 0.1, dtype=float) + b = wp.tile_ones(m=1, n=10, dtype=float) + + s = wp.tile_map(wp.add, a, b) + + print(s) + + + wp.launch(compute, dim=[16], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=10, storage=register) = [[1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9]] + """ ... @@ -2555,18 +2586,46 @@ def unot(a: Array[Any]) -> bool: @over -def tile_matmul_dx(a: Tile, b: Tile, out: Tile) -> Tile: - """Compute matrix product and accumulate out += a*b.""" +def tile_matmul(a: Tile, b: Tile, out: Tile) -> Tile: + """Computes the matrix product and accumulates ``out += a*b``. + + Supported datatypes are: + * fp16, fp32, fp64 (real) + * vec2h, vec2f, vec2d (complex) + + All input and output tiles must have the same datatype, and will be automatically be migrated to shared memory if necessary. + + :param a: A tile with ``shape=(M, K)`` + :param b: A tile with ``shape=(K, N)`` + :param out: A tile with ``shape=(M, N)`` + + """ ... @over -def tile_fft_dx(inout: Tile) -> Tile: - """Compute the FFT along the second dimension of a 2D tile of data.""" +def tile_fft(inout: Tile) -> Tile: + """Compute the forward FFT along the second dimension of a 2D tile of data. + + This function cooperatively computes the forward FFT on a tile of data inplace, treating each row individually. + + Supported datatypes are: + * vec2f, vec2d + + :param inout: The input/output tile + """ ... @over -def tile_ifft_dx(inout: Tile) -> Tile: - """Compute the inverse FFT along the second dimension of a 2D tile of data.""" +def tile_ifft(inout: Tile) -> Tile: + """Compute the inverse FFT along the second dimension of a 2D tile of data. + + This function cooperatively computes the inverse FFT on a tile of data inplace, treating each row individually. 
+ + Supported datatypes are: + * vec2f, vec2d + + :param inout: The input/output tile + """ ... diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index bc991c77..f6822a0e 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -196,12 +196,14 @@ def test_tile_grouped_gemm(test, device): A_wp = wp.array(A, requires_grad=True, device=device) B_wp = wp.array(B, requires_grad=True, device=device) - C_wp = wp.array(C, requires_grad=True, device=device) + C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( - tile_grouped_gemm, dim=[batch_count, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device - ) + wp.launch(tile_grouped_gemm, + dim=[batch_count, TILE_DIM], + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_DIM, + device=device) # TODO: 32 mismatched elements assert_np_equal(C_wp.numpy(), C) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index 6cf4b7c1..229ce074 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -25,18 +25,18 @@ @wp.kernel() -def tile_math_dx_matmul_kernel( +def tile_math_matmul_kernel( ga: wp.array2d(dtype=wp.float64), gb: wp.array2d(dtype=wp.float64), gc: wp.array2d(dtype=wp.float64) ): i, j, _ = wp.tid() a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K) b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N) c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64) - wp.tile_matmul_dx(a, b, c) + wp.tile_matmul(a, b, c) wp.tile_store(gc, i, j, c) -def test_tile_math_dx_matmul(test, device): +def test_tile_math_matmul(test, device): rng = np.random.default_rng(42) A = rng.random((TILE_M, TILE_K), dtype=np.float64) @@ -49,7 +49,7 @@ def test_tile_math_dx_matmul(test, device): with wp.Tape() as tape: wp.launch( - tile_math_dx_matmul_kernel, + tile_math_matmul_kernel, dim=[1, 1, TILE_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, @@ -68,14 +68,14 @@ def test_tile_math_dx_matmul(test, device): @wp.kernel() -def tile_math_dx_fft_kernel(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)): +def tile_math_fft_kernel(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)): i, j, _ = wp.tid() xy = wp.tile_load(gx, i, j, m=N_FFT, n=N_FFT) - wp.tile_fft_dx(xy) + wp.tile_fft(xy) wp.tile_store(gy, i, j, xy) -def test_tile_math_dx_fft(test, device): +def test_tile_math_fft(test, device): rng = np.random.default_rng(42) # Warp doesn't really have a complex64 type, @@ -91,7 +91,7 @@ def test_tile_math_dx_fft(test, device): Y_c64 = np.fft.fft(X_c64, axis=-1) with wp.Tape() as tape: - wp.launch(tile_math_dx_fft_kernel, dim=[1, 1, TILE_DIM], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device) + wp.launch(tile_math_fft_kernel, dim=[1, 1, TILE_DIM], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device) Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) @@ -108,8 +108,8 @@ class TestTileMathDx(unittest.TestCase): pass -add_function_test(TestTileMathDx, "test_tile_math_dx_matmul", test_tile_math_dx_matmul, devices=devices) -add_function_test(TestTileMathDx, "test_tile_math_dx_fft", test_tile_math_dx_fft, devices=devices) +add_function_test(TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=devices) +add_function_test(TestTileMathDx, "test_tile_math_fft", test_tile_math_fft, devices=devices) if __name__ == "__main__": wp.clear_kernel_cache() From b3a409263d9fe4ff29b21c7c395be807d7c423b2 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 27 Sep 2024 04:57:59 +0000 Subject: 
[PATCH 039/102] Add a wp.tiled_launch() helper to launch grids with a trailing tile dim --- docs/modules/functions.rst | 3 +- docs/modules/runtime.rst | 2 ++ warp/__init__.py | 1 + warp/context.py | 41 ++++++++++++++++++++++ warp/stubs.py | 4 ++- warp/tests/test_misc.py | 63 ++++++++++++++++++++++++++++++++++ warp/tests/test_tile.py | 56 ++++++++++++++++-------------- warp/tests/test_tile_mathdx.py | 15 +++++--- warp/tests/test_tile_reduce.py | 27 ++++++--------- 9 files changed, 164 insertions(+), 48 deletions(-) create mode 100644 warp/tests/test_misc.py diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 061fb5f6..4f328880 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -1021,7 +1021,8 @@ Tile Primitives * fp16, fp32, fp64 (real) * vec2h, vec2f, vec2d (complex) - All input and output tiles must have the same datatype, and will be automatically be migrated to shared memory if necessary. + All input and output tiles must have the same datatype. Tile data will be automatically be migrated + to shared memory if necessary and will use TensoreCore operations when available. :param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` diff --git a/docs/modules/runtime.rst b/docs/modules/runtime.rst index 4d0d4fad..05c63d43 100644 --- a/docs/modules/runtime.rst +++ b/docs/modules/runtime.rst @@ -38,6 +38,8 @@ The location of the kernel cache is printed when Warp is initialized. generated compilation artifacts as Warp does not automatically try to keep the cache below a certain size. .. autofunction:: launch +.. autofunction:: launch_tiled + .. autofunction:: clear_kernel_cache .. _Runtime Kernel Creation: diff --git a/warp/__init__.py b/warp/__init__.py index 76672327..8ecda0c1 100644 --- a/warp/__init__.py +++ b/warp/__init__.py @@ -58,6 +58,7 @@ copy, from_numpy, launch, + launch_tiled, synchronize, force_load, load_module, diff --git a/warp/context.py b/warp/context.py index ad18d270..efcf7fd6 100644 --- a/warp/context.py +++ b/warp/context.py @@ -5176,6 +5176,47 @@ def pack_args(args, params, adjoint=False): if warp.config.verify_autograd_array_access: runtime.tape._check_kernel_array_access(kernel, fwd_args) +def launch_tiled(*args, **kwargs): + """A helper method for launching a grid with an extra trailing dimension equal to the block size. + + For example, to launch a 2D grid, where each element has 64 threads assigned you would use the following: + + .. code-block:: python + + wp.launch_tiled(kernel, [M, N], inputs=[...], block_dim=64) + + Which is equivalent to the following: + + .. code-block:: python + + wp.launch(kernel, [M, N, 64], inputs=[...], block_dim=64) + + Inside your kernel code you can retrieve the first two indices of the thread as usual, ignoring the implicit third dimension if desired: + + .. code-block:: python + + @wp.kernel + def compute() + + i, j = wp.tid() + + ... 
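+
+    If the trailing index is needed it can be retrieved by unpacking a third value from
+    ``wp.tid()`` (a sketch; here the name ``t`` is illustrative and refers to the thread's
+    index within the block-sized trailing dimension):
+
+    .. code-block:: python
+
+        @wp.kernel
+        def compute():
+
+            # i, j index the launch grid, t is in [0, block_dim)
+            i, j, t = wp.tid()
+
+            ...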
+ """ + + if len(kwargs["dim"]) > 3: + raise RuntimeError("wp.launch_tiled() requires a grid with fewer than 4 dimensions") + + # promote dim to a list in case it was passed as a scalar or tuple + dim = kwargs["dim"] + if not isinstance(dim, list): + dim = list(dim) if isinstance(dim, tuple) else [dim] + + # add trailing dimension + kwargs["dim"] = dim + [kwargs["block_dim"]] + + # forward to original launch method + launch(*args, **kwargs) + def synchronize(): """Manually synchronize the calling CPU thread with any outstanding CUDA work on all devices diff --git a/warp/stubs.py b/warp/stubs.py index c5b6fbf3..a0e38e91 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -67,6 +67,7 @@ copy, from_numpy, launch, + launch_tiled, synchronize, force_load, load_module, @@ -2593,7 +2594,8 @@ def tile_matmul(a: Tile, b: Tile, out: Tile) -> Tile: * fp16, fp32, fp64 (real) * vec2h, vec2f, vec2d (complex) - All input and output tiles must have the same datatype, and will be automatically be migrated to shared memory if necessary. + All input and output tiles must have the same datatype. Tile data will be automatically be migrated + to shared memory if necessary and will use TensoreCore operations when available. :param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` diff --git a/warp/tests/test_misc.py b/warp/tests/test_misc.py new file mode 100644 index 00000000..de9e5fc4 --- /dev/null +++ b/warp/tests/test_misc.py @@ -0,0 +1,63 @@ +import numpy as np +import warp as wp + +wp.clear_kernel_cache() + +TILE_M = wp.constant(8) +TILE_N = wp.constant(4) +TILE_K = wp.constant(8) + +# num threads per-tile +TILE_DIM = 64 + + +@wp.kernel +def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)): + # output tile index + i = wp.tid() + + a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K) + b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N) + + print(a) + print(b) + + # sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + + # wp.tile_matmul(a, b, sum) + + # print(sum) + + # wp.tile_store(C[i], 0, 0, sum) + + +batch_count = 1 + +M = TILE_M +N = TILE_N +K = TILE_K + +device = "cuda:0" + +rng = np.random.default_rng(42) +A = rng.random((batch_count, M, K), dtype=np.float32) +B = rng.random((batch_count, K, N), dtype=np.float32) +C = A @ B + +A_wp = wp.array(A, requires_grad=True, device=device) +B_wp = wp.array(B, requires_grad=True, device=device) +C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device) + +with wp.Tape() as tape: + wp.launch(tile_grouped_gemm, + dim=[batch_count, TILE_DIM], + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_DIM, + device=device) + +wp.synchronize() + +# TODO: 32 mismatched elements +#assert_np_equal(C_wp.numpy(), C) +#print(C_wp.numpy()) + diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index f6822a0e..f757be22 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -23,7 +23,7 @@ @wp.kernel def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)): # tile index - i, j, _ = wp.tid() + i, j = wp.tid() a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N) wp.tile_store(B, i, j, a) @@ -42,9 +42,9 @@ def test_tile_copy(test, device): B_wp = wp.array(B, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( + wp.launch_tiled( tile_copy, - dim=[int(M / TILE_M), int(N / TILE_N), TILE_DIM], + dim=[int(M / TILE_M), int(N / TILE_N)], inputs=[A_wp, B_wp], block_dim=TILE_DIM, device=device, @@ -68,7 +68,7 @@ def unary_func(x: float): @wp.kernel def 
tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # tile index - i, j, _ = wp.tid() + i, j = wp.tid() a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N) @@ -92,9 +92,9 @@ def test_tile_unary_map(test, device): B_wp = wp.zeros_like(A_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( + wp.launch_tiled( tile_unary_map, - dim=[int(M / TILE_M), int(N / TILE_N), TILE_DIM], + dim=[int(M / TILE_M), int(N / TILE_N)], inputs=[A_wp, B_wp], block_dim=TILE_DIM, device=device, @@ -120,7 +120,7 @@ def tile_binary_map( input_a: wp.array2d(dtype=float), input_b: wp.array2d(dtype=float), output: wp.array2d(dtype=float) ): # tile index - i, j, _ = wp.tid() + i, j = wp.tid() a = wp.tile_load(input_a, i, j, m=TILE_M, n=TILE_N) b = wp.tile_load(input_b, i, j, m=TILE_M, n=TILE_N) @@ -148,9 +148,9 @@ def test_tile_binary_map(test, device): C_wp = wp.zeros_like(A_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( + wp.launch_tiled( tile_binary_map, - dim=[int(M / TILE_M), int(N / TILE_N), TILE_DIM], + dim=[int(M / TILE_M), int(N / TILE_N)], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device, @@ -199,8 +199,8 @@ def test_tile_grouped_gemm(test, device): C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_grouped_gemm, - dim=[batch_count, TILE_DIM], + wp.launch_tiled(tile_grouped_gemm, + dim=[batch_count], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device) @@ -212,7 +212,7 @@ def test_tile_grouped_gemm(test, device): @wp.kernel def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): # output tile index - i, j, _ = wp.tid() + i, j = wp.tid() sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) @@ -247,9 +247,9 @@ def test_tile_gemm(test, device): C_wp = wp.array(C, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( + wp.launch_tiled( tile_gemm, - dim=(int(M / TILE_M), int(N / TILE_N), TILE_DIM), + dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device, @@ -268,7 +268,7 @@ def test_tile_gemm(test, device): @wp.kernel def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)): # output tile index - i, _ = wp.tid() + i = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) @@ -301,9 +301,12 @@ def test_tile_operators(test, device): output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( - tile_operators, dim=[batch_count, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device - ) + wp.launch_tiled( + tile_operators, + dim=[batch_count], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device) assert_np_equal(output_wp.numpy(), output) @@ -317,7 +320,7 @@ def test_tile_operators(test, device): @wp.kernel def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index - i, _ = wp.tid() + i = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) s = wp.tile_sum(a) * 0.5 @@ -338,9 +341,9 @@ def test_tile_sum(test, device): output_wp = wp.zeros(batch_count, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( + wp.launch_tiled( tile_sum_kernel, - dim=[batch_count, TILE_DIM], + dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device, @@ -362,7 +365,7 @@ def test_tile_sum(test, device): @wp.kernel def tile_extract_kernel(input: 
wp.array2d(dtype=float), output: wp.array2d(dtype=float)): # output tile index - i, _ = wp.tid() + i = wp.tid() t = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) @@ -384,9 +387,12 @@ def test_tile_extract(test, device): output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( - tile_extract_kernel, dim=[1, TILE_DIM], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device - ) + wp.launch_tiled( + tile_extract_kernel, + dim=[1], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device) assert_array_equal(output_wp, input_wp) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index 229ce074..92e97ff0 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -28,7 +28,7 @@ def tile_math_matmul_kernel( ga: wp.array2d(dtype=wp.float64), gb: wp.array2d(dtype=wp.float64), gc: wp.array2d(dtype=wp.float64) ): - i, j, _ = wp.tid() + i, j = wp.tid() a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K) b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N) c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64) @@ -48,9 +48,9 @@ def test_tile_math_matmul(test, device): C_wp = wp.array(C, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( + wp.launch_tiled( tile_math_matmul_kernel, - dim=[1, 1, TILE_DIM], + dim=[1, 1], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device, @@ -69,7 +69,7 @@ def test_tile_math_matmul(test, device): @wp.kernel() def tile_math_fft_kernel(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)): - i, j, _ = wp.tid() + i, j = wp.tid() xy = wp.tile_load(gx, i, j, m=N_FFT, n=N_FFT) wp.tile_fft(xy) wp.tile_store(gy, i, j, xy) @@ -91,7 +91,12 @@ def test_tile_math_fft(test, device): Y_c64 = np.fft.fft(X_c64, axis=-1) with wp.Tape() as tape: - wp.launch(tile_math_fft_kernel, dim=[1, 1, TILE_DIM], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device) + wp.launch_tiled( + tile_math_fft_kernel, + dim=[1, 1], + inputs=[X_wp, Y_wp], + block_dim=TILE_DIM, + device=device) Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index c343e353..3f65b7cf 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -21,9 +21,9 @@ @wp.kernel -def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): +def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index - i, _ = wp.tid() + i = wp.tid() a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) s = wp.tile_sum(a) * 0.5 @@ -44,13 +44,7 @@ def test_tile_reduce_sum(test, device): output_wp = wp.zeros(batch_count, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch( - tile_sum_kernel, - dim=[batch_count, TILE_DIM], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device, - ) + wp.launch_tiled(tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device) sum_wp = output_wp.numpy() for i in range(batch_count): @@ -65,8 +59,9 @@ def test_tile_reduce_sum(test, device): @wp.kernel -def tile_reduce_1d_kernel(output: wp.array(dtype=int)): - # output tile index +def tile_reduce_simt_kernel(output: wp.array(dtype=int)): + + # thread index i = wp.tid() t = wp.tile(i) # convert to block wide tile @@ -76,14 +71,14 @@ def tile_reduce_1d_kernel(output: wp.array(dtype=int)): wp.tile_atomic_add(output, 0, 0, s) -def test_tile_reduce_1d(test, device): +def 
test_tile_reduce_simt(test, device): # use an unaligned grid dimension N = int(TILE_DIM * 3 / 2) output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch(tile_reduce_1d_kernel, dim=[N], inputs=[output], block_dim=TILE_DIM, device=device) + wp.launch(tile_reduce_simt_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device) test.assertEqual(output.numpy()[0], np.sum(np.arange(N))) @@ -102,7 +97,7 @@ def test_tile_ones(test, device): output = wp.zeros(shape=1, dtype=float, device=device) with wp.Tape() as tape: - wp.launch(tile_ones_kernel, dim=[1, TILE_DIM], inputs=[output], block_dim=TILE_DIM, device=device) + wp.launch_tiled(tile_ones_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device) test.assertAlmostEqual(output.numpy()[0], 256.0) @@ -126,7 +121,7 @@ def test_tile_arange(test, device): output = wp.zeros(shape=(3, N), dtype=int, device=device) with wp.Tape() as tape: - wp.launch(tile_arange_kernel, dim=[1, N], inputs=[output], block_dim=TILE_DIM, device=device) + wp.launch_tiled(tile_arange_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device) assert_np_equal(output.numpy()[0], np.arange(17)) assert_np_equal(output.numpy()[1], np.arange(5, 22)) @@ -141,7 +136,7 @@ class TestTileReduce(unittest.TestCase): add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices) -add_function_test(TestTileReduce, "test_tile_reduce_1d", test_tile_reduce_1d, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices) add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices) add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices) From 0d795e76f0bf86dddac25f107cc7981594a23dd0 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 27 Sep 2024 10:07:59 +0000 Subject: [PATCH 040/102] Add first pass at documentation section for tiles --- docs/index.rst | 1 + docs/modules/functions.rst | 2 +- docs/modules/tiles.rst | 165 +++++++++++++++++++++++++++++++++++++ warp/builtins.py | 2 +- warp/stubs.py | 2 +- 5 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 docs/modules/tiles.rst diff --git a/docs/index.rst b/docs/index.rst index e3f45fa0..ac324f32 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -351,6 +351,7 @@ Full Table of Contents modules/devices modules/differentiability modules/generics + modules/tiles modules/interoperability configuration debugging diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 4f328880..45a79f07 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -1022,7 +1022,7 @@ Tile Primitives * vec2h, vec2f, vec2d (complex) All input and output tiles must have the same datatype. Tile data will be automatically be migrated - to shared memory if necessary and will use TensoreCore operations when available. + to shared memory if necessary and will use TensorCore operations when available. :param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst new file mode 100644 index 00000000..bf7f40bb --- /dev/null +++ b/docs/modules/tiles.rst @@ -0,0 +1,165 @@ +Tiles (Preview) +=============== + +Block-based programming models such as those in OpenAI Triton have proved to be effective ways of expressing high performance kernels that can leverage cooperative operations on modern GPUs. 
+ +Warp 1.4.0 introduces tile extensions that expose a block-based programming to Warp kernels. + +Execution Model +--------------- + +Warp's execution model allows users to specify an up to 4-dimensional grid of logical threads for kernel execution at launch time. With the introduction of tiles, users can also specify a block size, which partitions the grid into smaller sets of threads that are executed on a single compute unit. + +Inside kernels, tile operations are executed cooperatively across each block of threads, allowing them to take advantage of efficient memory access, local memory, and dedicated hardware units like TensorCores. + +As an example, consider the following kernel: + +.. code:: python + + TILE_SIZE = wp.constant(256) + TILE_THREADS = 64 + + @wp.kernel + def compute(a: array(dtype=float)) + i = wp.tid()/TILE_SIZE + + t = wp.tile_load(array, x=i, n=TILE_SIZE) + ... + + wp.launch(compute, dim=[len(a)], inputs=[a], block_dim=TILE_THREADS) + +Here, we load a 1D tile of 256 values from a global memory array ``a``, where the load operation is performed cooperatively by all 64 threads in the block, as specified by the ``block_dim`` argument to :func:`warp.launch`. In this case each thread is responsible for loading 4 values from global memory, which may then be stored in registers, or shared memory across the block. + +Tile Properties +--------------- + +In Warp, tile objects are 2D arrays of data where the tile elements may be scalars, vectors, matrices, or user defined structures. + +In a more complex example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum: + +.. code:: python + + TILE_SIZE = wp.constant(256) + TILE_THREADS = 64 + + @wp.kernel + def compute(a: array2d(dtype=float)) + i, _= wp.tid() + + # load a row from global memory + t = wp.tile_load(array, i, TILE_SIZE) + s = wp.sum(t) + ... + + wp.launch(compute, dim=[a.shape[0], TILE_THREADS], inputs=[a], block_dim=TILE_THREADS) + +Here, we launch a 2D grid of threads where the trailing dimension is equal to the block size. This ensures we have an entire block of threads dedicated to each row. Each block then loads an entire row of 256 values from the global memory array and computes its sum. + +To streamline this common pattern Warp provides a helper ``wp.tiled_launch()`` which takes care of adding the trailing tile dimension to the thread grid, for example, to assign a block of 64 threads to load and sum a 2D array of values we can do the following: + +.. code:: python + + TILE_M = wp.constant(16) + TILE_N = wp.constant(16) + TILE_THREADS = 64 + + @wp.kernel + def compute(a: array2d(dtype=float)) + i, j = wp.tid() + + # load a row from global memory + t = wp.tile_load(array, i, j, TILE_M, TILE_N) + s = wp.sum(t) + ... + + wp.launch_tiled(compute, dim=[a.shape[0]/TILE_M, a.shape[1]/TILE_N], inputs=[a], block_dim=TILE_THREADS) + +In this example, we use :func:`warp.launch_tiled` to automatically insert the trailing dimension, and assign ``TILE_THREADS`` to each 2D tile of the array. Each tile consists of ``16*16=256`` values, which are loaded cooperatively by the 64 threads in each block. + +Tile Storage +------------ + +When tiles are created they are placed in either `register` or `shared` memory. In general Warp tries to determine the best storage for each, the default is generally for register storage, although some operations such as matrix multiplies may migrate data from register to shared as necessary. 
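+
+Storage does not usually need to be specified explicitly. In the sketch below (based on the GEMM unit test in this series; tile sizes and array names are illustrative), the loaded tiles and the accumulator typically start out in register storage, and ``wp.tile_matmul()`` may migrate them to shared memory as required:
+
+.. code:: python
+
+    TILE_M = wp.constant(16)
+    TILE_N = wp.constant(16)
+    TILE_K = wp.constant(16)
+    TILE_THREADS = 64
+
+    @wp.kernel
+    def gemm_tile(A: wp.array2d(dtype=float),
+                  B: wp.array2d(dtype=float),
+                  C: wp.array2d(dtype=float)):
+
+        i, j = wp.tid()
+
+        # accumulator tile, register storage by default
+        s = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
+
+        for k in range(int(A.shape[1] / TILE_K)):
+            a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
+            b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
+
+            # may migrate a, b and s to shared memory internally
+            wp.tile_matmul(a, b, s)
+
+        wp.tile_store(C, i, j, s)
+
+    # A, B, C are assumed to be 2D Warp arrays with shapes (M, K), (K, N), (M, N)
+    wp.launch_tiled(gemm_tile, dim=[int(A.shape[0] / TILE_M), int(B.shape[1] / TILE_N)], inputs=[A, B, C], block_dim=TILE_THREADS)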
+ +Register Tiles +++++++++++++++ + +Values in register tiles are stored across the entire block, for example, if the block dimension at launch is set to 64, a register tile with ``shape=(1, 256)`` will result in each thread storing 4 elements. Reigster based storage is the fastest storage on most hardware, however, because the tile storage is spread across the threads in the block, an individual thread cannot randomly access data that is assigned to another thread efficiently. For this reason operations on tiles tend to expressed as higher level maps, reductions, and reshaping operations that may transfer values through shared memory. + +Shared Memory Tiles ++++++++++++++++++++ + +Some operations like matrix multiplication, require access to an entire tile of values. In this case the tile data may stored in shared memory, which allows efficient random access. Warp will automatically migrate tiles to shared memory as necessary for specific operations. Shared memory is a limited resource, and so tile size must be set appropriately to avoid exceeding the hardware limitations, otherwise kernel compilation may fail. + +Tile Operations +--------------- + +Creation +++++++++ + +* :func:`warp.tile_zeros` +* :func:`warp.tile_ones` +* :func:`warp.tile_arange` + +Conversion +++++++++++ + +* :func:`warp.tile` +* :func:`warp.untile` + + +Load/Store +++++++++++ + +* :func:`warp.tile_load` +* :func:`warp.tile_store` +* :func:`warp.tile_atomic_add` + +Maps/Reductions ++++++++++++++++ + +* :func:`warp.tile_map` +* :func:`warp.tile_sum` + +Linear Algebra +++++++++++++++ + +* :func:`warp.tile_matmul` +* :func:`warp.tile_fft` +* :func:`warp.tile_ifft` + +Tiles and SIMT Code ++++++++++++++++++++ + +Warp kernels are primarily written in the SIMT programming model in mind, where each thread's execution happens completely independently. Tiles on the other hand allow threads to work cooperatively to perform operations. + +Warp aims to give users a way to seamlessly integrate tile operations with existing SIMT code. To this end, we expose two operations, :func:`warp.tile`, and :func:`warp.untile` which can be used as follows: + +.. code:: python + + TILE_THREADS = 64 + + @wp.kernel + def compute() + i = wp.tid() + + # perform some per-thread computation + x = i*2.0 + wp.sin(float(i)) + + # tile the value x across the block + # returns a tile with shape=(1, TILE_THREADS) + t = wp.tile(x) + ... + + # launch as regular SIMT kernel + wp.launch(compute, dim=[N], inputs=[], block_dim=TILE_THREADS) + +In this example we perform some per-thread computations, and then convert the scalar ``x`` value into a tile object using the :func:`warp.tile` function. This function takes a single value as input, and returns a tile with the same dimensions as the number of threads in the block. From here, the tile can used in other regular cooperative operations such as reductions, GEMMs, etc. + +Similarly, we can `untile` tile objects back to their per-thread scalar equivalent values. + + + + + + diff --git a/warp/builtins.py b/warp/builtins.py index bcc3a573..b91f6dd2 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5192,7 +5192,7 @@ def make_transpose(t): * vec2h, vec2f, vec2d (complex) All input and output tiles must have the same datatype. Tile data will be automatically be migrated - to shared memory if necessary and will use TensoreCore operations when available. + to shared memory if necessary and will use TensorCore operations when available. 
:param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` diff --git a/warp/stubs.py b/warp/stubs.py index a0e38e91..2e5b4bf9 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -2595,7 +2595,7 @@ def tile_matmul(a: Tile, b: Tile, out: Tile) -> Tile: * vec2h, vec2f, vec2d (complex) All input and output tiles must have the same datatype. Tile data will be automatically be migrated - to shared memory if necessary and will use TensoreCore operations when available. + to shared memory if necessary and will use TensorCore operations when available. :param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` From ebd29e03ef5e2a79b20777fd6ac260849bf74ce6 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Fri, 27 Sep 2024 09:54:33 -0700 Subject: [PATCH 041/102] Various formatting and test updates --- docs/modules/functions.rst | 8 ++-- warp/builtins.py | 16 ++++--- warp/context.py | 5 ++- warp/tests/test_misc.py | 63 -------------------------- warp/tests/test_tile.py | 80 +++++++++++++++------------------- warp/tests/test_tile_mathdx.py | 7 +-- warp/tests/test_tile_reduce.py | 7 +-- 7 files changed, 58 insertions(+), 128 deletions(-) delete mode 100644 warp/tests/test_misc.py diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 45a79f07..0e3ec3de 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -1021,9 +1021,9 @@ Tile Primitives * fp16, fp32, fp64 (real) * vec2h, vec2f, vec2d (complex) - All input and output tiles must have the same datatype. Tile data will be automatically be migrated + All input and output tiles must have the same datatype. Tile data will be automatically be migrated to shared memory if necessary and will use TensorCore operations when available. - + :param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` :param out: A tile with ``shape=(M, N)`` @@ -1033,7 +1033,7 @@ Tile Primitives .. py:function:: tile_fft(inout: Tile) -> Tile Compute the forward FFT along the second dimension of a 2D tile of data. - + This function cooperatively computes the forward FFT on a tile of data inplace, treating each row individually. Supported datatypes are: @@ -1045,7 +1045,7 @@ Tile Primitives .. py:function:: tile_ifft(inout: Tile) -> Tile Compute the inverse FFT along the second dimension of a 2D tile of data. - + This function cooperatively computes the inverse FFT on a tile of data inplace, treating each row individually. Supported datatypes are: diff --git a/warp/builtins.py b/warp/builtins.py index b91f6dd2..b5994685 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5054,7 +5054,11 @@ def tile_matmul_generic_value_func(arg_types, arg_values): def tile_matmul_generic_lto_dispatch_func( - arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var], options: Mapping[str, Any], builder: warp.context.ModuleBuilder + arg_types: Mapping[str, type], + return_type: Any, + arg_values: Mapping[str, Var], + options: Mapping[str, Any], + builder: warp.context.ModuleBuilder, ): a = arg_values["a"] b = arg_values["b"] @@ -5157,7 +5161,7 @@ def make_transpose(t): with open(lto_code.name, "rb") as f: lto_code = f.read() - builder.ltoirs[lto_symbol] = lto_code + builder.ltoirs[lto_symbol] = lto_code return lto_symbol, lto_code (fun_forward, lto_forward) = make_function(M, N, K, "N", "N") # C += A * B @@ -5191,9 +5195,9 @@ def make_transpose(t): * fp16, fp32, fp64 (real) * vec2h, vec2f, vec2d (complex) - All input and output tiles must have the same datatype. 
Tile data will be automatically be migrated + All input and output tiles must have the same datatype. Tile data will be automatically be migrated to shared memory if necessary and will use TensorCore operations when available. - + :param a: A tile with ``shape=(M, K)`` :param b: A tile with ``shape=(K, N)`` :param out: A tile with ``shape=(M, N)`` @@ -5317,7 +5321,7 @@ def tile_fft_generic_lto_dispatch_func( lto_dispatch_func=functools.partial(tile_fft_generic_lto_dispatch_func, direction="forward"), variadic=True, doc="""Compute the forward FFT along the second dimension of a 2D tile of data. - + This function cooperatively computes the forward FFT on a tile of data inplace, treating each row individually. Supported datatypes are: @@ -5336,7 +5340,7 @@ def tile_fft_generic_lto_dispatch_func( lto_dispatch_func=functools.partial(tile_fft_generic_lto_dispatch_func, direction="inverse"), variadic=True, doc="""Compute the inverse FFT along the second dimension of a 2D tile of data. - + This function cooperatively computes the inverse FFT on a tile of data inplace, treating each row individually. Supported datatypes are: diff --git a/warp/context.py b/warp/context.py index efcf7fd6..ff92f0e3 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1541,7 +1541,7 @@ def __init__(self, module, options, hasher=None): self.options = options self.module = module self.deferred_functions = [] - self.ltoirs = {} # map from lto symbol to lto binary + self.ltoirs = {} # map from lto symbol to lto binary if hasher is None: hasher = ModuleHasher(module) @@ -5176,6 +5176,7 @@ def pack_args(args, params, adjoint=False): if warp.config.verify_autograd_array_access: runtime.tape._check_kernel_array_access(kernel, fwd_args) + def launch_tiled(*args, **kwargs): """A helper method for launching a grid with an extra trailing dimension equal to the block size. 
@@ -5216,7 +5217,7 @@ def compute() # forward to original launch method launch(*args, **kwargs) - + def synchronize(): """Manually synchronize the calling CPU thread with any outstanding CUDA work on all devices diff --git a/warp/tests/test_misc.py b/warp/tests/test_misc.py deleted file mode 100644 index de9e5fc4..00000000 --- a/warp/tests/test_misc.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np -import warp as wp - -wp.clear_kernel_cache() - -TILE_M = wp.constant(8) -TILE_N = wp.constant(4) -TILE_K = wp.constant(8) - -# num threads per-tile -TILE_DIM = 64 - - -@wp.kernel -def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)): - # output tile index - i = wp.tid() - - a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K) - b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N) - - print(a) - print(b) - - # sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) - - # wp.tile_matmul(a, b, sum) - - # print(sum) - - # wp.tile_store(C[i], 0, 0, sum) - - -batch_count = 1 - -M = TILE_M -N = TILE_N -K = TILE_K - -device = "cuda:0" - -rng = np.random.default_rng(42) -A = rng.random((batch_count, M, K), dtype=np.float32) -B = rng.random((batch_count, K, N), dtype=np.float32) -C = A @ B - -A_wp = wp.array(A, requires_grad=True, device=device) -B_wp = wp.array(B, requires_grad=True, device=device) -C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device) - -with wp.Tape() as tape: - wp.launch(tile_grouped_gemm, - dim=[batch_count, TILE_DIM], - inputs=[A_wp, B_wp, C_wp], - block_dim=TILE_DIM, - device=device) - -wp.synchronize() - -# TODO: 32 mismatched elements -#assert_np_equal(C_wp.numpy(), C) -#print(C_wp.numpy()) - diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index f757be22..8aba083e 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -12,6 +12,8 @@ import warp as wp from warp.tests.unittest_utils import * +wp.init() # For wp.context.runtime.core.is_mathdx_enabled() + TILE_M = wp.constant(8) TILE_N = wp.constant(4) TILE_K = wp.constant(8) @@ -167,22 +169,22 @@ def test_tile_binary_map(test, device): assert_np_equal(B_wp.grad.numpy(), B_grad) -@wp.kernel -def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)): - # output tile index - i = wp.tid() - - a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K) - b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N) +@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") +def test_tile_grouped_gemm(test, device): + @wp.kernel + def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)): + # output tile index + i = wp.tid() - sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K) + b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N) - wp.tile_matmul(a, b, sum) + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) - wp.tile_store(C[i], 0, 0, sum) + wp.tile_matmul(a, b, sum) + wp.tile_store(C[i], 0, 0, sum) -def test_tile_grouped_gemm(test, device): batch_count = 56 M = TILE_M @@ -199,40 +201,38 @@ def test_tile_grouped_gemm(test, device): C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch_tiled(tile_grouped_gemm, - dim=[batch_count], - inputs=[A_wp, B_wp, C_wp], - block_dim=TILE_DIM, - device=device) + wp.launch_tiled( + tile_grouped_gemm, dim=[batch_count], inputs=[A_wp, B_wp, C_wp], 
block_dim=TILE_DIM, device=device + ) # TODO: 32 mismatched elements assert_np_equal(C_wp.numpy(), C) -@wp.kernel -def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): - # output tile index - i, j = wp.tid() +@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") +def test_tile_gemm(test, device): + @wp.kernel + def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): + # output tile index + i, j = wp.tid() - sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) - M = A.shape[0] - N = B.shape[1] - K = A.shape[1] + M = A.shape[0] + N = B.shape[1] + K = A.shape[1] - count = int(K / TILE_K) + count = int(K / TILE_K) - for k in range(0, count): - a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) - b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) + for k in range(0, count): + a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) - # sum += a*b - wp.tile_matmul(a, b, sum) + # sum += a*b + wp.tile_matmul(a, b, sum) - wp.tile_store(C, i, j, sum) + wp.tile_store(C, i, j, sum) - -def test_tile_gemm(test, device): M = TILE_M * 7 K = TILE_K * 6 N = TILE_N * 5 @@ -302,11 +302,8 @@ def test_tile_operators(test, device): with wp.Tape() as tape: wp.launch_tiled( - tile_operators, - dim=[batch_count], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device) + tile_operators, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) assert_np_equal(output_wp.numpy(), output) @@ -387,12 +384,7 @@ def test_tile_extract(test, device): output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch_tiled( - tile_extract_kernel, - dim=[1], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device) + wp.launch_tiled(tile_extract_kernel, dim=[1], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device) assert_array_equal(output_wp, input_wp) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index 92e97ff0..50b71404 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -91,12 +91,7 @@ def test_tile_math_fft(test, device): Y_c64 = np.fft.fft(X_c64, axis=-1) with wp.Tape() as tape: - wp.launch_tiled( - tile_math_fft_kernel, - dim=[1, 1], - inputs=[X_wp, Y_wp], - block_dim=TILE_DIM, - device=device) + wp.launch_tiled(tile_math_fft_kernel, dim=[1, 1], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device) Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 3f65b7cf..22578de8 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -21,7 +21,7 @@ @wp.kernel -def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): +def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index i = wp.tid() @@ -44,7 +44,9 @@ def test_tile_reduce_sum(test, device): output_wp = wp.zeros(batch_count, requires_grad=True, device=device) with wp.Tape() as tape: - wp.launch_tiled(tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device) + wp.launch_tiled( + tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) sum_wp = output_wp.numpy() for i in range(batch_count): @@ 
-60,7 +62,6 @@ def test_tile_reduce_sum(test, device): @wp.kernel def tile_reduce_simt_kernel(output: wp.array(dtype=int)): - # thread index i = wp.tid() From ecb578f8bca1062a5bb36c64c11c375e91ad0bed Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 27 Sep 2024 21:10:39 +0000 Subject: [PATCH 042/102] Fix for partial warps using `wp.tile_sum()` --- warp/native/tile_reduce.h | 2 +- warp/tests/test_tile_reduce.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index 1f618f6d..047177b9 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -30,7 +30,7 @@ inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset) Word* src = reinterpret_cast(&input); unsigned int shuffle_word; - unsigned int mask = 0xffffffff; + unsigned int mask = __activemask(); constexpr int word_count = (sizeof(T) + sizeof(Word) - 1) / sizeof(Word); diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 3f65b7cf..9d868be2 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -72,8 +72,9 @@ def tile_reduce_simt_kernel(output: wp.array(dtype=int)): def test_tile_reduce_simt(test, device): + # use an unaligned grid dimension - N = int(TILE_DIM * 3 / 2) + N = TILE_DIM*4 + 5 output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device) From c913ad79366448f34becdc1f85ebbff27b98e9e6 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 27 Sep 2024 23:46:43 +0000 Subject: [PATCH 043/102] Handle partial warp case for tile_sum --- warp/native/tile_reduce.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index 047177b9..efa6ab4f 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -8,7 +8,7 @@ namespace wp { template -inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset) +inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset, int mask) { typedef unsigned int Word; @@ -30,7 +30,6 @@ inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset) Word* src = reinterpret_cast(&input); unsigned int shuffle_word; - unsigned int mask = __activemask(); constexpr int word_count = (sizeof(T) + sizeof(Word) - 1) / sizeof(Word); @@ -49,9 +48,25 @@ inline CUDA_CALLABLE T warp_reduce_sum(T val) { T sum = val; - for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) + unsigned int mask = __activemask(); + + if (mask == 0xFFFFFFFF) + { + // handle case where entire warp is active + for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) + { + sum += warp_shuffle_down(sum, offset, mask); + } + } + else { - sum += warp_shuffle_down(sum, offset); + // handle partial warp case + for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) + { + T shfl_val = warp_shuffle_down(sum, offset, mask); + if ((mask & (1 << ((threadIdx.x + offset)%WP_TILE_WARP_SIZE))) != 0) + sum += shfl_val; + } } return sum; From d5909f69e1d12689e8c8a5c166942f178eee19eb Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Sat, 28 Sep 2024 02:07:03 +0000 Subject: [PATCH 044/102] Fix for uninitialized partial reduction results --- warp/native/tile_reduce.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index efa6ab4f..f14ff70c 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -113,9 +113,18 @@ auto tile_sum(Tile& t) // fixed size scratch pad for partial results in 
shared memory WP_TILE_SHARED T partials[warp_count]; + // count of active warps + WP_TILE_SHARED int active_warps; + if (threadIdx.x == 0) + active_warps = 0; + + // ensure active_warps is initialized + WP_TILE_SYNC(); + if (lane_index == 0) { partials[warp_index] = warp_sum; + atomicAdd(&active_warps, 1); } // ensure partials are ready @@ -127,7 +136,7 @@ auto tile_sum(Tile& t) T block_sum = partials[0]; WP_PRAGMA_UNROLL - for (int i=1; i < warp_count; ++i) + for (int i=1; i < active_warps; ++i) block_sum += partials[i]; output.data[0] = block_sum; From 114a6b501bd77d1ec2037f60d557ea4c657f6300 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Sat, 28 Sep 2024 09:40:16 +0000 Subject: [PATCH 045/102] Add wp.untile() Make wp.tile_zeros() use register storage by default --- docs/modules/functions.rst | 40 +++++++++++++++++++ warp/builtins.py | 73 +++++++++++++++++++++++++++++++--- warp/native/tile.h | 23 ++++++++--- warp/stubs.py | 43 ++++++++++++++++++++ warp/tests/test_tile_reduce.py | 28 ++++++++++++- warp/types.py | 2 +- 6 files changed, 195 insertions(+), 14 deletions(-) diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 0e3ec3de..f33d4339 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -902,6 +902,46 @@ Tile Primitives +.. py:function:: untile(a: Any) -> Scalar + + Convert a Tile back to per-thread values. + + This function converts a block-wide tile back to per-thread values. + + :param a: A tile with dimensions ``shape=(1, block_dim)`` + :returns: A single value per-thread with the same dtype as the tile + + This example shows how to create a linear sequence from thread variables: + + .. code-block:: python + + @wp.kernel + def compute(): + i = wp.tid() + + # create block-wide tile + t = wp.tile(i)*2 + + # convert back to per-thread values + s = wp.untile() + + print(s) + + wp.launch(compute, dim=16, inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + 0 + 2 + 4 + 6 + 8 + ... + + + .. py:function:: tile_extract(a: Tile, i: int32, j: int32) -> Scalar Extracts a single element from the tile and returns it as a scalar type. diff --git a/warp/builtins.py b/warp/builtins.py index b5994685..063fbbd6 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2083,6 +2083,73 @@ def compute(): ) +def untile_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return Scalar + + if len(arg_types) != 1: + raise RuntimeError("untile() requires 1 positional arg") + + t = arg_types["a"] + + if not is_tile(t): + raise RuntimeError(f"untile() accepts arguments of type tile only, got {arg_types[0]}") + + if t.N != warp.codegen.options["block_dim"]: + raise RuntimeError( + f"until() argument must have the same length as the block width, got {t.N}, expected {warp.codegen.options['block_dim']}" + ) + + return t.dtype + + +add_builtin( + "untile", + input_types={"a": Any}, + value_func=untile_value_func, + variadic=True, + doc="""Convert a Tile back to per-thread values. + + This function converts a block-wide tile back to per-thread values. + + :param a: A tile with dimensions ``shape=(M, block_dim)`` + :returns: A single value per-thread with the same dtype as the tile + + This example shows how to create a linear sequence from thread variables: + + .. 
code-block:: python + + @wp.kernel + def compute(): + i = wp.tid() + + # create block-wide tile + t = wp.tile(i)*2 + + # convert back to per-thread values + s = wp.untile() + + print(s) + + wp.launch(compute, dim=16, inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + 0 + 2 + 4 + 6 + 8 + ... + """, + group="Tile Primitives" "", + export=False, +) + + def tile_extract_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -2132,9 +2199,6 @@ def tile_matmul_value_func(arg_types, arg_values): if not isinstance(arg_types["out"], Tile): raise RuntimeError("tile_matmul() output argument must be a tile") - if arg_types["out"].storage != "shared": - raise RuntimeError("tile_matmul() output argument must have shared memory storage") - return None @@ -5047,9 +5111,6 @@ def tile_matmul_generic_value_func(arg_types, arg_values): if not isinstance(arg_types["out"], Tile): raise RuntimeError("tile_matmul() output argument must be a tile") - if arg_types["out"].storage != "shared": - raise RuntimeError("tile_matmul() output argument must have shared memory storage") - return None diff --git a/warp/native/tile.h b/warp/native/tile.h index f3b1eea5..6f6ad654 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -647,28 +647,41 @@ inline CUDA_CALLABLE auto tile(const T& x) { tile_register_t result; - // code-gen should have set the tile to - // have exactly the block dimension so - // there is exactly one value per-thread static_assert(result.NumRegs == 1); result.data[0] = x; return result; } + // construct a tile from a local SIMT value (one per-thread) template inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, const AdjTile& adj_ret) { static_assert(AdjTile::M == 1); static_assert(AdjTile::N == WP_TILE_BLOCK_DIM); + static_assert(AdjTile::NumRegs == 1); + + adj_x += adj_ret.data[0]; +} +template +inline CUDA_CALLABLE auto untile(Tile& tile) +{ // code-gen should have set the tile to // have exactly the block dimension so // there is exactly one value per-thread - static_assert(AdjTile::NumRegs == 1); + static_assert(Tile::NumRegs == 1); - adj_x += adj_ret.data[0]; + return tile.copy_to_register().data[0]; +} + +template +inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, typename Tile::Type& adj_ret) +{ + auto adj = adj_tile.copy_to_register(); + adj.data[0] += adj_ret; + adj_tile.assign(adj); } // zero initialized tile diff --git a/warp/stubs.py b/warp/stubs.py index 2e5b4bf9..fc642827 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -1002,6 +1002,49 @@ def compute(): ... +@over +def untile(a: Any) -> Scalar: + """Convert a Tile back to per-thread values. + + This function converts a block-wide tile back to per-thread values. + + :param a: A tile with dimensions ``shape=(M, block_dim)`` + :returns: A single value per-thread with the same dtype as the tile + + This example shows how to create a linear sequence from thread variables: + + .. code-block:: python + + @wp.kernel + def compute(): + i = wp.tid() + + # create block-wide tile + t = wp.tile(i) * 2 + + # convert back to per-thread values + s = wp.untile() + + print(s) + + + wp.launch(compute, dim=16, inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + 0 + 2 + 4 + 6 + 8 + ... + + """ + ... + + @over def tile_extract(a: Tile, i: int32, j: int32) -> Scalar: """Extracts a single element from the tile and returns it as a scalar type. 
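
As a cross-check on the ``wp.untile()`` signature (the builtin takes the tile as its single argument), here is a minimal end-to-end sketch modelled on the ``tile_untile_kernel`` test added in this patch; ``TILE_DIM`` and the grid size are illustrative choices:

.. code-block:: python

    import numpy as np
    import warp as wp

    TILE_DIM = 64  # threads per block; wp.tile() produces a (1, block_dim) tile

    @wp.kernel
    def tile_untile_example(output: wp.array(dtype=int)):
        # per-thread value
        i = wp.tid()

        # gather one value per thread into a block-wide tile and scale it
        t = wp.tile(i) * 2

        # scatter the tile back to one value per thread
        s = wp.untile(t)

        output[i] = s

    N = TILE_DIM * 4
    output = wp.zeros(N, dtype=int)

    wp.launch(tile_untile_example, dim=N, inputs=[output], block_dim=TILE_DIM)
    assert np.array_equal(output.numpy(), np.arange(N) * 2)
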
diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 84be9e6b..723ab12f 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -73,9 +73,8 @@ def tile_reduce_simt_kernel(output: wp.array(dtype=int)): def test_tile_reduce_simt(test, device): - # use an unaligned grid dimension - N = TILE_DIM*4 + 5 + N = TILE_DIM * 4 + 5 output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device) @@ -85,6 +84,30 @@ def test_tile_reduce_simt(test, device): test.assertEqual(output.numpy()[0], np.sum(np.arange(N))) +@wp.kernel +def tile_untile_kernel(output: wp.array(dtype=int)): + # thread index + i = wp.tid() + + # convert to block wide tile + t = wp.tile(i) * 2 + s = wp.untile(t) + + output[i] = s + + +def test_tile_untile(test, device): + # use an unaligned grid dimension + N = TILE_DIM * 4 + 5 + + output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device) + + assert_np_equal(output.numpy(), np.arange(N) * 2) + + @wp.kernel def tile_ones_kernel(out: wp.array(dtype=float)): i = wp.tid() @@ -141,6 +164,7 @@ class TestTileReduce(unittest.TestCase): add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices) add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices) add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices) +add_function_test(TestTileReduce, "test_tile_untile", test_tile_untile, devices=devices) if __name__ == "__main__": wp.clear_kernel_cache() diff --git a/warp/types.py b/warp/types.py index 7dc725fb..3cae1be3 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2999,7 +2999,7 @@ def alloc(cls): class TileZeros(Tile): def __init__(self, dtype, M, N): - Tile.__init__(self, dtype, M, N, op="zeros", storage="shared") + Tile.__init__(self, dtype, M, N, op="zeros", storage="register") class TileRange(Tile): From 6e9cee09a12ac31c080d1ef9508905b197bd6e83 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Sun, 29 Sep 2024 04:00:22 +0000 Subject: [PATCH 046/102] Add wp.tile_min() Add wp.tile_max() Add wp.tile_reduce() --- warp/builtins.py | 188 ++++++++++++++++++++++++++++++++- warp/codegen.py | 14 --- warp/native/tile.h | 26 +++-- warp/native/tile_reduce.h | 135 ++++++++++------------- warp/tests/test_tile_reduce.py | 180 ++++++++++++++++++++++++++++++- 5 files changed, 430 insertions(+), 113 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 063fbbd6..6771d22a 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2280,6 +2280,167 @@ def compute(): export=False, ) +def tile_min_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return Tile(dtype=Any, M=1, N=1) + + if len(arg_types) != 1: + raise RuntimeError("tile_min() requires 1 positional args") + + a = arg_types["a"] + + if not is_tile(a): + raise RuntimeError("tile_min() argument 0 must be a tile") + + return Tile(dtype=a.dtype, M=1, N=1, op="min") + + +add_builtin( + "tile_min", + input_types={"a": Tile}, + value_func=tile_min_value_func, + variadic=True, + doc="""Cooperatively compute the minimum of the tile elements using all threads in the block. + + :param a: The tile to compute the minimum of + :returns: A single element tile with dimensions of (1,1) holding the minimum value + + Example: + + .. 
code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(start=--10, stop=10, dtype=float) + s = wp.tile_min(t) + + print(t) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[-10]] + + """, + group="Tile Primitives", + export=False, +) + +def tile_max_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return Tile(dtype=Any, M=1, N=1) + + if len(arg_types) != 1: + raise RuntimeError("tile_max() requires 1 positional args") + + a = arg_types["a"] + + if not is_tile(a): + raise RuntimeError("tile_max() argument 0 must be a tile") + + return Tile(dtype=a.dtype, M=1, N=1, op="min") + + +add_builtin( + "tile_max", + input_types={"a": Tile}, + value_func=tile_max_value_func, + variadic=True, + doc="""Cooperatively compute the maximum of the tile elements using all threads in the block. + + :param a: The tile to compute the maximum from + :returns: A single element tile with dimensions of (1,1) holding the maximum value + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(start=--10, stop=10, dtype=float) + s = wp.tile_min(t) + + print(t) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[10]] + + """, + group="Tile Primitives", + export=False, +) + +# does type propagation for load() +def tile_reduce_value_func(arg_types, arg_values): + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) + + a = arg_types["a"] + + # check all args are tiles + if not is_tile(a): + raise RuntimeError(f"tile_reduce() arguments must be tiles, got type {a}") + + return Tile(dtype=a.dtype, M=1, N=1, op="reduce") + + +def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]): + func_args = (args["op"], *args["args"]) + template_args = () + return (func_args, template_args) + + +add_builtin( + "tile_reduce", + input_types={"op": Callable, "a": Any}, + value_func=tile_reduce_value_func, + native_func="tile_reduce", + doc="""Apply a custom reduction operator across the tile. + + This function cooperatively performs a reduction using the provided operator across the tile. + + :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin + :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A single element tile with ``shape=(1,1)`` with the same datatype as the input tile. + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(1, 10, dtype=int) + s = wp.tile_reduce(wp.prod, t) + + print(s) + + wp.launch(compute, dim=[16], inputs=[]) + + Prints: + + .. 
code-block:: text + + tile(m=1, n=1, storage=register) = [[362880]] + """, + group="Tile Primitives", + export=False, +) + +# maps + # does type propagation for load() def tile_unary_map_value_func(arg_types, arg_values): @@ -2356,7 +2517,7 @@ def tile_binary_map_value_func(arg_types, arg_values): raise RuntimeError(f"tile_map() arguments must be tiles, got type {b}") # use first argument to define output type - if a.dtype != b.dtype: + if not types_equal(a.dtype, b.dtype): raise RuntimeError(f"tile_map() arguments must all have the same type {a.dtype} != {b.dtype}") if a.M != b.M: @@ -5108,7 +5269,7 @@ def tile_matmul_generic_value_func(arg_types, arg_values): if not is_tile(arg_types["b"]): raise RuntimeError("tile_matmul() argument 1 must be an tile") - if not isinstance(arg_types["out"], Tile): + if not is_tile(arg_types["out"]): raise RuntimeError("tile_matmul() output argument must be a tile") return None @@ -5268,6 +5429,29 @@ def make_transpose(t): namespace="", ) +add_builtin( + "tile_matmul", + input_types={"a": Tile, "b": Tile}, + value_func=tile_matmul_generic_value_func, + lto_dispatch_func=tile_matmul_generic_lto_dispatch_func, + variadic=True, + doc="""Computes the matrix product ``out = a*b``. + + Supported datatypes are: + * fp16, fp32, fp64 (real) + * vec2h, vec2f, vec2d (complex) + + Both input tiles must have the same datatype. Tile data will be automatically be migrated + to shared memory if necessary and will use TensorCore operations when available. + + :param a: A tile with ``shape=(M, K)`` + :param b: A tile with ``shape=(K, N)`` + :returns: A tile with ``shape=(M, N)`` + """, + group="Tile Primitives", + export=False, + namespace="", +) ## ## FFT diff --git a/warp/codegen.py b/warp/codegen.py index 0eb26c1b..9628f795 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -1298,9 +1298,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): fwd_args.append(strip_reference(func_arg)) - # used to create an alias of the adjoint var to the primal var for tile ops - alias_call = None - if return_type is None: # handles expression (zero output) functions, e.g.: void do_something(); @@ -1324,11 +1321,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" - # prepend auto if it is an anonymously typed var (e.g.: a tile op) - if output.ctype() == "auto": - forward_call = "auto " + forward_call - alias_call = f"auto& adj_{output} = var_{output};" - replay_call = forward_call if func.custom_replay_func is not None: replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" @@ -1349,9 +1341,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): else: adj.add_forward(forward_call, replay=replay_call) - if alias_call: - adj.add_forward(alias_call) - if not func.missing_grad and len(args): adj_args = tuple(strip_reference(x) for x in func_args) reverse_has_output_args = ( @@ -3090,9 +3079,6 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"): lines += ["// primal vars\n"] for var in adj.variables: - # do not predeclare vars with auto type - if var.ctype() == "auto": - continue if is_tile(var.type): lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit()};\n"] diff --git a/warp/native/tile.h b/warp/native/tile.h index 6f6ad654..cdc338a2 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -37,13 +37,14 @@ [x] 
Simple [ ] Cute [x] Remove Alloc type from tile_shared_t - [ ] wp.launch_tiled() helper + [x] wp.launch_tiled() helper [ ] Creation [x] zeros [x] ones [x] arange [x] tile() - [ ] untile() + [x] untile() + [ ] fromfunction() [ ] explicit storage [ ] Load/Store [ ] 1D load/store variants @@ -76,23 +77,22 @@ [ ] Slice [ ] Runtime [x] Compile-time block dimensions - [x] Switch between SIMT / Tile based execution if `tile_dim` not provided to wp.launch() + [x] Switch between SIMT / Tile based execution if `block_dim` not provided to wp.launch() [ ] Examples + [ ] Point registration [ ] GEMM + [ ] MLP + [ ] LayerNorm + [ ] SoftMax + [ ] GEMM + [ ] warp.sim (CRBA) [ ] Batched MLP - [ ] Point cloud alignment [ ] Layer norm [ ] Convolution: https://github.com/NVIDIA/MinkowskiEngine/blob/master/src/convolution_kernel.cu#L123 [ ] MeshCNN (Modulus, Oliver) [ ] BioNemo (Ali) [ ] Skinning (David/Or/Vismay) [ ] warp.sim (VBD) - [ ] warp.sim (CRBA) - [ ] Point clustering - [ ] GEMM - [ ] MLP - [ ] LayerNorm - [ ] SoftMax [ ] Error checking [ ] Ensure functions passed to tile_map() are compatible with tile type [ ] Ensure that args passed to tile ops are compatible @@ -237,6 +237,12 @@ struct tile_register_t data[i] = tile.data[i]; } + inline CUDA_CALLABLE void zero() + { + for (int i=0; i < NumRegs; ++i) + data[i] = T(0); + } + // extract a single tile element to a native type inline CUDA_CALLABLE Type extract(int i, int j) { diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index f14ff70c..35107f35 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -43,19 +43,17 @@ inline CUDA_CALLABLE T warp_shuffle_down(T val, int offset, int mask) return output; } -template -inline CUDA_CALLABLE T warp_reduce_sum(T val) +template +inline CUDA_CALLABLE T warp_reduce(T val, Op f, unsigned int mask) { T sum = val; - unsigned int mask = __activemask(); - if (mask == 0xFFFFFFFF) { // handle case where entire warp is active for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) { - sum += warp_shuffle_down(sum, offset, mask); + sum = f(sum, warp_shuffle_down(sum, offset, mask)); } } else @@ -65,31 +63,17 @@ inline CUDA_CALLABLE T warp_reduce_sum(T val) { T shfl_val = warp_shuffle_down(sum, offset, mask); if ((mask & (1 << ((threadIdx.x + offset)%WP_TILE_WARP_SIZE))) != 0) - sum += shfl_val; + sum = f(sum, shfl_val); } } return sum; } -template -inline CUDA_CALLABLE T warp_reduce(T val, Op op) -{ - T sum = val; - - for (int offset=WP_TILE_WARP_SIZE/2; offset > 0; offset /= 2) - { - sum = op(sum, warp_shuffle_down(sum, offset)); - } - - return sum; -} - - // non-axis version which computes sum // across the entire tile using the whole block -template -auto tile_sum(Tile& t) +template +auto tile_reduce_impl(Op f, Tile& t) { using T = typename Tile::Type; @@ -105,10 +89,19 @@ auto tile_sum(Tile& t) // thread reduction WP_PRAGMA_UNROLL for (int i=1; i < input.NumRegs; ++i) - thread_sum += input.data[i]; + { + int linear = t.index(i); + if (!Tile::Aligned && linear >= Tile::Size) + break; + + thread_sum = f(thread_sum, input.data[i]); + } + + // ensure that only threads with at least one valid item participate in the reduction + unsigned int mask = __ballot_sync(__activemask(), t.index(0) < Tile::Size); // warp reduction - T warp_sum = warp_reduce_sum(thread_sum); + T warp_sum = warp_reduce(thread_sum, f, mask); // fixed size scratch pad for partial results in shared memory WP_TILE_SHARED T partials[warp_count]; @@ -137,7 +130,7 @@ auto tile_sum(Tile& t) WP_PRAGMA_UNROLL for (int 
i=1; i < active_warps; ++i) - block_sum += partials[i]; + block_sum = f(block_sum, partials[i]); output.data[0] = block_sum; } @@ -145,6 +138,24 @@ auto tile_sum(Tile& t) return output; } +void adj_tile_reduce_impl() +{ + // todo: general purpose reduction gradients not implemented +} + +// entry point for Python code-gen, wraps op in a lambda to perform overload resolution +#define tile_reduce(op, t) tile_reduce_impl([](auto x, auto y) { return op(x, y);}, t) +#define adj_tile_reduce(op, a, adj_op, adj_a, adj_ret) adj_tile_reduce_impl() + +// convenience methods for specific reductions + +template +auto tile_sum(Tile& t) +{ + return tile_reduce(add, t); +} + +// special case adjoint for summation template void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret) { @@ -163,70 +174,30 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret) adj_t.assign(tile_add(adj_t_reg, adj_ret_reg)); } - -template -auto tile_reduce(Fwd op, Tile& t, int axis) +template +auto tile_max(Tile& t) { - using T = typename Tile::Type; - - auto input = t.copy_to_register(); - auto output = tile_register_t(); - - const int warp_count = (WP_TILE_BLOCK_DIM + WP_TILE_WARP_SIZE - 1)/WP_TILE_WARP_SIZE; - const int warp_index = threadIdx.x/WP_TILE_WARP_SIZE; - const int lane_index = threadIdx.x%WP_TILE_WARP_SIZE; - - T thread_sum = input.data[0]; - - // thread reduction - WP_PRAGMA_UNROLL - for (int i=1; i < input.NumRegs; ++i) - thread_sum = op(thread_sum, input.data[i]); - - // warp reduction - T warp_sum = warp_reduce(thread_sum, op); - - // fixed size scratch pad for partial results - WP_TILE_SHARED T partials[warp_count]; - - if (lane_index == 0) - { - partials[warp_index] = warp_sum; - } - - WP_TILE_SYNC(); - - // reduce across block, todo: use warp_reduce() here - if (threadIdx.x == 0) - { - T block_sum = partials[0]; - - WP_PRAGMA_UNROLL - for (int i=1; i < warp_count; ++i) - block_sum = op(block_sum, partials[i]); - - output.data[0] = block_sum; - } - - return output; + return tile_reduce(max, t); } -template -void adj_tile_reduce(Tile& t, int axis, Tile& adj_t, int adj_axis, AdjTile& adj_ret) +template +void adj_tile_max(Tile& t, Tile& adj_t, AdjTile& adj_ret) { - using T = typename Tile::Type; + // todo: not implemented +} - // broadcast incoming adjoint to block - WP_TILE_SHARED T scratch; - if (threadIdx.x == 0) - scratch = adj_ret.data[0]; +template +auto tile_min(Tile& t) +{ + return tile_reduce(min, t); +} - WP_TILE_SYNC(); +template +void adj_tile_min(Tile& t, Tile& adj_t, AdjTile& adj_ret) +{ + // todo: not implemented +} - auto adj_t_reg = adj_t.copy_to_register(); - auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); - adj_t.assign(tile_add(adj_t_reg, adj_ret_reg)); -} } // namespace wp \ No newline at end of file diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 723ab12f..ff040d23 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -19,19 +19,184 @@ # num threads per-tile TILE_DIM = 64 +@wp.kernel +def tile_sum_kernel(input: wp.array2d(dtype=float), + output: wp.array(dtype=float)): + + # output tile index + i = wp.tid() + + n = input.shape[1] + count = int(n / TILE_DIM) + + s = wp.tile_zeros(m=1, n=1, dtype=float) + + for j in range(count): + a = wp.tile_load(input, i, j, m=1, n=TILE_DIM) + s += wp.tile_sum(a) * 0.5 + + wp.tile_store(output, i, 0, s) + + +def test_tile_reduce_sum(test, device): + batch_count = 56 + + N = TILE_DIM*3 + + rng = np.random.default_rng(42) + input = rng.random((batch_count, N), 
dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros(batch_count, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_sum_kernel, + dim=[batch_count], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device) + + sum_wp = output_wp.numpy() + for i in range(batch_count): + sum_np = np.sum(input[i]) * 0.5 + test.assertAlmostEqual(sum_wp[i], sum_np, places=4) + + output_wp.grad.fill_(1.0) + + tape.backward() + + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.e-4) + + +@wp.kernel +def tile_min_kernel(input: wp.array2d(dtype=float), + output: wp.array(dtype=float)): + + # output tile index + i = wp.tid() + + a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM) + m = wp.tile_min(a) + + wp.tile_store(output, i, 0, m) + + +def test_tile_reduce_min(test, device): + batch_count = 56 + + N = TILE_DIM + + rng = np.random.default_rng(42) + input = rng.random((batch_count, N), dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros(batch_count, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_min_kernel, + dim=[batch_count], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device) + + min_wp = output_wp.numpy() + for i in range(batch_count): + min_np = np.min(input[i]) + test.assertAlmostEqual(min_wp[i], min_np, places=4) + @wp.kernel -def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): +def tile_max_kernel(input: wp.array2d(dtype=float), + output: wp.array(dtype=float)): + # output tile index i = wp.tid() - a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) + a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM) + m = wp.tile_max(a) + + wp.tile_store(output, i, 0, m) + + +def test_tile_reduce_max(test, device): + batch_count = 56 + + N = TILE_DIM + + rng = np.random.default_rng(42) + input = rng.random((batch_count, N), dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros(batch_count, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_max_kernel, + dim=[batch_count], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device) + + max_wp = output_wp.numpy() + for i in range(batch_count): + max_np = np.max(input[i]) + test.assertAlmostEqual(max_wp[i], max_np, places=4) + + +@wp.kernel +def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), + output: wp.array(dtype=float)): + + # output tile index + i = wp.tid() + + a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM) + m = wp.tile_reduce(wp.mul, a) + + wp.tile_store(output, i, 0, m) + + +def test_tile_reduce_custom(test, device): + batch_count = 56 + + N = TILE_DIM + + rng = np.random.default_rng(42) + input = rng.random((batch_count, N), dtype=np.float32) + + input_wp = wp.array(input, requires_grad=True, device=device) + output_wp = wp.zeros(batch_count, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_reduce_custom_kernel, + dim=[batch_count], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device) + + prod_wp = output_wp.numpy() + for i in range(batch_count): + prod_np = np.prod(input[i]) + test.assertAlmostEqual(prod_wp[i], prod_np, places=4) + + + + +@wp.kernel +def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): + # output tile index + i = wp.tid() + + a = wp.tile_load(input, i, 0, m=TILE_M, n=TILE_N) 
s = wp.tile_sum(a) * 0.5 wp.tile_store(output, i, 0, s) -def test_tile_reduce_sum(test, device): +def test_tile_reduce_grouped_sum(test, device): batch_count = 56 M = TILE_M @@ -51,13 +216,13 @@ def test_tile_reduce_sum(test, device): sum_wp = output_wp.numpy() for i in range(batch_count): sum_np = np.sum(input[i]) * 0.5 - test.assertAlmostEqual(sum_wp[i], sum_np, places=5) + test.assertAlmostEqual(sum_wp[i], sum_np, places=4) output_wp.grad.fill_(1.0) tape.backward() - assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5) + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.e-4) @wp.kernel @@ -160,7 +325,12 @@ class TestTileReduce(unittest.TestCase): pass + add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_min", test_tile_reduce_min, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_max", test_tile_reduce_max, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_custom", test_tile_reduce_custom, devices=devices) +add_function_test(TestTileReduce, "test_tile_reduce_grouped_sum", test_tile_reduce_sum, devices=devices) add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices) add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices) add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices) From d0e4eca9ab3b8b84f18f240e786f1ea427a123be Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 30 Sep 2024 00:07:37 +0000 Subject: [PATCH 047/102] Add support for wp.tile_transpose() --- docs/modules/differentiability.rst | 2 + docs/modules/functions.rst | 123 ++++++++++++++++++++++++++- docs/modules/tiles.rst | 41 +++++---- warp/builtins.py | 79 ++++++++++++++++-- warp/codegen.py | 1 - warp/native/tile.h | 46 ++++++++-- warp/stubs.py | 129 +++++++++++++++++++++++++++++ warp/tests/test_tile.py | 41 +++++++++ warp/tests/test_tile_reduce.py | 60 +++++--------- warp/types.py | 22 ++++- 10 files changed, 468 insertions(+), 76 deletions(-) diff --git a/docs/modules/differentiability.rst b/docs/modules/differentiability.rst index 72436104..81145d8d 100644 --- a/docs/modules/differentiability.rst +++ b/docs/modules/differentiability.rst @@ -1,3 +1,5 @@ +.. _differentiability: + Differentiability ================= diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index f33d4339..ffd87dc9 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -908,7 +908,7 @@ Tile Primitives This function converts a block-wide tile back to per-thread values. - :param a: A tile with dimensions ``shape=(1, block_dim)`` + :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile This example shows how to create a linear sequence from thread variables: @@ -954,6 +954,16 @@ Tile Primitives :returns: The value of the element at the specified tile location, with the same type as the input tile's per-element dtype +.. py:function:: tile_transpose(a: Tile) -> Tile + + Transpose a tile. + + For shared memory tiles this operation will alias the input tile, register tiles will first be transferred to shared memory before transposition. + + :param a: Tile to transpose with ``shape=(M,N)`` + :returns: Tile with ``shape=(N,M)`` + + .. py:function:: tile_sum(a: Tile) -> Tile Cooperatively compute the sum the tile elements using all threads in the block. 
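
For reference, a minimal host-plus-kernel sketch of the block-wide reduction pattern exercised by the tests above, using the ``(i, j, m=, n=)`` tile API as it stands at this point in the series; the row count and tile width are illustrative:

.. code-block:: python

    import numpy as np
    import warp as wp

    TILE_DIM = 64  # threads per block and tile width

    @wp.kernel
    def row_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
        # one block per row of the input
        i = wp.tid()

        # cooperatively load a (1, TILE_DIM) tile of row i
        t = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)

        # block-wide reduction producing a (1, 1) tile
        s = wp.tile_sum(t)

        wp.tile_store(output, i, 0, s)

    rows = 8
    rng = np.random.default_rng(42)
    a = wp.array(rng.random((rows, TILE_DIM), dtype=np.float32))
    out = wp.zeros(rows, dtype=float)

    # launch_tiled() inserts the trailing block dimension automatically,
    # so wp.tid() above returns just the row index
    wp.launch_tiled(row_sum_kernel, dim=[rows], inputs=[a, out], block_dim=TILE_DIM)

    assert np.allclose(out.numpy(), a.numpy().sum(axis=1), rtol=1e-4)
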
@@ -984,6 +994,98 @@ Tile Primitives +.. py:function:: tile_min(a: Tile) -> Tile + + Cooperatively compute the minimum of the tile elements using all threads in the block. + + :param a: The tile to compute the minimum of + :returns: A single element tile with dimensions of (1,1) holding the minimum value + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(start=--10, stop=10, dtype=float) + s = wp.tile_min(t) + + print(t) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[-10]] + + + + +.. py:function:: tile_max(a: Tile) -> Tile + + Cooperatively compute the maximum of the tile elements using all threads in the block. + + :param a: The tile to compute the maximum from + :returns: A single element tile with dimensions of (1,1) holding the maximum value + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + + t = wp.tile_arange(start=--10, stop=10, dtype=float) + s = wp.tile_min(t) + + print(t) + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[10]] + + + + +.. py:function:: tile_reduce(op: Callable, a: Any) -> Tile + + Apply a custom reduction operator across the tile. + + This function cooperatively performs a reduction using the provided operator across the tile. + + :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin + :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A single element tile with ``shape=(1,1)`` with the same datatype as the input tile. + + Example: + + .. code-block:: python + + @wp.kernel + def factorial(): + + t = wp.tile_arange(1, 10, dtype=int) + s = wp.tile_reduce(wp.mul, t) + + print(s) + + wp.launch(factorial, dim=[16], inputs=[], block_dim=16) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[362880]] + + + .. py:function:: tile_map(op: Callable, a: Any) -> Tile Apply a unary function onto the tile. @@ -1070,6 +1172,25 @@ Tile Primitives +.. py:function:: tile_matmul(a: Tile, b: Tile) -> Tile + :noindex: + :nocontentsentry: + + Computes the matrix product ``out = a*b``. + + Supported datatypes are: + * fp16, fp32, fp64 (real) + * vec2h, vec2f, vec2d (complex) + + Both input tiles must have the same datatype. Tile data will be automatically be migrated + to shared memory if necessary and will use TensorCore operations when available. + + :param a: A tile with ``shape=(M, K)`` + :param b: A tile with ``shape=(K, N)`` + :returns: A tile with ``shape=(M, N)`` + + + .. py:function:: tile_fft(inout: Tile) -> Tile Compute the forward FFT along the second dimension of a 2D tile of data. diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst index bf7f40bb..48d2c788 100644 --- a/docs/modules/tiles.rst +++ b/docs/modules/tiles.rst @@ -1,5 +1,7 @@ -Tiles (Preview) -=============== +Tiles +===== + +.. warning:: Tile-based operations in Warp are under preview, APIs are subject to change. Block-based programming models such as those in OpenAI Triton have proved to be effective ways of expressing high performance kernels that can leverage cooperative operations on modern GPUs. 
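
As a concrete illustration of the block-based model introduced here, a minimal tiled GEMM sketch modelled on the ``tile_gemm`` test earlier in this series; tile sizes and array shapes are illustrative, and running it assumes a CUDA device and a MathDx-enabled build of Warp (see the skip condition on the GEMM test above):

.. code-block:: python

    import numpy as np
    import warp as wp

    TILE_M, TILE_N, TILE_K = 8, 8, 8
    TILE_THREADS = 64

    @wp.kernel
    def tile_gemm_example(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
        # one block per output tile
        i, j = wp.tid()

        # accumulator tile, held in registers by default
        acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=float)

        K = A.shape[1]

        for k in range(int(K / TILE_K)):
            a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
            b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)

            # acc += a*b, computed cooperatively across the block
            wp.tile_matmul(a, b, acc)

        wp.tile_store(C, i, j, acc)

    M, K, N = TILE_M * 4, TILE_K * 3, TILE_N * 2
    rng = np.random.default_rng(42)
    A = wp.array(rng.random((M, K), dtype=np.float32))
    B = wp.array(rng.random((K, N), dtype=np.float32))
    C = wp.zeros((M, N), dtype=float)

    wp.launch_tiled(
        tile_gemm_example, dim=[int(M / TILE_M), int(N / TILE_N)], inputs=[A, B, C], block_dim=TILE_THREADS
    )

    assert np.allclose(C.numpy(), A.numpy() @ B.numpy(), rtol=1e-4)
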
@@ -8,7 +10,7 @@ Warp 1.4.0 introduces tile extensions that expose a block-based programming to W Execution Model --------------- -Warp's execution model allows users to specify an up to 4-dimensional grid of logical threads for kernel execution at launch time. With the introduction of tiles, users can also specify a block size, which partitions the grid into smaller sets of threads that are executed on a single compute unit. +Warp's execution model allows users to specify an up to 4-dimensional grid of logical threads for kernel execution at launch time. With the introduction of tile primitives, users can additionally specify a block size, which partitions the thread grid into smaller sets of threads that are executed on a single compute unit. Inside kernels, tile operations are executed cooperatively across each block of threads, allowing them to take advantage of efficient memory access, local memory, and dedicated hardware units like TensorCores. @@ -23,19 +25,19 @@ As an example, consider the following kernel: def compute(a: array(dtype=float)) i = wp.tid()/TILE_SIZE - t = wp.tile_load(array, x=i, n=TILE_SIZE) + t = wp.tile_load(array, i, TILE_SIZE) ... wp.launch(compute, dim=[len(a)], inputs=[a], block_dim=TILE_THREADS) -Here, we load a 1D tile of 256 values from a global memory array ``a``, where the load operation is performed cooperatively by all 64 threads in the block, as specified by the ``block_dim`` argument to :func:`warp.launch`. In this case each thread is responsible for loading 4 values from global memory, which may then be stored in registers, or shared memory across the block. +Here, each block loads a 1D tile of 256 values from a global memory array ``a``, where the load operation is performed cooperatively by all 64 threads in the block, as specified by the ``block_dim`` argument to :func:`warp.launch`. In this case, each thread is responsible for loading 4 values from global memory, which may then be stored in registers, or shared memory across the block. Tile Properties --------------- In Warp, tile objects are 2D arrays of data where the tile elements may be scalars, vectors, matrices, or user defined structures. -In a more complex example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum: +In the following example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum: .. code:: python @@ -44,10 +46,10 @@ In a more complex example, we launch a grid of threads where each block is respo @wp.kernel def compute(a: array2d(dtype=float)) - i, _= wp.tid() + i, _ = wp.tid() # load a row from global memory - t = wp.tile_load(array, i, TILE_SIZE) + t = wp.tile_load(array, i, 0, 1, TILE_SIZE) s = wp.sum(t) ... @@ -79,7 +81,7 @@ In this example, we use :func:`warp.launch_tiled` to automatically insert the tr Tile Storage ------------ -When tiles are created they are placed in either `register` or `shared` memory. In general Warp tries to determine the best storage for each, the default is generally for register storage, although some operations such as matrix multiplies may migrate data from register to shared as necessary. +When tiles are created they are placed in either `register` or `shared` memory. In general Warp tries to determine the best storage for each, by default tiles are allocated in register storage, however some operations such as matrix multiplies may migrate data from register to shared as necessary. 
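
Relating to the storage behaviour described above: ``wp.tile_transpose()`` (added in this patch) aliases shared-memory tiles and first migrates register tiles to shared memory, so a load/transpose/store round trip can be sketched as follows; the shapes and block size are illustrative, modelled on the ``test_tile_transpose`` test added in this patch:

.. code-block:: python

    import numpy as np
    import warp as wp

    TILE_M, TILE_N = 8, 4

    @wp.kernel
    def tile_transpose_example(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
        # cooperatively load a (TILE_M, TILE_N) tile
        x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)

        # transpose; data is placed in shared memory and aliased with flipped strides
        y = wp.tile_transpose(x)

        wp.tile_store(output, 0, 0, y)

    rng = np.random.default_rng(42)
    a = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32))
    out = wp.zeros((TILE_N, TILE_M), dtype=float)

    wp.launch_tiled(tile_transpose_example, dim=[1], inputs=[a, out], block_dim=32)

    assert np.array_equal(out.numpy(), a.numpy().T)
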
Register Tiles ++++++++++++++ @@ -94,20 +96,15 @@ Some operations like matrix multiplication, require access to an entire tile of Tile Operations --------------- -Creation -++++++++ +Construction +++++++++++++ * :func:`warp.tile_zeros` * :func:`warp.tile_ones` * :func:`warp.tile_arange` - -Conversion -++++++++++ - * :func:`warp.tile` * :func:`warp.untile` - Load/Store ++++++++++ @@ -119,17 +116,21 @@ Maps/Reductions +++++++++++++++ * :func:`warp.tile_map` +* :func:`warp.tile_reduce` * :func:`warp.tile_sum` +* :func:`warp.tile_min` +* :func:`warp.tile_max` Linear Algebra ++++++++++++++ * :func:`warp.tile_matmul` +* :func:`warp.tile_transpose` * :func:`warp.tile_fft` * :func:`warp.tile_ifft` Tiles and SIMT Code -+++++++++++++++++++ +------------------- Warp kernels are primarily written in the SIMT programming model in mind, where each thread's execution happens completely independently. Tiles on the other hand allow threads to work cooperatively to perform operations. @@ -158,6 +159,12 @@ In this example we perform some per-thread computations, and then convert the sc Similarly, we can `untile` tile objects back to their per-thread scalar equivalent values. +.. Note:: All threads in a block must execute tile operations, however code surrounding tile operations may contain arbitrary conditional logic. + +Automatic Differentiation +------------------------- + +Warp can automatically generate the backward version of tile-based programs, in general tile programs must obey the same rules for auto-diff as regular Warp programs, e.g.: avoiding in-place operations, etc. Please see the :ref:`differentiability` section for more details. diff --git a/warp/builtins.py b/warp/builtins.py index 6771d22a..02d31d31 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2182,6 +2182,49 @@ def tile_extract_value_func(arg_types, arg_values): ) +def tile_transpose_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return Tile + + if len(arg_types) != 1: + raise RuntimeError("tile_transpose() requires 1 positional args") + + t = arg_types["a"] + + if not is_tile(t): + raise RuntimeError("tile_transpose() argument 0 must be a tile") + + layout = None + + # flip layout + if t.layout == "rowmajor": + layout = "colmajor" + elif t.layout == "colmajor": + layout = "rowmajor" + + # force the input tile to shared memory + t.storage = "shared" + + return Tile(dtype=t.dtype, M=t.N, N=t.M, op="transpose", storage=t.storage, layout=layout, owner=False) + + +add_builtin( + "tile_transpose", + input_types={"a": Tile(dtype=Any, M=Any, N=Any)}, + value_func=tile_transpose_value_func, + variadic=True, + doc="""Transpose a tile. + + For shared memory tiles this operation will alias the input tile, register tiles will first be transferred to shared memory before transposition. 
+ + :param a: Tile to transpose with ``shape=(M,N)`` + :returns: Tile with ``shape=(N,M)``""", + group="Tile Primitives", + export=False, +) + + def tile_matmul_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -2280,6 +2323,7 @@ def compute(): export=False, ) + def tile_min_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -2331,6 +2375,7 @@ def compute(): export=False, ) + def tile_max_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -2382,6 +2427,7 @@ def compute(): export=False, ) + # does type propagation for load() def tile_reduce_value_func(arg_types, arg_values): if arg_types is None: @@ -2420,14 +2466,14 @@ def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any, .. code-block:: python @wp.kernel - def compute(): + def factorial(): t = wp.tile_arange(1, 10, dtype=int) - s = wp.tile_reduce(wp.prod, t) + s = wp.tile_reduce(wp.mul, t) print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch(factorial, dim=[16], inputs=[], block_dim=16) Prints: @@ -5386,9 +5432,29 @@ def make_transpose(t): builder.ltoirs[lto_symbol] = lto_code return lto_symbol, lto_code - (fun_forward, lto_forward) = make_function(M, N, K, "N", "N") # C += A * B - (fun_backward_A, lto_backward_A) = make_function(M, K, N, "N", "T") # adjA += adjC * B^T - (fun_backward_B, lto_backward_B) = make_function(K, N, M, "T", "N") # adjB += A^T * adjC + def tile_layout_mode(tile): + if tile.layout == "rowmajor": + return "N" + if tile.layout == "colmajor": + return "T" + + def tile_flip_layout(layout): + if layout == "N": + return "T" + elif layout == "T": + return "N" + + a_layout = tile_layout_mode(a.type) + b_layout = tile_layout_mode(b.type) + c_layout = tile_layout_mode(out.type) + + (fun_forward, lto_forward) = make_function(M, N, K, a_layout, b_layout) # C += A * B + (fun_backward_A, lto_backward_A) = make_function( + M, K, N, c_layout, tile_flip_layout(b_layout) + ) # adjA += adjC * B^T + (fun_backward_B, lto_backward_B) = make_function( + K, N, M, tile_flip_layout(a_layout), c_layout + ) # adjB += A^T * adjC return ( ( @@ -5453,6 +5519,7 @@ def make_transpose(t): namespace="", ) + ## ## FFT ## diff --git a/warp/codegen.py b/warp/codegen.py index 9628f795..697f3d33 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -3079,7 +3079,6 @@ def codegen_func_forward(adj, func_type="kernel", device="cpu"): lines += ["// primal vars\n"] for var in adj.variables: - if is_tile(var.type): lines += [f"{var.ctype()} {var.emit()} = {var.type.cinit()};\n"] elif var.constant is None: diff --git a/warp/native/tile.h b/warp/native/tile.h index cdc338a2..a856b643 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -60,9 +60,9 @@ [x] Sum [x] Forward [x] Reverse - [ ] Min - [ ] Max - [ ] Custom + [x] Min + [x] Max + [x] Custom [x] MatMul [x] Forward [x] Reverse @@ -380,6 +380,22 @@ struct tile_shared_t copy_from_global(t.array, t.x, t.y); } + // construct from another shared tile, this constructor + // is invoked for reshape operations like `wp.tile_transpose()` + template + inline CUDA_CALLABLE auto& operator=(const tile_shared_t& stile) + { + using OtherTile = tile_shared_t; + + // check dimensions are compatible + static_assert(Size == OtherTile::Size); + + // alias tile directly + data = stile.data; + + return *this; + } + // assign from a global tile inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) { @@ -637,12 +653,6 @@ inline 
CUDA_CALLABLE auto tile_alloc_zeros() return tile_shared_t(data); } -template -inline CUDA_CALLABLE auto tile_transpose(Tile& t) -{ - // alias incoming tile - return tile_shared_t(t.data); -} //----------------------------------------------------------------------------------------------------- // High level entry points for each op (correspond to one Warp builtin) @@ -1091,4 +1101,22 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, adj_Xinout); \ } while (0) + +template +inline CUDA_CALLABLE auto tile_transpose(Tile& t) +{ + // alias incoming tile + return tile_shared_t(t.data); +} + +template +inline CUDA_CALLABLE void adj_tile_transpose(Tile& t, Tile& adj_t, AdjTile& adj_ret) +{ + auto a = adj_t.copy_to_register(); + auto b = t.copy_to_register(); + + adj_t.assign(tile_add(a,b)); +} + + } // namespace wp diff --git a/warp/stubs.py b/warp/stubs.py index fc642827..301b056b 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -1059,6 +1059,18 @@ def tile_extract(a: Tile, i: int32, j: int32) -> Scalar: ... +@over +def tile_transpose(a: Tile) -> Tile: + """Transpose a tile. + + For shared memory tiles this operation will alias the input tile, register tiles will first be transferred to shared memory before transposition. + + :param a: Tile to transpose with ``shape=(M,N)`` + :returns: Tile with ``shape=(N,M)`` + """ + ... + + @over def tile_sum(a: Tile) -> Tile: """Cooperatively compute the sum the tile elements using all threads in the block. @@ -1091,6 +1103,104 @@ def compute(): ... +@over +def tile_min(a: Tile) -> Tile: + """Cooperatively compute the minimum of the tile elements using all threads in the block. + + :param a: The tile to compute the minimum of + :returns: A single element tile with dimensions of (1,1) holding the minimum value + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + t = wp.tile_arange(start=--10, stop=10, dtype=float) + s = wp.tile_min(t) + + print(t) + + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[-10]] + + + """ + ... + + +@over +def tile_max(a: Tile) -> Tile: + """Cooperatively compute the maximum of the tile elements using all threads in the block. + + :param a: The tile to compute the maximum from + :returns: A single element tile with dimensions of (1,1) holding the maximum value + + Example: + + .. code-block:: python + + @wp.kernel + def compute(): + t = wp.tile_arange(start=--10, stop=10, dtype=float) + s = wp.tile_min(t) + + print(t) + + + wp.launch(compute, dim=[64], inputs=[]) + + Prints: + + .. code-block:: text + + tile(m=1, n=1, storage=register) = [[10]] + + + """ + ... + + +@over +def tile_reduce(op: Callable, a: Any) -> Tile: + """Apply a custom reduction operator across the tile. + + This function cooperatively performs a reduction using the provided operator across the tile. + + :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin + :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype + :returns: A single element tile with ``shape=(1,1)`` with the same datatype as the input tile. + + Example: + + .. code-block:: python + + @wp.kernel + def factorial(): + t = wp.tile_arange(1, 10, dtype=int) + s = wp.tile_reduce(wp.mul, t) + + print(s) + + + wp.launch(factorial, dim=[16], inputs=[], block_dim=16) + + Prints: + + .. 
code-block:: text + + tile(m=1, n=1, storage=register) = [[362880]] + + """ + ... + + @over def tile_map(op: Callable, a: Any) -> Tile: """Apply a unary function onto the tile. @@ -2648,6 +2758,25 @@ def tile_matmul(a: Tile, b: Tile, out: Tile) -> Tile: ... +@over +def tile_matmul(a: Tile, b: Tile) -> Tile: + """Computes the matrix product ``out = a*b``. + + Supported datatypes are: + * fp16, fp32, fp64 (real) + * vec2h, vec2f, vec2d (complex) + + Both input tiles must have the same datatype. Tile data will be automatically be migrated + to shared memory if necessary and will use TensorCore operations when available. + + :param a: A tile with ``shape=(M, K)`` + :param b: A tile with ``shape=(K, N)`` + :returns: A tile with ``shape=(M, N)`` + + """ + ... + + @over def tile_fft(inout: Tile) -> Tile: """Compute the forward FFT along the second dimension of a 2D tile of data. diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 8aba083e..fcd394aa 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -395,6 +395,45 @@ def test_tile_extract(test, device): assert_np_equal(input_wp.grad.numpy(), np.ones_like(input)) +@wp.kernel +def test_tile_transpose_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): + x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) + y = wp.tile_transpose(x) + + wp.tile_store(output, 0, 0, y) + + +def test_tile_transpose(test, device): + rng = np.random.default_rng(42) + input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device) + output = wp.zeros_like(input.transpose(), device=device) + + wp.launch_tiled(test_tile_transpose_kernel, dim=[1], inputs=[input, output], block_dim=32, device=device) + + assert_np_equal(output.numpy(), input.numpy().T) + + +@wp.kernel +def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): + x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) + y = wp.tile_transpose(x) + + z = wp.tile_zeros(dtype=float, m=TILE_N, n=TILE_N) + wp.tile_matmul(y, x, z) + + wp.tile_store(output, 0, 0, z) + + +def test_tile_transpose_matmul(test, device): + rng = np.random.default_rng(42) + input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device) + output = wp.zeros((TILE_N, TILE_N), dtype=float, device=device) + + wp.launch_tiled(test_tile_transpose_matmul_kernel, dim=[1], inputs=[input, output], block_dim=32, device=device) + + assert_np_equal(output.numpy(), input.numpy().T @ input.numpy()) + + # #----------------------------------------- # # center of mass computation @@ -486,6 +525,8 @@ class TestTile(unittest.TestCase): add_function_test(TestTile, "test_tile_binary_map", test_tile_binary_map, devices=devices) add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices) # FAILS add_function_test(TestTile, "test_tile_gemm", test_tile_gemm, devices=devices) +add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices) # FAILS +add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices) add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices) add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices) add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index ff040d23..f0b60d86 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -19,10 
+19,9 @@ # num threads per-tile TILE_DIM = 64 + @wp.kernel -def tile_sum_kernel(input: wp.array2d(dtype=float), - output: wp.array(dtype=float)): - +def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)): # output tile index i = wp.tid() @@ -33,7 +32,7 @@ def tile_sum_kernel(input: wp.array2d(dtype=float), for j in range(count): a = wp.tile_load(input, i, j, m=1, n=TILE_DIM) - s += wp.tile_sum(a) * 0.5 + s += wp.tile_sum(a) * 0.5 wp.tile_store(output, i, 0, s) @@ -41,7 +40,7 @@ def tile_sum_kernel(input: wp.array2d(dtype=float), def test_tile_reduce_sum(test, device): batch_count = 56 - N = TILE_DIM*3 + N = TILE_DIM * 3 rng = np.random.default_rng(42) input = rng.random((batch_count, N), dtype=np.float32) @@ -51,11 +50,8 @@ def test_tile_reduce_sum(test, device): with wp.Tape() as tape: wp.launch_tiled( - tile_sum_kernel, - dim=[batch_count], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device) + tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) sum_wp = output_wp.numpy() for i in range(batch_count): @@ -66,13 +62,11 @@ def test_tile_reduce_sum(test, device): tape.backward() - assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.e-4) + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4) @wp.kernel -def tile_min_kernel(input: wp.array2d(dtype=float), - output: wp.array(dtype=float)): - +def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)): # output tile index i = wp.tid() @@ -95,11 +89,8 @@ def test_tile_reduce_min(test, device): with wp.Tape() as tape: wp.launch_tiled( - tile_min_kernel, - dim=[batch_count], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device) + tile_min_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) min_wp = output_wp.numpy() for i in range(batch_count): @@ -108,9 +99,7 @@ def test_tile_reduce_min(test, device): @wp.kernel -def tile_max_kernel(input: wp.array2d(dtype=float), - output: wp.array(dtype=float)): - +def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)): # output tile index i = wp.tid() @@ -133,11 +122,8 @@ def test_tile_reduce_max(test, device): with wp.Tape() as tape: wp.launch_tiled( - tile_max_kernel, - dim=[batch_count], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device) + tile_max_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device + ) max_wp = output_wp.numpy() for i in range(batch_count): @@ -146,9 +132,7 @@ def test_tile_reduce_max(test, device): @wp.kernel -def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), - output: wp.array(dtype=float)): - +def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)): # output tile index i = wp.tid() @@ -171,11 +155,12 @@ def test_tile_reduce_custom(test, device): with wp.Tape() as tape: wp.launch_tiled( - tile_reduce_custom_kernel, - dim=[batch_count], - inputs=[input_wp, output_wp], - block_dim=TILE_DIM, - device=device) + tile_reduce_custom_kernel, + dim=[batch_count], + inputs=[input_wp, output_wp], + block_dim=TILE_DIM, + device=device, + ) prod_wp = output_wp.numpy() for i in range(batch_count): @@ -183,8 +168,6 @@ def test_tile_reduce_custom(test, device): test.assertAlmostEqual(prod_wp[i], prod_np, places=4) - - @wp.kernel def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)): # output tile index @@ -222,7 
+205,7 @@ def test_tile_reduce_grouped_sum(test, device): tape.backward() - assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.e-4) + assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4) @wp.kernel @@ -325,7 +308,6 @@ class TestTileReduce(unittest.TestCase): pass - add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices) add_function_test(TestTileReduce, "test_tile_reduce_min", test_tile_reduce_min, devices=devices) add_function_test(TestTileReduce, "test_tile_reduce_max", test_tile_reduce_max, devices=devices) diff --git a/warp/types.py b/warp/types.py index 3cae1be3..7e244863 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2959,12 +2959,21 @@ def array_type_id(a): class Tile: allocation = 0 - def __init__(self, dtype, M, N, op=None, storage="register"): - self.dtype = dtype + def __init__(self, dtype, M, N, op=None, storage="register", layout="rowmajor", owner=True): + self.dtype = type_to_warp(dtype) self.M = M self.N = N self.op = op self.storage = storage + self.layout = layout + + # default to row major layout + if layout == "rowmajor": + self.strides = (N, 1) + elif layout == "colmajor": + self.strides = (1, M) + + self.owner = owner # generates C-type string def ctype(self): @@ -2973,7 +2982,9 @@ def ctype(self): if self.storage == "register": return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" elif self.storage == "shared": - return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" + return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}, {self.strides[0]}, {self.strides[1]}>" + else: + raise RuntimeError(f"Unrecognized tile storage type {self.storage}") # generates C-initializer string def cinit(self, adjoint=False): @@ -2982,6 +2993,11 @@ def cinit(self, adjoint=False): if self.storage == "register": return self.ctype() + "(0.0)" elif self.storage == "shared": + # if this is a reference to another tile + # then don't allocate any memory + if self.owner == False: + return "NULL" + if adjoint: # backward pass requires zeroed memory return f"wp::tile_alloc_zeros<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" From a1b79ccff6968a34583d921fab54774fd3e4eec6 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 30 Sep 2024 20:48:03 +0000 Subject: [PATCH 048/102] Add wp.tile_broadcast() Add wp.tile_matmul() overload with no explicit output variable --- warp/builtins.py | 112 +++++++++++++++++++++++++++++++++++----- warp/codegen.py | 33 ++++++------ warp/context.py | 9 +++- warp/native/tile.h | 87 ++++++++++++++++++++++--------- warp/tests/test_tile.py | 2 +- 5 files changed, 187 insertions(+), 56 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 02d31d31..5491e3de 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1832,8 +1832,10 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st if start is None or stop is None or step is None: raise RuntimeError("wp.tile_arange() arguments must be compile time constants") - if arg_values["dtype"] is not None: + if "dtype" in arg_values: dtype = arg_values["dtype"] + else: + dtype = float return TileRange(dtype=dtype, start=start, stop=stop, step=step) @@ -2224,6 +2226,76 @@ def tile_transpose_value_func(arg_types, arg_values): export=False, ) +def tile_broadcast_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return Tile + + if len(arg_types) != 3: + raise 
RuntimeError("tile_broadcast() requires 1 positional args") + + t = arg_types["a"] + m = arg_values["m"] + n = arg_values["n"] + + if not is_tile(t): + raise RuntimeError("tile_transpose() argument 0 must be a tile") + + # try to broadcast last dimension + if t.N == 1: + stride_n = 0 + elif t.N == n: + stride_n = t.strides[1] + else: + raise RuntimeError(f"Broadcast dimension must be 1 or match destination, shape(src) = {t.m, t.n}, shape(dest) = {m, n}") + + # try to broadcast first dimension + if t.M == 1: + stride_m = 0 + elif t.M == m: + stride_m = t.strides[0] + else: + raise RuntimeError(f"Broadcast dimension must be 1 or match destination, shape(src) = {t.m, t.n}, shape(dest) = {m, n}") + + # force the input tile to shared memory + t.storage = "shared" + + tile_type = Tile(dtype=t.dtype, M=m, N=n, op="broadcast", storage=t.storage, owner=False) + tile_type.strides = (stride_m, stride_n) + + return tile_type + +def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): + + tile = arg_values["a"] + + template_args = [] + template_args.append(return_type.M) + template_args.append(return_type.N) + template_args.append(return_type.strides[0]) + template_args.append(return_type.strides[1]) + + return ((tile,), template_args) + + +add_builtin( + "tile_broadcast", + input_types={"a": Tile(dtype=Any, M=Any, N=Any), "m": int, "n": int}, + value_func=tile_broadcast_value_func, + dispatch_func=tile_broadcast_dispatch_func, + variadic=True, + doc="""Broadcast a tile. + + This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. + + :param a: Tile to broadcast + :returns: Tile with broadcast ``shape=(m, n)``""", + group="Tile Primitives", + export=False, +) + + + def tile_matmul_value_func(arg_types, arg_values): # return generic type (for doc builds) @@ -5306,17 +5378,22 @@ def tile_matmul_generic_value_func(arg_types, arg_values): if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) - if len(arg_types) != 3: - raise RuntimeError("tile_matmul() requires 4 positional args") + a = arg_types["a"] + b = arg_types["b"] - if not is_tile(arg_types["a"]): + if not is_tile(a): raise RuntimeError("tile_matmul() argument 0 must be a tile") - - if not is_tile(arg_types["b"]): + if not is_tile(b): raise RuntimeError("tile_matmul() argument 1 must be an tile") - if not is_tile(arg_types["out"]): - raise RuntimeError("tile_matmul() output argument must be a tile") + # out = wp.tile_matmul(a, b) + if len(arg_types) == 2: + return Tile(dtype=a.dtype, M=a.M, N=b.N, storage="shared") + + # wp.tile_matmul(a, b, out) + elif len(arg_types) == 3: + if not is_tile(arg_types["out"]): + raise RuntimeError("tile_matmul() output argument must be a tile") return None @@ -5324,13 +5401,18 @@ def tile_matmul_generic_value_func(arg_types, arg_values): def tile_matmul_generic_lto_dispatch_func( arg_types: Mapping[str, type], return_type: Any, + return_values: List[Var], arg_values: Mapping[str, Var], options: Mapping[str, Any], builder: warp.context.ModuleBuilder, ): a = arg_values["a"] b = arg_values["b"] - out = arg_values["out"] + + if len(return_values) > 0: + out = return_values[0] + else: + out = arg_values["out"] if any(not is_tile(arg.type) for arg in [a, b, out]): raise RuntimeError("tile_matmul() requires three Tile arguments") @@ -5430,6 +5512,8 @@ def make_transpose(t): lto_code = f.read() builder.ltoirs[lto_symbol] = lto_code + builder.ltoirs_decl[lto_symbol] = f"void 
{lto_symbol}({dtype}, {dtype}*, {dtype}*, {dtype}, {dtype}*);" + return lto_symbol, lto_code def tile_layout_mode(tile): @@ -5461,7 +5545,6 @@ def tile_flip_layout(layout): Var(fun_forward, str, False, True, False), Var(fun_backward_A, str, False, True, False), Var(fun_backward_B, str, False, True, False), - Var(dtype, str, False, True, False), a, b, out, @@ -5473,10 +5556,10 @@ def tile_flip_layout(layout): add_builtin( "tile_matmul", - input_types={"a": Tile, "b": Tile, "out": Tile}, + input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any), "out": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_matmul_generic_value_func, lto_dispatch_func=tile_matmul_generic_lto_dispatch_func, - variadic=True, + variadic=False, doc="""Computes the matrix product and accumulates ``out += a*b``. Supported datatypes are: @@ -5497,10 +5580,10 @@ def tile_flip_layout(layout): add_builtin( "tile_matmul", - input_types={"a": Tile, "b": Tile}, + input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_matmul_generic_value_func, lto_dispatch_func=tile_matmul_generic_lto_dispatch_func, - variadic=True, + variadic=False, doc="""Computes the matrix product ``out = a*b``. Supported datatypes are: @@ -5542,6 +5625,7 @@ def tile_fft_generic_value_func(arg_types, arg_values): def tile_fft_generic_lto_dispatch_func( arg_types: Mapping[str, type], return_type: Any, + return_values: List[Var], arg_values: Mapping[str, Var], options: Mapping[str, Any], builder: warp.context.ModuleBuilder, diff --git a/warp/codegen.py b/warp/codegen.py index 697f3d33..f347c2fc 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -1266,6 +1266,23 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): bound_arg_values, ) + # immediately allocate output variables so we can pass them into the dispatch method + if return_type is None: + # void function + output = None + output_list = [] + elif not isinstance(return_type, Sequence) or len(return_type) == 1: + # single return value function + if isinstance(return_type, Sequence): + return_type = return_type[0] + output = adj.add_var(return_type) + output_list = [output] + else: + # multiple return value function + output = [adj.add_var(v) for v in return_type] + output_list = output + + # If we have a built-in that requires special handling to dispatch # the arguments to the underlying C++ function, then we can resolve # these using the `dispatch_func`. Since this is only called from @@ -1275,7 +1292,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): # a literal value or references a variable. 
if func.lto_dispatch_func is not None: func_args, template_args, ltoirs = func.lto_dispatch_func( - func.input_types, return_type, bound_args, options=adj.builder_options, builder=adj.builder + func.input_types, return_type, output_list, bound_args, options=adj.builder_options, builder=adj.builder ) elif func.dispatch_func is not None: func_args, template_args = func.dispatch_func(func.input_types, return_type, bound_args) @@ -1300,10 +1317,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): if return_type is None: # handles expression (zero output) functions, e.g.: void do_something(); - - output = None - output_list = [] - forward_call = ( f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" ) @@ -1313,12 +1326,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): elif not isinstance(return_type, Sequence) or len(return_type) == 1: # handle simple function (one output) - - if isinstance(return_type, Sequence): - return_type = return_type[0] - output = adj.add_var(return_type) - output_list = [output] - forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" replay_call = forward_call @@ -1327,10 +1334,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): else: # handle multiple value functions - - output = [adj.add_var(v) for v in return_type] - output_list = output - forward_call = ( f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args + output, use_initializer_list)});" ) diff --git a/warp/context.py b/warp/context.py index ff92f0e3..a66577fd 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1541,7 +1541,8 @@ def __init__(self, module, options, hasher=None): self.options = options self.module = module self.deferred_functions = [] - self.ltoirs = {} # map from lto symbol to lto binary + self.ltoirs = {} # map from lto symbol to lto binary + self.ltoirs_decl = {} # map from lto symbol to lto forward declaration if hasher is None: hasher = ModuleHasher(module) @@ -1612,6 +1613,12 @@ def value_type(arg_types, arg_values): def codegen(self, device): source = "" + # code-gen LTO forward declarations + source += 'extern "C" {\n' + for fwd in self.ltoirs_decl.values(): + source += fwd + "\n" + source += '}\n' + # code-gen structs visited_structs = set() for struct in self.structs.keys(): diff --git a/warp/native/tile.h b/warp/native/tile.h index a856b643..7910c21d 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -380,6 +380,14 @@ struct tile_shared_t copy_from_global(t.array, t.x, t.y); } + // assign from a register tile + template + inline CUDA_CALLABLE auto& operator=(const Tile& t) + { + assign(t); + return *this; + } + // construct from another shared tile, this constructor // is invoked for reshape operations like `wp.tile_transpose()` template @@ -738,9 +746,9 @@ inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step) return out; } -template +template inline CUDA_CALLABLE void adj_tile_arange(int start, int stop, int step, - int adj_start, int adj_stop, int adj_step, const tile_register_t& adj_ret) {} + int adj_start, int adj_stop, int adj_step, AdjTile& adj_ret) {} // entry point for load template @@ -1048,29 +1056,45 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ adj_t.adj_extract(i, j, adj_ret); } -// But cuBLASDx follows the BLAS convention: matrices are col-major, so we swap A & B in the code below +// cuBLASDx follows the BLAS convention: 
matrices are col-major, so we swap A & B in the code below +template +TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C) +{ + using T = typename TileA::Type; -#define tile_matmul(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C) \ - do { \ - void fun_forward(dtype, dtype*, dtype*, dtype, dtype*); \ - WP_TILE_SYNC(); \ - fun_forward(dtype(1.0), B.data, A.data, dtype(1.0), C.data); \ - WP_TILE_SYNC(); \ - } while (0) + WP_TILE_SYNC(); + fun_forward(T(1.0), B.data, A.data, T(1.0), C.data); + WP_TILE_SYNC(); + + return C; +} + +// backward for the wp.tile_matmul(a, b, out) syntax +template +void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C, + Fwd adj_fun_forward, AdjA adj_fun_backward_A, AdjB adj_fun_backward_B, TileA& adj_A, TileB& adj_B, TileC& adj_C) +{ + using T = typename TileA::Type; + + WP_TILE_SYNC(); + fun_backward_A(T(1.0), B.data, adj_C.data, T(1.0), adj_A.data); + fun_backward_B(T(1.0), adj_C.data, A.data, T(1.0), adj_B.data); + WP_TILE_SYNC(); +} + +// backward for the out = wp.tile_matmul(a, b) syntax +template +void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C, + Fwd adj_fun_forward, AdjA adj_fun_backward_A, AdjB adj_fun_backward_B, TileA& adj_A, TileB& adj_B, TileC& adj_C, TileC& adj_ret) +{ + using T = typename TileA::Type; + + WP_TILE_SYNC(); + fun_backward_A(T(1.0), B.data, adj_C.data, T(1.0), adj_A.data); + fun_backward_B(T(1.0), adj_C.data, A.data, T(1.0), adj_B.data); + WP_TILE_SYNC(); +} -// adj_fun_forward, adj_fun_backward_A, adj_fun_backward_B, adj_dtype are in practice ignored -// but are here because builtins.py creates them even though those are effectively compile time constants -#define adj_tile_matmul(fun_forward, fun_backward_A, fun_backward_B, dtype, A, B, C, \ - adj_fun_forward, adj_fun_backward_A, adj_fun_backward_B, adj_dtype, \ - adjA, adjB, adjC) \ - do { \ - void fun_backward_A(dtype, dtype*, dtype*, dtype, dtype*); \ - void fun_backward_B(dtype, dtype*, dtype*, dtype, dtype*); \ - WP_TILE_SYNC(); \ - fun_backward_A(dtype(1.0), B.data, adjC.data, dtype(1.0), adjA.data); \ - fun_backward_B(dtype(1.0), adjC.data, A.data, dtype(1.0), adjB.data); \ - WP_TILE_SYNC(); \ - } while (0) #define tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \ do { \ @@ -1112,11 +1136,24 @@ inline CUDA_CALLABLE auto tile_transpose(Tile& t) template inline CUDA_CALLABLE void adj_tile_transpose(Tile& t, Tile& adj_t, AdjTile& adj_ret) { - auto a = adj_t.copy_to_register(); - auto b = t.copy_to_register(); + auto a = tile_transpose(adj_ret); + auto b = adj_t; adj_t.assign(tile_add(a,b)); } +template +inline CUDA_CALLABLE auto tile_broadcast(Tile& t) +{ + // alias incoming tile with new strides + return tile_shared_t(t.data); +} + +template +inline CUDA_CALLABLE void adj_tile_broadcast(Tile& t, Tile& adj_t, AdjTile& adj_ret) +{ + // todo: +} + } // namespace wp diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index fcd394aa..e94521e0 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -534,4 +534,4 @@ class TestTile(unittest.TestCase): if __name__ == "__main__": wp.clear_kernel_cache() - unittest.main(verbosity=2) + unittest.main(verbosity=2, failfast=True) From f92d9e9396b08fa4bff8bfa427588e77076639fe Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 1 Oct 2024 02:30:25 +0000 Subject: [PATCH 049/102] Add tests for wp.tile_broadcast() 
Add support for 1D tile loads --- warp/builtins.py | 148 +++++++++++++++++++++++++------ warp/native/tile.h | 190 ++++++++++++++++++++++++++++++++++------ warp/tests/test_tile.py | 80 +++++++++++++++-- 3 files changed, 356 insertions(+), 62 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 5491e3de..23dbfcca 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1883,29 +1883,82 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a export=False, ) - -def tile_load_value_func(arg_types, arg_values): +def tile_load_1d_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: return Tile(dtype=Any, M=Any, N=Any) - # if len(arg_types) != 3: - # raise RuntimeError("tile_load() requires 3 positional args") + if not is_array(arg_types["a"]): + raise RuntimeError("tile_load() argument 0 must be an array") + + if arg_types["a"].ndim != 1: + raise RuntimeError("tile_load() argument 0 must be 1-dimensional if using the ``wp.tile_load(array, i, n)`` syntax.") + + if not type_is_int(arg_types["i"]): + raise RuntimeError("tile_load() argument 1 must be an integer") + + if "n" not in arg_values: + raise RuntimeError("'n' keyword argument must be specified when calling tile_load() function") + + a = arg_types["a"] + m, n = 1, arg_values["n"] + + return TileLoad(a, 1, n) + + +def tile_load_1d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): + array = arg_values["a"] + i = arg_values["i"] + n = arg_values["n"].constant + dtype = arg_values["a"].type.dtype + + template_args = [] + template_args.append(dtype) + template_args.append(n) + + return ((array, i), template_args) + +add_builtin( + "tile_load", + input_types={"a": array(dtype=Any), "i": int, "n": int}, + value_func=tile_load_1d_value_func, + dispatch_func=tile_load_1d_dispatch_func, + variadic=False, + doc="""Loads a 1D tile from a global memory array. + + This method will cooperatively load a tile from global memory using all threads in the block. 
+ + :param a: The source array in global memory + :param i: Offset in the source array measured in multiples of ``n``, i.e.: ``offset=i*n`` + :param n: The number of elements in the tile + :returns: A tile with ``shape=(1,n)`` and dtype the same as the source array""", + group="Tile Primitives", + export=False, +) + + +def tile_load_2d_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return Tile(dtype=Any, M=Any, N=Any) if not is_array(arg_types["a"]): raise RuntimeError("tile_load() argument 0 must be an array") - if not type_is_int(arg_types["x"]): + if arg_types["a"].ndim != 2: + raise RuntimeError("tile_load() argument 0 must be 2-dimensional if using the ``wp.tile_load(array, i, j, m, n)`` syntax.") + + if not type_is_int(arg_types["i"]): raise RuntimeError("tile_load() argument 1 must be an integer") - if not type_is_int(arg_types["y"]): + if not type_is_int(arg_types["j"]): raise RuntimeError("tile_load() argument 1 must be an integer") if "m" not in arg_values: - raise RuntimeError("'m' keyword argument must be specified when calling tile_zeros() function") + raise RuntimeError("'m' keyword argument must be specified when calling tile_load() function") if "n" not in arg_values: - raise RuntimeError("'n' keyword argument must be specified when calling tile_zeros() function") + raise RuntimeError("'n' keyword argument must be specified when calling tile_load() function") a = arg_types["a"] m, n = arg_values["m"], arg_values["n"] @@ -1913,9 +1966,9 @@ def tile_load_value_func(arg_types, arg_values): return TileLoad(a, m, n) -def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): +def tile_load_2d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): array = arg_values["a"] - x, y = arg_values["x"], arg_values["y"] + i, j = arg_values["i"], arg_values["j"] m, n = arg_values["m"].constant, arg_values["n"].constant dtype = arg_values["a"].type.dtype @@ -1924,31 +1977,70 @@ def tile_load_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg template_args.append(m) template_args.append(n) - return ((array, x, y), template_args) + return ((array, i, j), template_args) add_builtin( "tile_load", - input_types={"a": array(dtype=Any), "x": int, "y": int, "m": int, "n": int}, - value_func=tile_load_value_func, - dispatch_func=tile_load_dispatch_func, - variadic=True, - doc="""Loads a tile from a global memory array. + input_types={"a": array(dtype=Any), "i": int, "j": int, "m": int, "n": int}, + value_func=tile_load_2d_value_func, + dispatch_func=tile_load_2d_dispatch_func, + variadic=False, + doc="""Loads a 2D tile from a global memory array. This method will cooperatively load a tile from global memory using all threads in the block. 
:param a: The source array in global memory - :param x: Offset in the source array measured in multiples of ``m``, i.e.: ``i=x*m`` - :param y: Offset in the source array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param i: Offset in the source array measured in multiples of ``m``, i.e.: ``row=i*m`` + :param j: Offset in the source array measured in multiples of ``n``, i.e.; ``col=j*n`` :param m: The size of the tile's first dimension - :param n: The size of the tile's second dimensions + :param n: The size of the tile's second dimension :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array""", group="Tile Primitives", export=False, ) -def tile_store_value_func(arg_types, arg_values): +def tile_store_1d_value_func(arg_types, arg_values): + # return generic type (for doc builds) + if arg_types is None: + return None + + if len(arg_types) != 3: + raise RuntimeError("tile_store() requires 3 positional args") + + if not is_array(arg_types["a"]): + raise RuntimeError("tile_store() argument 0 must be an array") + + if not type_is_int(arg_types["i"]): + raise RuntimeError("tile_store() argument 1 must be an integer") + + if not is_tile(arg_types["t"]): + raise RuntimeError("tile_store() argument 2 must be a tile") + + if not types_equal(arg_types["a"].dtype, arg_types["t"].dtype): + raise RuntimeError("tile_store() destination array must have same type as source tile") + + return None + + +add_builtin( + "tile_store", + input_types={"a": array(dtype=Any), "i": int, "t": Any}, + value_func=tile_store_1d_value_func, + variadic=False, + doc="""Stores a 1D tile to a global memory array. + + This method will cooperatively store a tile to global memory using all threads in the block. + + :param a: The destination array in global memory + :param i: Offset in the destination array measured in multiples of ``n``, i.e.: ``offset=i*n`` + :param t: The source tile to store data from, must have the same dtype as the destination array""", + group="Tile Primitives", + export=False, +) + +def tile_store_2d_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: return None @@ -1959,10 +2051,10 @@ def tile_store_value_func(arg_types, arg_values): if not is_array(arg_types["a"]): raise RuntimeError("tile_store() argument 0 must be an array") - if not type_is_int(arg_types["x"]): + if not type_is_int(arg_types["i"]): raise RuntimeError("tile_store() argument 1 must be an integer") - if not type_is_int(arg_types["y"]): + if not type_is_int(arg_types["j"]): raise RuntimeError("tile_store() argument 2 must be an integer") if not is_tile(arg_types["t"]): @@ -1976,16 +2068,16 @@ def tile_store_value_func(arg_types, arg_values): add_builtin( "tile_store", - input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, - value_func=tile_store_value_func, - variadic=True, + input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Any}, + value_func=tile_store_2d_value_func, + variadic=False, doc="""Stores a tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. 
:param a: The destination array in global memory - :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*m`` - :param y: Offset in the destination array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param i: Offset in the destination array measured in multiples of ``m``, i.e.: ``row=i*m`` + :param j: Offset in the destination array measured in multiples of ``n``, i.e.; ``col=j*n`` :param t: The source tile to store data from, must have the same dtype as the destination array""", group="Tile Primitives", export=False, @@ -2239,7 +2331,7 @@ def tile_broadcast_value_func(arg_types, arg_values): n = arg_values["n"] if not is_tile(t): - raise RuntimeError("tile_transpose() argument 0 must be a tile") + raise RuntimeError("tile_broadcast() argument 0 must be a tile") # try to broadcast last dimension if t.N == 1: diff --git a/warp/native/tile.h b/warp/native/tile.h index 7910c21d..1f3b5119 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -88,6 +88,8 @@ [ ] warp.sim (CRBA) [ ] Batched MLP [ ] Layer norm + [ ] FNO + Burgers equation + [ ] Stochastic financial modeling [ ] Convolution: https://github.com/NVIDIA/MinkowskiEngine/blob/master/src/convolution_kernel.cu#L123 [ ] MeshCNN (Modulus, Oliver) [ ] BioNemo (Ali) @@ -142,7 +144,7 @@ struct coord_t // represents a tile stored in global memory with dynamic strides // only used to represent the source for tile loads to register/shared -template +template struct tile_global_t { using Type = T; @@ -183,20 +185,16 @@ struct tile_register_t data[i] = value; } - inline CUDA_CALLABLE tile_register_t(tile_global_t& t) - { - // construct from a global tile - copy_from_global(t.data, t.x, t.y); - } - - - inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) + inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) { - // assign from a global tile - copy_from_global(t.data, t.x, t.y); + if (t.data.ndim == 1) + copy_from_global(t.data, t.x); // 1d load + else + copy_from_global(t.data, t.x, t.y); // 2d load + return *this; - } + } inline CUDA_CALLABLE T& operator()(int index) { @@ -288,11 +286,34 @@ struct tile_register_t // return the in-register version of this tile (nop) - inline CUDA_CALLABLE auto& copy_to_register() { return *this; } + inline CUDA_CALLABLE auto& copy_to_register() + { + return *this; + } + + void copy_to_global(array_t dest, int x) + { + assert(dest.ndim == 1); + const int tile_i = x*N; + + WP_PRAGMA_UNROLL + for (int i=0; i < NumRegs; ++i) + { + // handle case where tile size is not + // aligned to block dimensions + int linear = index(i); + if (!Aligned && linear >= Size) + break; + + wp::index(dest, tile_i + linear) = data[i]; + } + } void copy_to_global(array_t dest, int x, int y) { + assert(dest.ndim == 2); + const int tile_i = x*M; const int tile_j = y*N; @@ -317,6 +338,22 @@ struct tile_register_t } } + inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x) + { + // todo: use async pipelines or TMA here + const int tile_i = x*N; + + WP_PRAGMA_UNROLL + for (int i=0; i < NumRegs; ++i) + { + int linear = index(i); + if (!Aligned && linear >= Size) + break; + + data[i] = wp::index(src, tile_i + linear); + } + } + inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x, int y) { // todo: use async pipelines or TMA here @@ -374,12 +411,6 @@ struct tile_shared_t { } - // construct from a global tile - inline CUDA_CALLABLE tile_shared_t(tile_global_t& t) - { - copy_from_global(t.array, t.x, t.y); - } - // assign from a register tile template inline 
CUDA_CALLABLE auto& operator=(const Tile& t) @@ -405,9 +436,13 @@ struct tile_shared_t } // assign from a global tile - inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) - { - copy_from_global(t.data, t.x, t.y); + inline CUDA_CALLABLE auto& operator=(const tile_global_t& t) + { + if (t.data.ndim == 1) + copy_from_global(t.data, t.x); // 1d load + else + copy_from_global(t.data, t.x, t.y); // 2d load + return *this; } @@ -549,6 +584,21 @@ struct tile_shared_t return out; } + inline CUDA_CALLABLE void copy_to_global(array_t dest, int x) + { + assert(dest.ndim == 1); + + // todo: use TMA here + const int tile_i = x*N; + + WP_PRAGMA_UNROLL + for (int i=threadIdx.x; i < Size; i += WP_TILE_BLOCK_DIM) + { + coord_t c = coord(i); + wp::index(dest, tile_i + linear) = (*this)(c.i, c.j); + } + } + inline CUDA_CALLABLE void copy_to_global(array_t dest, int x, int y) { // todo: use TMA here @@ -570,6 +620,18 @@ struct tile_shared_t } } + inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x) + { + // todo: use async pipelines or TMA here + const int tile_i = x*N; + + WP_PRAGMA_UNROLL + for (int i=threadIdx.x; i < Size; i += WP_TILE_BLOCK_DIM) + { + (*this)(i) = wp::index(src, tile_i + i); + } + } + inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x, int y) { // todo: use async pipelines or TMA here @@ -750,17 +812,29 @@ template inline CUDA_CALLABLE void adj_tile_arange(int start, int stop, int step, int adj_start, int adj_stop, int adj_step, AdjTile& adj_ret) {} -// entry point for load +// entry point for 1d load +template +inline CUDA_CALLABLE auto tile_load(array_t& src, int x) +{ + return tile_global_t(src, x, 0); +} + +// entry point for 2d load template inline CUDA_CALLABLE auto tile_load(array_t& src, int x, int y) { - // just return a ref. 
to the global memory - // it will be loaded to shared or registers - // on assignment to the variable - return tile_global_t(src, x, y); + return tile_global_t(src, x, y); } -// entry point for store +// entry point for 1d store +template +inline CUDA_CALLABLE void tile_store(array_t& dest, int x, Tile& src) +{ + // dispatch to tile type + src.copy_to_global(dest, x); +} + +// entry point for 2d store template inline CUDA_CALLABLE void tile_store(array_t& dest, int x, int y, Tile& src) { @@ -800,6 +874,36 @@ inline CUDA_CALLABLE auto tile_atomic_add(array_t& dest, int x, int y, Tile& //------------------------------------- // Adjoints +template +inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, + array_t& adj_src, int adj_x, + AdjTile& adj_ret) +{ + // early out + // if (!src.grad) + // return; + + auto adj_reg = adj_ret.copy_to_register(); + + const int tile_i = x*adj_reg.N; + + // add gradients to src array + WP_PRAGMA_UNROLL + for (int i=0; i < adj_reg.NumRegs; ++i) + { + int linear = adj_reg.index(i); + if (!adj_reg.Aligned && linear >= adj_reg.Size) + break; + + auto grad = adj_reg.data[i]; + + if (adj_src.data) + adj_atomic_add(&index(adj_src, tile_i + linear), grad); + else if (src.grad) + adj_atomic_add(&index_grad(src, tile_i + linear), grad); + } +} + template inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, array_t& adj_src, int adj_x, int adj_y, @@ -833,6 +937,36 @@ inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, } } + +template +inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, Tile& t, array_t& adj_dest, int adj_x, AdjTile& adj_t) +{ + // if (!dest.grad) + // return; + + // convert to register if necessary + auto adj_reg = adj_t.copy_to_register(); + + const int tile_i = x*adj_reg.N; + + // load gradients from output + WP_PRAGMA_UNROLL + for (int i=0; i < adj_reg.NumRegs; ++i) + { + int linear = adj_reg.index(i); + if (!adj_reg.Aligned && linear >= adj_reg.Size) + break; + + if (adj_dest.data) + adj_reg.data[i] += index(adj_dest, tile_i + linear); + else if (dest.grad) + adj_reg.data[i] += index_grad(dest, tile_i + linear); + } + + // store adjoint back to tile + adj_t.assign(adj_reg); +} + template inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, AdjTile& adj_t) { diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index e94521e0..aceff12e 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -21,9 +21,46 @@ # num threads per-tile TILE_DIM = 64 +@wp.kernel +def tile_copy_1d_kernel(A: wp.array(dtype=float), B: wp.array(dtype=float)): + # tile index + i = wp.tid() + + a = wp.tile_load(A, i, n=TILE_N) + wp.tile_store(B, i, a) + + +def test_tile_copy_1d(test, device): + rng = np.random.default_rng(42) + + N = TILE_N * 5 + + A = rng.random((N), dtype=np.float32) + B = rng.random((N), dtype=np.float32) + + A_wp = wp.array(A, requires_grad=True, device=device) + B_wp = wp.array(B, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_copy_1d_kernel, + dim=[int(N / TILE_N)], + inputs=[A_wp, B_wp], + block_dim=TILE_DIM, + device=device, + ) + + # verify forward pass + assert_array_equal(B_wp, A_wp) + + # verify backward pass + B_wp.grad = wp.ones_like(B_wp, device=device) + tape.backward() + + assert_array_equal(B_wp.grad, A_wp.grad) @wp.kernel -def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)): +def tile_copy_2d_kernel(A: wp.array2d(dtype=float), B: 
wp.array2d(dtype=float)): # tile index i, j = wp.tid() @@ -31,7 +68,7 @@ def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)): wp.tile_store(B, i, j, a) -def test_tile_copy(test, device): +def test_tile_copy_2d(test, device): rng = np.random.default_rng(42) M = TILE_M * 7 @@ -45,7 +82,7 @@ def test_tile_copy(test, device): with wp.Tape() as tape: wp.launch_tiled( - tile_copy, + tile_copy_2d_kernel, dim=[int(M / TILE_M), int(N / TILE_N)], inputs=[A_wp, B_wp], block_dim=TILE_DIM, @@ -434,6 +471,35 @@ def test_tile_transpose_matmul(test, device): assert_np_equal(output.numpy(), input.numpy().T @ input.numpy()) +@wp.kernel +def test_tile_broadcast_kernel( + input_a: wp.array2d(dtype=float), + input_b: wp.array(dtype=float), + output: wp.array2d(dtype=float)): + + a = wp.tile_load(input_a, 0, 0, m=10, n=10) + b = wp.tile_load(input_b, 0, n=10) + + c = wp.tile_broadcast(b, 10, 10) + d = a + c + + wp.tile_store(output, 0, 0, d) + +def test_tile_broadcast(test, device): + + M = 10 + N = 10 + + a = wp.array(np.ones((M,N), dtype=np.float32), device=device) + b = wp.array(np.arange(0, N, dtype=np.float32), device=device) + out = wp.zeros((M,N), dtype=float, device=device) + + wp.launch_tiled(test_tile_broadcast_kernel, dim=[1], inputs=[a, b, out], block_dim=32) + + assert_np_equal(out.numpy(), a.numpy() + b.numpy()) + + + # #----------------------------------------- # # center of mass computation @@ -520,16 +586,18 @@ class TestTile(unittest.TestCase): pass -add_function_test(TestTile, "test_tile_copy", test_tile_copy, devices=devices) +add_function_test(TestTile, "test_tile_copy_1d", test_tile_copy_1d, devices=devices) +add_function_test(TestTile, "test_tile_copy_2d", test_tile_copy_2d, devices=devices) add_function_test(TestTile, "test_tile_unary_map", test_tile_unary_map, devices=devices) add_function_test(TestTile, "test_tile_binary_map", test_tile_binary_map, devices=devices) -add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices) # FAILS +add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices) add_function_test(TestTile, "test_tile_gemm", test_tile_gemm, devices=devices) -add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices) # FAILS +add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices) add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices) add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices) add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices) add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices) +add_function_test(TestTile, "test_tile_broadcast", test_tile_broadcast, devices=devices) if __name__ == "__main__": From 226770e438d54344d3dcbbc32ff53f83b8ab22be Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 1 Oct 2024 04:34:51 +0000 Subject: [PATCH 050/102] Add support for gradients through broadcasting --- warp/builtins.py | 9 ++++ warp/native/tile.h | 101 ++++++++++++++++++++++++++++++---------- warp/tests/test_tile.py | 36 ++++++++++++-- warp/types.py | 14 +++--- 4 files changed, 124 insertions(+), 36 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 23dbfcca..3747777c 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2012,6 +2012,9 @@ def tile_store_1d_value_func(arg_types, arg_values): if not is_array(arg_types["a"]): raise RuntimeError("tile_store() argument 0 
must be an array") + if arg_types["a"].ndim != 1: + raise RuntimeError("tile_load() argument 0 must be a 1-dimensional array if using the ``wp.tile_store(array, i, t)`` syntax.") + if not type_is_int(arg_types["i"]): raise RuntimeError("tile_store() argument 1 must be an integer") @@ -2029,6 +2032,7 @@ def tile_store_1d_value_func(arg_types, arg_values): input_types={"a": array(dtype=Any), "i": int, "t": Any}, value_func=tile_store_1d_value_func, variadic=False, + skip_replay=True, doc="""Stores a 1D tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. @@ -2051,6 +2055,9 @@ def tile_store_2d_value_func(arg_types, arg_values): if not is_array(arg_types["a"]): raise RuntimeError("tile_store() argument 0 must be an array") + if arg_types["a"].ndim != 2: + raise RuntimeError("tile_load() argument 0 must be a 2-dimensional array if using the ``wp.tile_store(array, i, j, t)`` syntax.") + if not type_is_int(arg_types["i"]): raise RuntimeError("tile_store() argument 1 must be an integer") @@ -2071,6 +2078,7 @@ def tile_store_2d_value_func(arg_types, arg_values): input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Any}, value_func=tile_store_2d_value_func, variadic=False, + skip_replay=True, doc="""Stores a tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. @@ -2115,6 +2123,7 @@ def tile_atomic_add_value_func(arg_types, arg_values): input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, value_func=tile_atomic_add_value_func, variadic=True, + skip_replay=True, doc="""Atomically add a tile to the array `a`, each element will be updated atomically. :param a: Array in global memory, should have the same ``dtype`` as the input tile diff --git a/warp/native/tile.h b/warp/native/tile.h index 1f3b5119..e8b9bd8d 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -235,6 +235,12 @@ struct tile_register_t data[i] = tile.data[i]; } + inline CUDA_CALLABLE void add(const tile_register_t& tile) + { + for (int i=0; i < NumRegs; ++i) + data[i] += tile.data[i]; + } + inline CUDA_CALLABLE void zero() { for (int i=0; i < NumRegs; ++i) @@ -384,6 +390,15 @@ struct tile_register_t } }; +// helper to allocate a register tile like another tile +template +auto tile_register_like() +{ + using T = typename Tile::Type; + + return tile_register_t(T(0.0)); +} + template @@ -540,6 +555,25 @@ struct tile_shared_t } } + inline CUDA_CALLABLE void add(const tile_register_t& tile) + { + WP_PRAGMA_UNROLL + for (int i=0; i < tile.NumRegs; ++i) + { + const int linear = tile.index(i); + + // handle case where tile size is not + // aligned to block dimensions + if (!Aligned && linear >= Size) + break; + + // use shared memory atomics to accumulate gradients + // since for broadcast tiles multiple incoming values + // may map to a single location in shared memory + atomic_add(&(*this)(linear), tile.data[i]); + } + } + inline CUDA_CALLABLE void print() { WP_TILE_SYNC(); @@ -594,8 +628,7 @@ struct tile_shared_t WP_PRAGMA_UNROLL for (int i=threadIdx.x; i < Size; i += WP_TILE_BLOCK_DIM) { - coord_t c = coord(i); - wp::index(dest, tile_i + linear) = (*this)(c.i, c.j); + wp::index(dest, tile_i + i) = (*this)(i); } } @@ -712,15 +745,18 @@ inline CUDA_CALLABLE auto tile_alloc_empty() return tile_shared_t(data); } -template +template inline CUDA_CALLABLE auto tile_alloc_zeros() { - WP_TILE_SHARED __align__(16) T data[M*N]; + // compute the total storage required for the tile (may 
be different from M*N) for broadcast tiles + constexpr int Len = (M-1)*StrideM + (N-1)*StrideN + 1; + + WP_TILE_SHARED __align__(16) T data[Len]; - for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + for (int i=threadIdx.x; i < Len; i+= WP_TILE_BLOCK_DIM) data[i] = T(0); - return tile_shared_t(data); + return tile_shared_t(data); } @@ -808,9 +844,9 @@ inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step) return out; } -template -inline CUDA_CALLABLE void adj_tile_arange(int start, int stop, int step, - int adj_start, int adj_stop, int adj_step, AdjTile& adj_ret) {} +template +inline CUDA_CALLABLE void adj_tile_arange(T start, T stop, T step, + T& adj_start, T& adj_stop, T& adj_step, AdjTile& adj_ret) {} // entry point for 1d load template @@ -945,7 +981,7 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, Tile& t, array // return; // convert to register if necessary - auto adj_reg = adj_t.copy_to_register(); + tile_register_t adj_reg; const int tile_i = x*adj_reg.N; @@ -958,13 +994,13 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, Tile& t, array break; if (adj_dest.data) - adj_reg.data[i] += index(adj_dest, tile_i + linear); + adj_reg.data[i] = index(adj_dest, tile_i + linear); else if (dest.grad) - adj_reg.data[i] += index_grad(dest, tile_i + linear); + adj_reg.data[i] = index_grad(dest, tile_i + linear); } // store adjoint back to tile - adj_t.assign(adj_reg); + adj_t.add(adj_reg); } template @@ -974,7 +1010,7 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t // return; // convert to register if necessary - auto adj_reg = adj_t.copy_to_register(); + tile_register_t adj_reg; const int tile_i = x*adj_reg.M; const int tile_j = y*adj_reg.N; @@ -990,13 +1026,13 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t coord_t coord = adj_reg.coord(linear); if (adj_dest.data) - adj_reg.data[i] += index(adj_dest, tile_i + coord.i, tile_j + coord.j); + adj_reg.data[i] = index(adj_dest, tile_i + coord.i, tile_j + coord.j); else if (dest.grad) - adj_reg.data[i] += index_grad(dest, tile_i + coord.i, tile_j + coord.j); + adj_reg.data[i] = index_grad(dest, tile_i + coord.i, tile_j + coord.j); } // store adjoint back to tile - adj_t.assign(adj_reg); + adj_t.add(adj_reg); } template @@ -1023,6 +1059,7 @@ inline CUDA_CALLABLE auto tile_map(Fwd op, return out; } + template inline CUDA_CALLABLE void adj_tile_map(Fwd op, Tile& a, @@ -1031,7 +1068,7 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, AdjTile& adj_ret) { auto a_reg = a.copy_to_register(); - auto adj_a_reg = adj_a.copy_to_register(); + auto adj_a_reg = tile_register_like(); auto adj_ret_reg = adj_ret.copy_to_register(); WP_PRAGMA_UNROLL @@ -1041,7 +1078,7 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, } // write adjoints back - adj_a.assign(adj_a_reg); + adj_a.add(adj_a_reg); } // binary map @@ -1062,6 +1099,7 @@ inline CUDA_CALLABLE auto tile_map(Fwd op, return out; } + template inline CUDA_CALLABLE void adj_tile_map(Fwd op, TileA &a, @@ -1073,8 +1111,11 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, { auto a_reg = a.copy_to_register(); auto b_reg = b.copy_to_register(); - auto adj_a_reg = adj_a.copy_to_register(); - auto adj_b_reg = adj_b.copy_to_register(); + + // allocate storage for adjoints + auto adj_a_reg = tile_register_like(); + auto adj_b_reg = tile_register_like(); + auto adj_ret_reg = adj_ret.copy_to_register(); WP_PRAGMA_UNROLL @@ -1083,8 +1124,8 @@ inline CUDA_CALLABLE void adj_tile_map(Fwd op, adj_op(a_reg.data[i], 
b_reg.data[i], adj_a_reg.data[i], adj_b_reg.data[i], adj_ret_reg.data[i]); } - adj_a.assign(adj_a_reg); - adj_b.assign(adj_b_reg); + adj_a.add(adj_a_reg); + adj_b.add(adj_b_reg); } // wrap the operator in a lambda so that we don't have to do overload resolution for things like e.g.: wp.sin() @@ -1286,8 +1327,18 @@ inline CUDA_CALLABLE auto tile_broadcast(Tile& t) template inline CUDA_CALLABLE void adj_tile_broadcast(Tile& t, Tile& adj_t, AdjTile& adj_ret) { - // todo: -} + constexpr int LenTile = (Tile::M-1)*Tile::StrideM + (Tile::N-1)*Tile::StrideN + 1; + constexpr int LenAdjTile = (AdjTile::M-1)*AdjTile::StrideM + (AdjTile::N-1)*AdjTile::StrideN + 1; + static_assert(LenTile == LenAdjTile); + + // since the incoming adjoint will have the same physical storage + // as the original tile (just with different strides and expanded dimensions), + // we can simply update the gradient element by element + for (int i=threadIdx.x; i < LenTile; i+=WP_TILE_BLOCK_DIM) + { + adj_t.data[i] += adj_ret.data[i]; + } +} } // namespace wp diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index aceff12e..8b1d3157 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -359,7 +359,7 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) s = wp.tile_sum(a) * 0.5 - wp.tile_store(output, i, 0, s) + wp.tile_store(output, i, s) def test_tile_sum(test, device): @@ -472,7 +472,7 @@ def test_tile_transpose_matmul(test, device): @wp.kernel -def test_tile_broadcast_kernel( +def test_tile_broadcast_add_kernel( input_a: wp.array2d(dtype=float), input_b: wp.array(dtype=float), output: wp.array2d(dtype=float)): @@ -485,7 +485,7 @@ def test_tile_broadcast_kernel( wp.tile_store(output, 0, 0, d) -def test_tile_broadcast(test, device): +def test_tile_broadcast_add(test, device): M = 10 N = 10 @@ -494,11 +494,36 @@ def test_tile_broadcast(test, device): b = wp.array(np.arange(0, N, dtype=np.float32), device=device) out = wp.zeros((M,N), dtype=float, device=device) - wp.launch_tiled(test_tile_broadcast_kernel, dim=[1], inputs=[a, b, out], block_dim=32) + wp.launch_tiled(test_tile_broadcast_add_kernel, dim=[1], inputs=[a, b, out], block_dim=32) assert_np_equal(out.numpy(), a.numpy() + b.numpy()) +@wp.kernel +def test_tile_broadcast_grad_kernel( + a: wp.array(dtype=float), + b: wp.array2d(dtype=float)): + + x = wp.tile_load(a, i=0, n=5) + y = wp.tile_broadcast(x, m=5, n=5) + + w = wp.tile_ones(dtype=float, m=5, n=5) + z = w + y + + wp.tile_store(b, 0, 0, z) + +def test_tile_broadcast_grad(test, device): + + a = wp.array(np.arange(0, 5, dtype=np.float32), requires_grad=True) + b = wp.array(np.ones((5, 5), dtype=np.float32), requires_grad=True) + + with wp.Tape() as tape: + wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=32) + + b.grad = wp.ones_like(b) + tape.backward() + + assert_np_equal(a.grad.numpy(), np.ones(5)*5.0) # #----------------------------------------- # # center of mass computation @@ -597,7 +622,8 @@ class TestTile(unittest.TestCase): add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices) add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices) add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices) -add_function_test(TestTile, "test_tile_broadcast", test_tile_broadcast, devices=devices) +add_function_test(TestTile, "test_tile_broadcast_add", test_tile_broadcast_add, devices=devices) 
+add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad, devices=devices) if __name__ == "__main__": diff --git a/warp/types.py b/warp/types.py index 7e244863..9a0aa8a0 100644 --- a/warp/types.py +++ b/warp/types.py @@ -2982,7 +2982,7 @@ def ctype(self): if self.storage == "register": return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>" elif self.storage == "shared": - return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}, {self.strides[0]}, {self.strides[1]}>" + return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{self.strides[0]}, {self.strides[1]}>" else: raise RuntimeError(f"Unrecognized tile storage type {self.storage}") @@ -2995,15 +2995,17 @@ def cinit(self, adjoint=False): elif self.storage == "shared": # if this is a reference to another tile # then don't allocate any memory - if self.owner == False: - return "NULL" if adjoint: # backward pass requires zeroed memory - return f"wp::tile_alloc_zeros<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" + return f"wp::tile_alloc_zeros<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{self.strides[0]}, {self.strides[1]}, {Tile.alloc()}>()" else: - # forward mode can be uninitialized until first used by the kernel - return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" + if self.owner == False: + # will be initialized by subsequent call, e.g.: t = tile_broadcast(a) + return "NULL" + else: + # forward mode can be uninitialized until first used by the kernel + return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{Tile.alloc()}>()" # generate a unique allocation index for shared memory @classmethod From deafbe9c5db6f463f37de2af8941fe5713026882 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 1 Oct 2024 07:49:58 +0000 Subject: [PATCH 051/102] Add support for tiling vectors with gradients --- warp/builtins.py | 36 ++++++++++++++++------- warp/native/tile.h | 73 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 3747777c..19b1254c 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2144,11 +2144,17 @@ def tile_value_func(arg_types, arg_values): if len(arg_types) != 1: raise RuntimeError("tile() requires 1 positional arg") - # todo: we need a way to pass things like current compiler options - # into the value_func, for now we use a single global options dictionary - # we should ideally pass in the Adjoint object if it exists + dtype = None + length = None - return Tile(dtype=arg_types["x"], M=1, N=warp.codegen.options["block_dim"], op="Tile") + if type_is_vector(arg_types["x"]): + dtype = arg_types["x"]._wp_scalar_type_ + length = arg_types["x"]._shape_[0] + else: + dtype = arg_types["x"] + length = 1 + + return Tile(dtype=dtype, M=length, N=warp.codegen.options["block_dim"], op="tile") add_builtin( @@ -2160,8 +2166,11 @@ def tile_value_func(arg_types, arg_values): This function converts values computed using scalar kernel code to a tile representation for input into collective operations. + * If the input value is a scalar then the resulting tile has ``shape=(1, block_dim)`` + * If the input value is a vector then the resulting tile has ``shape=(length(vector), block_dim)`` + :param x: A per-thread local value, e.g.: scalar, vector, or matrix. 
- :returns: A tile with ``shape=(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch()``. + :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim`` This example shows how to create a linear sequence from thread variables: @@ -2179,9 +2188,10 @@ def compute(): .. code-block:: text - tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] + tile(m=1, n=16, storage=register) = [[0 2 4 6 8 ...]] + """, - group="Tile Primitives" "", + group="Tile Primitives", export=False, ) @@ -2201,10 +2211,13 @@ def untile_value_func(arg_types, arg_values): if t.N != warp.codegen.options["block_dim"]: raise RuntimeError( - f"until() argument must have the same length as the block width, got {t.N}, expected {warp.codegen.options['block_dim']}" + f"untile() argument must have the same length as the block width, got {t.N}, expected {warp.codegen.options['block_dim']}" ) - return t.dtype + if t.M == 1: + return t.dtype + elif t.M > 1: + return warp.types.vector(t.M, t.dtype) add_builtin( @@ -2216,6 +2229,9 @@ def untile_value_func(arg_types, arg_values): This function converts a block-wide tile back to per-thread values. + * If the input tile is 1-dimensional then the resulting value will be a per-thread scalar + * If the input tile is 2-dimensional then the the resulting value will be a per-thread vector of length M + :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile @@ -2248,7 +2264,7 @@ def compute(): 8 ... """, - group="Tile Primitives" "", + group="Tile Primitives", export=False, ) diff --git a/warp/native/tile.h b/warp/native/tile.h index e8b9bd8d..cd25c674 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -775,16 +775,42 @@ inline CUDA_CALLABLE auto tile(const T& x) return result; } +// overload for constructing a tile from a per-thread vector +template +inline CUDA_CALLABLE auto tile(const wp::vec_t& x) +{ + tile_register_t result; + + static_assert(result.NumRegs == Length); + + for (int i=0; i < Length; ++i) + result.data[i] = x[i]; + + return result; +} // construct a tile from a local SIMT value (one per-thread) template -inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, const AdjTile& adj_ret) +inline CUDA_CALLABLE void adj_tile(const T& x, T& adj_x, AdjTile& adj_ret) { static_assert(AdjTile::M == 1); static_assert(AdjTile::N == WP_TILE_BLOCK_DIM); - static_assert(AdjTile::NumRegs == 1); + + auto adj_reg = adj_ret.copy_to_register(); + + adj_x += adj_reg.data[0]; +} + +template +inline CUDA_CALLABLE void adj_tile(const wp::vec_t& x, wp::vec_t& adj_x, AdjTile& adj_ret) +{ + static_assert(AdjTile::M == Length); + static_assert(AdjTile::N == WP_TILE_BLOCK_DIM); + + auto adj_reg = adj_ret.copy_to_register(); - adj_x += adj_ret.data[0]; + for (int i=0; i < Length; ++i) + adj_x[i] += adj_reg.data[i]; } template @@ -793,16 +819,45 @@ inline CUDA_CALLABLE auto untile(Tile& tile) // code-gen should have set the tile to // have exactly the block dimension so // there is exactly one value per-thread - static_assert(Tile::NumRegs == 1); + auto reg = tile.copy_to_register(); - return tile.copy_to_register().data[0]; + // scalar case + if constexpr(Tile::M == 1) + { + return reg.data[0]; + } + + // vector case + if constexpr(Tile::M > 1) + { + wp::vec_t v; + for (int i=0; i < Tile::M; ++i) + v[i] = reg.data[i]; + + return v; + } } -template -inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, typename 
Tile::Type& adj_ret) + + +template +inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, Value& adj_ret) { - auto adj = adj_tile.copy_to_register(); - adj.data[0] += adj_ret; + auto adj = adj_tile.copy_to_register(); + + // scalar case + if constexpr(Tile::M == 1) + { + adj.data[0] += adj_ret; + } + + // vector case + if constexpr(Tile::M > 1) + { + for (int i=0; i < Tile::M; ++i) + adj.data[i] = adj_ret[i]; + } + adj_tile.assign(adj); } From a947d5b657a327e7a04816cd329873598ed528f8 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 1 Oct 2024 08:42:52 +0000 Subject: [PATCH 052/102] Fix up tile reduce unit tests --- warp/tests/test_tile_reduce.py | 76 +++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index f0b60d86..5e48b62f 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -34,7 +34,7 @@ def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float a = wp.tile_load(input, i, j, m=1, n=TILE_DIM) s += wp.tile_sum(a) * 0.5 - wp.tile_store(output, i, 0, s) + wp.tile_store(output, i, s) def test_tile_reduce_sum(test, device): @@ -73,7 +73,7 @@ def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM) m = wp.tile_min(a) - wp.tile_store(output, i, 0, m) + wp.tile_store(output, i, m) def test_tile_reduce_min(test, device): @@ -106,7 +106,7 @@ def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM) m = wp.tile_max(a) - wp.tile_store(output, i, 0, m) + wp.tile_store(output, i, m) def test_tile_reduce_max(test, device): @@ -139,7 +139,7 @@ def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(d a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM) m = wp.tile_reduce(wp.mul, a) - wp.tile_store(output, i, 0, m) + wp.tile_store(output, i, m) def test_tile_reduce_custom(test, device): @@ -173,10 +173,10 @@ def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dty # output tile index i = wp.tid() - a = wp.tile_load(input, i, 0, m=TILE_M, n=TILE_N) + a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N) s = wp.tile_sum(a) * 0.5 - wp.tile_store(output, i, 0, s) + wp.tile_store(output, i, s) def test_tile_reduce_grouped_sum(test, device): @@ -256,6 +256,60 @@ def test_tile_untile(test, device): assert_np_equal(output.numpy(), np.arange(N) * 2) +@wp.kernel +def tile_untile_scalar_kernel(output: wp.array(dtype=int)): + # thread index + i = wp.tid() + + # convert to block wide tile + t = wp.tile(i) * 2 + s = wp.untile(t) + + output[i] = s + + +def test_tile_untile_scalar(test, device): + # use an unaligned grid dimension + N = TILE_DIM * 4 + 5 + + output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device) + + with wp.Tape() as tape: + wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device) + + assert_np_equal(output.numpy(), np.arange(N) * 2) + + + +@wp.kernel +def test_untile_vector_kernel( + input: wp.array(dtype=wp.vec3), + output: wp.array(dtype=wp.vec3)): + + i = wp.tid() + + v = input[i]*0.5 + + t = wp.tile(v) + u = wp.untile(t) + + output[i] = u*2.0 + +def test_tile_untile_vector(test, device): + + input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True) + output = wp.zeros_like(input) + + with wp.Tape() as tape: + wp.launch(test_untile_vector_kernel, dim=16, inputs=[input, output], 
block_dim=16) + + output.grad = wp.ones_like(output) + tape.backward() + + assert_np_equal(output.numpy(), input.numpy()) + assert_np_equal(input.grad.numpy(), np.ones((16, 3))) + + @wp.kernel def tile_ones_kernel(out: wp.array(dtype=float)): i = wp.tid() @@ -263,11 +317,12 @@ def tile_ones_kernel(out: wp.array(dtype=float)): t = wp.tile_ones(dtype=float, m=16, n=16) s = wp.tile_sum(t) - wp.tile_store(out, 0, 0, s) + wp.tile_store(out, 0, s) def test_tile_ones(test, device): - output = wp.zeros(shape=1, dtype=float, device=device) + + output = wp.zeros(1, dtype=float, device=device) with wp.Tape() as tape: wp.launch_tiled(tile_ones_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device) @@ -316,8 +371,9 @@ class TestTileReduce(unittest.TestCase): add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices) add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices) add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices) -add_function_test(TestTileReduce, "test_tile_untile", test_tile_untile, devices=devices) +add_function_test(TestTileReduce, "test_tile_untile_scalar", test_tile_untile_scalar, devices=devices) +add_function_test(TestTileReduce, "test_tile_untile_vector", test_tile_untile_vector, devices=devices) if __name__ == "__main__": wp.clear_kernel_cache() - unittest.main(verbosity=2) + unittest.main(verbosity=2, failfast=True) From 34f44c0f470160d3d212a46b434202bdbafb5f2c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 1 Oct 2024 19:05:45 +0000 Subject: [PATCH 053/102] Update GEMM example --- warp/examples/tile/example_tile_matmul.py | 80 +++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 warp/examples/tile/example_tile_matmul.py diff --git a/warp/examples/tile/example_tile_matmul.py b/warp/examples/tile/example_tile_matmul.py new file mode 100644 index 00000000..881396f9 --- /dev/null +++ b/warp/examples/tile/example_tile_matmul.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +########################################################################### +# Example Tile MatMul +# +# Shows how to write a simple GEMM kernel using Warp tile primitives. 
+# +########################################################################### + +import numpy as np +import warp as wp + +# tile size +TILE_M = wp.constant(8) +TILE_N = wp.constant(4) +TILE_K = wp.constant(8) + +# num threads per-tile +TILE_THREADS = 64 + +@wp.kernel +def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): + + # output tile index + i, j = wp.tid() + + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + + M = A.shape[0] + N = B.shape[1] + K = A.shape[1] + + count = int(K / TILE_K) + + for k in range(0, count): + a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) + + # sum += a*b + wp.tile_matmul(a, b, sum) + + wp.tile_store(C, i, j, sum) + + + +if __name__ == "__main__": + + wp.set_device("cuda:0") + + # generate some tile aligned matrix dimensions + M = TILE_M * 7 + K = TILE_K * 6 + N = TILE_N * 5 + + rng = np.random.default_rng(42) + A = rng.random((M, K), dtype=np.float32) + B = rng.random((K, N), dtype=np.float32) + C = np.zeros((M, N), dtype=np.float32) + + A_wp = wp.array(A, requires_grad=True) + B_wp = wp.array(B, requires_grad=True) + C_wp = wp.array(C, requires_grad=True) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_gemm, + dim=(int(M / TILE_M), int(N / TILE_N)), + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_THREADS) + + assert(np.allclose(C_wp.numpy(), A@B)) + + print("Example matrix multiplication passed") + + From 8caa97b590afecfdfe53afca952b2915e4875bc5 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 1 Oct 2024 12:05:57 -0700 Subject: [PATCH 054/102] Update tile branch with Warp 1.4.0 --- .github/workflows/ci.yml | 8 +- .github/workflows/sphinx.yml | 6 +- .gitlab-ci.yml | 14 +- .pre-commit-config.yaml | 2 +- CHANGELOG.md | 46 +- README.md | 11 +- VERSION.md | 2 +- docs/basics.rst | 4 + docs/codegen.rst | 1047 +++++++++++++++++++ docs/configuration.rst | 2 + docs/index.rst | 7 +- docs/installation.rst | 7 +- docs/modules/contribution_guide.rst | 333 ++++++ docs/modules/differentiability.rst | 1 + docs/modules/functions.rst | 129 ++- docs/modules/interoperability.rst | 181 ++++ docs/modules/runtime.rst | 78 +- docs/modules/sim.rst | 3 + docs/requirements.txt | 8 +- exts/omni.warp.core/config/extension.toml | 3 +- exts/omni.warp.core/docs/CHANGELOG.md | 68 ++ exts/omni.warp/config/extension.toml | 4 +- exts/omni.warp/docs/CHANGELOG.md | 68 ++ warp/__init__.py | 6 + warp/builtins.py | 107 +- warp/codegen.py | 280 ++++- warp/config.py | 2 +- warp/context.py | 14 +- warp/dlpack.py | 2 + warp/examples/benchmarks/benchmark.bat | 2 + warp/examples/benchmarks/benchmark.sh | 2 + warp/examples/benchmarks/benchmark_cloth.py | 10 + warp/examples/sim/example_cloth.py | 56 +- warp/native/mat.h | 6 + warp/native/quat.h | 8 + warp/native/spatial.h | 6 + warp/paddle.py | 382 +++++++ warp/sim/integrator_euler.py | 20 +- warp/sim/integrator_featherstone.py | 4 +- warp/sim/integrator_vbd.py | 435 +++++++- warp/sim/model.py | 14 +- warp/stubs.py | 207 +++- warp/tape.py | 2 + warp/tests/test_array.py | 20 + warp/tests/test_codegen.py | 33 +- warp/tests/test_dlpack.py | 118 +++ warp/tests/test_implicit_init.py | 49 + warp/tests/test_paddle.py | 852 +++++++++++++++ warp/tests/test_static.py | 412 ++++++++ warp/tests/test_tile.py | 66 +- warp/tests/test_tile_reduce.py | 13 +- warp/tests/test_torch.py | 24 + warp/tests/test_types.py | 2 +- warp/thirdparty/dlpack.py | 4 +- warp/types.py | 18 +- 55 files changed, 4904 insertions(+), 304 deletions(-) create mode 100644 docs/codegen.rst 
create mode 100644 docs/modules/contribution_guide.rst create mode 100644 warp/paddle.py create mode 100644 warp/tests/test_paddle.py create mode 100644 warp/tests/test_static.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e4a87d4..2a05aa0e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -232,6 +232,7 @@ jobs: pull-request-docs: runs-on: ubuntu-latest + needs: build-warp-ubuntu if: ${{ github.event_name == 'pull_request' }} outputs: artifact-url: ${{ steps.build-docs-output.outputs.artifact-url }} @@ -242,10 +243,15 @@ jobs: uses: actions/setup-python@v5 with: python-version: "3.10" + - name: Download Warp binaries + uses: actions/download-artifact@v4 + with: + name: build-artifact-ubuntu + path: warp/bin/ - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r docs/requirements.txt + pip install --no-cache-dir -r docs/requirements.txt - name: Build Sphinx documentation run: python build_docs.py - name: Upload artifacts diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index f649cd39..18413d26 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -29,7 +29,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r docs/requirements.txt + pip install --no-cache-dir -r docs/requirements.txt + - name: Build Warp without CUDA Support + run: python build_lib.py - name: Build Sphinx documentation run: python build_docs.py - name: Upload artifacts @@ -46,7 +48,7 @@ jobs: mv docs/_build/html/* . mv docs/_build/html/.nojekyll . mv docs/_build/html/.buildinfo . - rm -rf docs warp + rm -rf docs warp _build __pycache__ git add . .nojekyll .buildinfo git commit -m "Deploy Sphinx documentation" git push -f origin gh-pages diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ea8ae21c..554b9273 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -119,7 +119,7 @@ ruff lint: - .runner-utility-linux-x86_64 before_script: - python -m pip install --upgrade pip - - pip install --upgrade ruff==0.5.5 + - pip install --upgrade ruff==0.6.8 script: - ruff check --output-format full --exit-zero # Just to get something in the log - ruff check --output-format gitlab > gl-code-quality-report.json @@ -135,7 +135,7 @@ ruff format: - .runner-utility-linux-x86_64 before_script: - python -m pip install --upgrade pip - - pip install --upgrade ruff==0.5.5 + - pip install --upgrade ruff==0.6.8 script: - ruff format --diff @@ -397,7 +397,7 @@ linux-x86_64 test warp-init: # artifacts. 
windows-x86_64 docs: stage: test - needs: [] + needs: [windows-x86_64 build] extends: - .runner-utility-windows-x86_64 artifacts: @@ -407,7 +407,7 @@ windows-x86_64 docs: - !reference [.snippets, define-powershell-GetTime] - Write-Output "$([char]27)[0Ksection_start:$(GetTime):install_dependencies[collapsed=true]$([char]13)$([char]27)[0KInstalling dependencies" - powershell -command "Get-Volume | Format-Table -AutoSize" - - $python_name = $DEFAULT_PYTHON + "-windows-x86_64" + - $python_name = "3.12.6+nv1-windows-x86_64" - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv' - .\_venv\Scripts\Activate.ps1 @@ -696,7 +696,7 @@ publish wheels to gitlab package registry: .build-docs-common: stage: deploy image: python:3.11-slim - needs: [] + needs: [linux-x86_64 build] extends: - .runner-utility-linux-x86_64 artifacts: @@ -704,7 +704,11 @@ publish wheels to gitlab package registry: - public before_script: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KSet up docs environment" + - df -h - apt-get update && apt-get install make --no-install-recommends -y + # Move compiled binaries out of platform-specific directory + - mv warp/bin/linux-x86_64/warp.so warp/bin/ + - mv warp/bin/linux-x86_64/warp-clang.so warp/bin/ - python -m pip install --upgrade pip - python -m pip install -r docs/requirements.txt - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b263a09..fd7faf1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.5.0 + rev: v0.6.8 hooks: # Run the linter. - id: ruff diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d8450ce..1ac1e54d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,61 +1,72 @@ # CHANGELOG -## [Unreleased] - 2024-?? +## [1.4.0] - 2024-10-01 ### Added -- Support for fp64 `atomic_add`, `atomic_max`, and `atomic_min` ([GH-284](https://github.com/NVIDIA/warp/issues/284)). +- Support for a new `wp.static(expr)` function that allows arbitrary Python expressions to be evaluated at the time of + function/kernel definition ([docs](https://nvidia.github.io/warp/codegen.html#static-expressions)). - Support for stream priorities to hint to the device that it should process pending work in high-priority streams over pending work in low-priority streams when possible ([docs](https://nvidia.github.io/warp/modules/concurrency.html#stream-priorities)). +- Adaptive sparse grid geometry to `warp.fem` ([docs](https://nvidia.github.io/warp/modules/fem.html#adaptivity)). +- Support for defining `wp.kernel` and `wp.func` objects from within closures. +- Support for defining multiple versions of kernels, functions, and structs without manually assigning unique keys. +- Support for default argument values for user functions decorated with `wp.func`. +- Allow passing custom launch dimensions to `jax_kernel()` ([GH-310](https://github.com/NVIDIA/warp/pull/310)). +- JAX interoperability examples for sharding and matrix multiplication ([docs](https://nvidia.github.io/warp/modules/interoperability.html#using-shardmap-for-distributed-computation)). +- Interoperability support for the PaddlePaddle ML framework ([GH-318](https://github.com/NVIDIA/warp/pull/318)). - Support `wp.mod()` for vector types ([GH-282](https://github.com/NVIDIA/warp/issues/282)). 
- Expose the modulo operator `%` to Python's runtime scalar and vector types. -- Support for local vec/mat/quat component gradient tracking in backwards mode. +- Support for fp64 `atomic_add`, `atomic_max`, and `atomic_min` ([GH-284](https://github.com/NVIDIA/warp/issues/284)). - Support for quaternion indexing (e.g. `q.w`). -- Support for default argument values for user functions decorated with `wp.func`. - Support shadowing builtin functions ([GH-308](https://github.com/NVIDIA/warp/issues/308)). -- Allow passing custom launch dimensions to `jax_kernel()` ([GH-310](https://github.com/NVIDIA/warp/pull/310)). -- Jax interoperability examples for sharding and matrix multiplication (see Interoperability documentation). -- Include all non-hidden builtins in the stub file. -- Adaptive sparse grid geometry to `warp.fem` ([docs](https://nvidia.github.io/warp/modules/fem.html#adaptivity)). -- Improve accuracy of symmetric eigenvalues routine in `warp.fem`. -- Support for `wp.kernel` and `wp.func` closures. -- Support for defining multiple versions of kernels, functions, and structs without manually assigning unique keys. - Support for redefining function overloads. - Add an ocean sample to the `omni.warp` extension. +- `warp.sim.VBDIntegrator` now supports body-particle collision. +- Add a [contributing guide](https://nvidia.github.io/warp/modules/contribution_guide.html) to the Sphinx docs . +- Add documentation for dynamic code generation ([docs](https://nvidia.github.io/warp/codegen.html#dynamic-kernel-creation)). ### Changed -- **Breaking:** Rename function `plot_kernel_jacobians` to `jacobian_plot` in `autograd` module. - `wp.sim.Model.edge_indices` now includes boundary edges. - Unexposed `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` from the Python scope. - Skip unused functions in module code generation, improving performance. - Avoid reloading modules if their content does not change, improving performance. - `wp.Mesh.points` is now a property instead of a raw data member, its reference can be changed after the mesh is initialized. +- Improve error message when invalid objects are referenced in a Warp kernel. +- `if`/`else`/`elif` statements with constant conditions are resolved at compile time with no branches being inserted in the generated code. +- Include all non-hidden builtins in the stub file. +- Improve accuracy of symmetric eigenvalues routine in `warp.fem`. ### Fixed - Fix for `wp.func` erroring out when defining a `Tuple` as a return type hint ([GH-302](https://github.com/NVIDIA/warp/issues/302)). -- Fix array in-place op (`+=`, `-=`) adjoints to compute gradients correctly in the backwards pass. +- Fix array in-place op (`+=`, `-=`) adjoints to compute gradients correctly in the backwards pass +- Fix vector, matrix in-place assignment adjoints to compute gradients correctly in the backwards pass, e.g.: `v[1] = x` - Fix a bug in which Python docstrings would be created as local function variables in generated code. -- Fix a rare crash during error reporting on some systems. - Fix a bug with autograd array access validation in functions from different modules. - Fix a rare crash during error reporting on some systems due to glibc mismatches. - Handle `--num_tiles 1` in `example_render_opengl.py` ([GH-306](https://github.com/NVIDIA/warp/issues/306)). +- Fix the computation of body contact forces in `FeatherstoneIntegrator` when bodies and particles collide. 
- Fix bug in `FeatherstoneIntegrator` where `eval_rigid_jacobian` could give incorrect results or reach an infinite loop when the body and joint indices were not in the same order. Added `Model.joint_ancestor` to fix the indexing from a joint to its parent joint in the articulation. -- Add a workaround for `__threadfence()` issues in the Compute Sanitizer initcheck tool. +- Fix wrong vertex index passed to `add_edges()` called from `ModelBuilder.add_cloth_mesh()` ([GH-319](https://github.com/NVIDIA/warp/issues/319)). +- Add a workaround for uninitialized memory read warning in the `compute-sanitizer` initcheck tool when using `wp.Mesh`. - Fix name clashes when Warp functions and structs are returned from Python functions multiple times. - Fix name clashes between Warp functions and structs defined in different modules. - Fix code generation errors when overloading generic kernels defined in a Python function. -- Fix some bugs related to module hashing and caching. - Fix issues with unrelated functions being treated as overloads (e.g., closures). - Fix handling of `stream` argument in `array.__dlpack__()`. - Fix a bug related to reloading CPU modules. - Fix a crash when kernel functions are not found in CPU modules. - Fix conditions not being evaluated as expected in `while` statements. - Fix printing Boolean and 8-bit integer values. +- Fix array interface type strings used for Boolean and 8-bit integer values. +- Fix initialization error when setting struct members. +- Fix Warp not being initialized upon entering a `wp.Tape` context. +- Use `kDLBool` instead of `kDLUInt` for DLPack interop of Booleans. ## [1.3.3] - 2024-09-04 @@ -1119,7 +1130,8 @@ - Initial publish for alpha testing -[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.3.3...HEAD +[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.4.0...HEAD +[1.4.0]: https://github.com/NVIDIA/warp/releases/tag/v1.4.0 [1.3.3]: https://github.com/NVIDIA/warp/releases/tag/v1.3.3 [1.3.2]: https://github.com/NVIDIA/warp/releases/tag/v1.3.2 [1.3.1]: https://github.com/NVIDIA/warp/releases/tag/v1.3.1 diff --git a/README.md b/README.md index 44c5d326..54c1bbfd 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ regular Python functions and JIT compiles them to efficient kernel code that can Warp is designed for [spatial computing](https://en.wikipedia.org/wiki/Spatial_computing) and comes with a rich set of primitives that make it easy to write programs for physics simulation, perception, robotics, and geometry processing. In addition, Warp kernels -are differentiable and can be used as part of machine-learning pipelines with frameworks such as PyTorch and JAX. +are differentiable and can be used as part of machine-learning pipelines with frameworks such as PyTorch, JAX and Paddle. Please refer to the project [Documentation](https://nvidia.github.io/warp/) for API and language reference and [CHANGELOG.md](./CHANGELOG.md) for release history. @@ -45,9 +45,9 @@ the `pip install` command, e.g. 
| Platform | Install Command | | --------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| Linux aarch64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.3.3/warp_lang-1.3.3+cu11-py3-none-manylinux2014_aarch64.whl` | -| Linux x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.3.3/warp_lang-1.3.3+cu11-py3-none-manylinux2014_x86_64.whl` | -| Windows x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.3.3/warp_lang-1.3.3+cu11-py3-none-win_amd64.whl` | +| Linux aarch64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_aarch64.whl` | +| Linux x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_x86_64.whl` | +| Windows x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-win_amd64.whl` | The `--force-reinstall` option may need to be used to overwrite a previous installation. @@ -372,7 +372,8 @@ Warp is provided under the NVIDIA Software License, please see [LICENSE.md](./LI Contributions and pull requests from the community are welcome and are taken under the terms described in the **Feedback** section of [LICENSE.md](LICENSE.md#9-feedback). -[CONTRIBUTING.md](./CONTRIBUTING.md) provides additional information on how to open a pull request for Warp. +Please see the [Contribution Guide](https://nvidia.github.io/warp/modules/contribution_guide.html) for more +information on contributing to the development of Warp. ## Citing diff --git a/VERSION.md b/VERSION.md index 31e5c843..88c5fb89 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1 +1 @@ -1.3.3 +1.4.0 diff --git a/docs/basics.rst b/docs/basics.rst index 2b042edf..2fb1b880 100644 --- a/docs/basics.rst +++ b/docs/basics.rst @@ -3,6 +3,8 @@ Basics .. currentmodule:: warp +.. _warp-initialization: + Initialization -------------- @@ -273,6 +275,8 @@ less time to load since code compilation is skipped: step took 0.04 ms render took 5.05 ms +For more information, see the :doc:`codegen` section. + Language Details ---------------- diff --git a/docs/codegen.rst b/docs/codegen.rst new file mode 100644 index 00000000..fe5ed81b --- /dev/null +++ b/docs/codegen.rst @@ -0,0 +1,1047 @@ +.. _code_generation: + +Code Generation +=============== + +Overview +-------- + +Warp kernels are grouped together by Python module. Before they can run on a device, they must be translated and compiled for the device architecture. All kernels in a module are compiled together, which is faster than compiling each one individually. When a kernel is launched, Warp checks if the module is up-to-date and will compile it if needed. Adding new kernels to a module at runtime modifies the module, which means that it will need to be reloaded on next launch. + +.. code:: python + + @wp.kernel + def kernel_foo(): + print("foo") + + wp.launch(kernel_foo, dim=1) + + @wp.kernel + def kernel_bar(): + print("bar") + + wp.launch(kernel_bar, dim=1) + +In the snippet above, kernel definitions are interspersed with kernel launches. To execute ``kernel_foo``, the module is compiled during the first launch. Defining ``kernel_bar`` modifies the module, so it needs to be recompiled during the second launch: + +.. 
code:: text + + Module __main__ 6cd1d53 load on device 'cuda:0' took 168.19 ms (compiled) + foo + Module __main__ c7c0e9a load on device 'cuda:0' took 160.35 ms (compiled) + bar + +The compilation can take a long time for modules with numerous complex kernels, so Warp caches the compiled modules and can reuse them on the next run of the program: + +.. code:: text + + Module __main__ 6cd1d53 load on device 'cuda:0' took 4.97 ms (cached) + foo + Module __main__ c7c0e9a load on device 'cuda:0' took 0.40 ms (cached) + bar + +Loading cached modules is much faster, but it's not free. In addition, module reloading can cause problems during CUDA graph capture, so there are good reasons to try to avoid it. + +The best way to avoid module reloading is to define all the kernels before launching any of them. This way, the module will be compiled only once: + +.. code:: python + + @wp.kernel + def kernel_foo(): + print("foo") + + @wp.kernel + def kernel_bar(): + print("bar") + + wp.launch(kernel_foo, dim=1) + wp.launch(kernel_bar, dim=1) + +.. code:: text + + Module __main__ c7c0e9a load on device 'cuda:0' took 174.57 ms (compiled) + foo + bar + +On subsequent runs it will be loaded from the kernel cache only once: + +.. code:: text + + Module __main__ c7c0e9a load on device 'cuda:0' took 4.96 ms (cached) + foo + bar + +Warp tries to recognize duplicate kernels to avoid unnecessary module reloading. For example, this program creates kernels in a loop, but they are always identical, so the module does not need to be recompiled on every launch: + +.. code:: python + + for i in range(3): + + @wp.kernel + def kernel_hello(): + print("hello") + + wp.launch(kernel_hello, dim=1) + +Warp filters out the duplicate kernels, so the module is only loaded once: + +.. code:: text + + Module __main__ 8194f57 load on device 'cuda:0' took 178.24 ms (compiled) + hello + hello + hello + + +Warp generates C++/CUDA source code for CPU/GPU and stores the .cpp/.cu source files under the module directories of the kernel cache. +The kernel cache folder path is printed during the :ref:`Warp initialization ` and +can be retrieved after Warp has been initialized from the ``warp.config.kernel_cache_dir`` :ref:`configuration setting `. + +Consider the following example: + +.. code:: python + + @wp.func + def my_func(a: float, b: float): + c = wp.sin(b) * a + return c + +The resulting CUDA code looks similar to this: + +.. code:: cpp + + // example.py:5 + static CUDA_CALLABLE wp::float32 my_func_0( + wp::float32 var_a, + wp::float32 var_b) + { + //--------- + // primal vars + wp::float32 var_0; + wp::float32 var_1; + //--------- + // forward + // def my_func(a: float, b: float): + // c = wp.sin(b) * a + var_0 = wp::sin(var_b); + var_1 = wp::mul(var_0, var_a); + // return c + return var_1; + } + +The generated code follows `static-single-assignment (SSA) form `__. +To ease the readability, comments referring to the original Python source code lines are inserted. +Besides the forward pass, the gradient function is also generated, and, +if a :ref:`custom replay function ` is provided, the replay function is generated as well. + +Warp passes the generated source code to native compilers (e.g., LLVM for CPU and NVRTC for CUDA) to produce executable code that is invoked when launching kernels. + +.. _external_references: + +External References and Constants +--------------------------------- + +A Warp kernel can access regular Python variables defined outside of the kernel itself, as long as those variables are of a supported type. 
Such external references are treated as compile-time constants in the kernel. It's not possible for code running on a different device to access the state of the Python interpreter, so these variables are folded into the kernels by value: + +.. code:: python + + C = 42 + + @wp.kernel + def k(): + print(C) + + wp.launch(k, dim=1) + +During code generation, the external variable ``C`` becomes a constant: + +.. code:: c++ + + { + //--------- + // primal vars + const wp::int32 var_0 = 42; + //--------- + // forward + // def k(): + // print(C) + wp::print(var_0); + } + + +Supported Constant Types +~~~~~~~~~~~~~~~~~~~~~~~~ + +Only value types can be used as constants in Warp kernels. This includes integers, floating point numbers, vectors (``wp.vec*``), matrices (``wp.mat*``) and other built-in math types. Attempting to capture other variables types will result in an exception: + +.. code:: python + + global_array = wp.zeros(5, dtype=int) + + @wp.kernel + def k(): + tid = wp.tid() + global_array[tid] = 42 # referencing external arrays is not allowed! + + wp.launch(k, dim=global_array.shape, inputs=[]) + +Output: + +.. code:: text + + TypeError: Invalid external reference type: + +The reason why arrays cannot be captured is because they exist on a particular device and contain pointers to the device memory, which would make the kernel not portable across different devices. Arrays should always be passed as kernel inputs. + + +Usage of ``wp.constant()`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In older versions of Warp, ``wp.constant()`` was required to declare constants that can be used in a kernel. This is no longer necessary, but the old syntax is still supported for backward compatibility. ``wp.constant()`` can still be used to check if a value can be referenced in a kernel: + +.. code:: python + + x = wp.constant(17.0) # ok + v = wp.constant(wp.vec3(1.0, 2.0, 3.0)) # ok + a = wp.constant(wp.zeros(n=5, dtype=int)) # error, invalid constant type + + @wp.kernel + def k(): + tid = wp.tid() + a[tid] = x * v + +In this snippet, a ``TypeError`` will be raised when declaring the array with ``wp.constant()``. If ``wp.constant()`` was omitted, the error would be raised later during code generation, which might be slightly harder to debug. + + +Updating Constants +~~~~~~~~~~~~~~~~~~ + +One limitation of using external variables in Warp kernels is that Warp doesn't know when the value is modified: + +.. code:: python + + C = 17 + + @wp.kernel + def k(): + print(C) + + wp.launch(k, dim=1) + + # redefine constant + C = 42 + + wp.launch(k, dim=1) + +This prints: + +.. code:: text + + Module __main__ 4494df2 load on device 'cuda:0' took 163.54 ms (compiled) + 17 + 17 + +During the first launch of kernel ``k``, the kernel is compiled using the existing value of ``C`` (17). Since ``C`` is just a plain Python variable, Warp has no way of detecting when it is modified. Thus on the second launch the old value is printed again. + +One way to get around this limitation is to tell Warp that the module was modified: + +.. code:: python + + C = 17 + + @wp.kernel + def k(): + print(C) + + wp.launch(k, dim=1) + + # redefine constant + C = 42 + + # tell Warp that the module was modified + k.module.mark_modified() + + wp.launch(k, dim=1) + +This produces the updated output: + +.. 
code:: text + + Module __main__ 4494df2 load on device 'cuda:0' took 167.92 ms (compiled) + 17 + Module __main__ 9a0664f load on device 'cuda:0' took 164.83 ms (compiled) + 42 + +Notice that calling ``module.mark_modified()`` caused the module to be recompiled on the second launch using the latest value of ``C``. + +.. note:: + The ``Module`` class and the ``mark_modified()`` method are considered internal. A public API for working with modules is planned, but currently it is subject to change without notice. Programs should not overly rely on the ``mark_modified()`` method, but it can be used in a pinch. + + +.. _static_expressions: + +Static Expressions +------------------ + +We often encounter situations where a kernel needs to be specialized for a given input or where certain parts of the code are static by the time the code is executed. +With static expressions, we can write Python expressions to be evaluated at the time of declaring a Warp function or kernel. + +``wp.static(...)`` expressions allow the user to run arbitrary Python code at the time the Warp function or kernel containing the expression is defined. +:func:`wp.static(expr) ` accepts a Python expression and replaces it with the result. +Note that the expression can only access variables that can be evaluated at the time the expression is declared. +This includes global variables and variables captured in a closure in which the Warp function or kernel is defined. +Additionally, Warp constants from within the kernel or function can be accessed, such as the constant iteration variable for static for-loops (i.e. when the range is known at the time of code generation). + +The result from ``wp.static()`` must be a non-null value of one of the following types: + +- A Warp function +- A string +- Any type that is supported by Warp inside kernels (e.g. scalars, structs, matrices, vectors, etc.), excluding Warp arrays or structs containing Warp arrays + +Example: Static Math Expressions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + import warp as wp + import scipy.linalg + + @wp.kernel + def my_kernel(): + static_var = wp.static(3 + 2) + # we can call arbitrary Python code inside wp.static() + static_norm = wp.static(wp.float64(scipy.linalg.norm([3, 4]))) + wp.printf("static_var = %i\n", static_var) + wp.printf("static_norm = %f\n", static_norm) + + wp.launch(my_kernel, 1) + +The static expressions are evaluated at the time of when the ``@wp.kernel`` decorator is evaluated and replaced in the code by their respective constant result values. The generated code will therefore contain the results of the expressions hard-coded in the source file (shown an abbreviated version): + +.. code:: cpp + + const wp::int32 var_0 = 5; + const wp::float64 var_1 = 5.0; + const wp::str var_2 = "static_var = %i\n"; + const wp::str var_3 = "static_norm = %f\n"; + + // wp.printf("static_var = %i\n", static_var) + printf(var_2, var_0); + // wp.printf("static_norm = %f\n", static_norm) + printf(var_3, var_1); + + +Example: Static Conditionals +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If/else/elif conditions that are constant can be eliminated from the generated code by using ``wp.static()`` inside the branch condition to yield a constant boolean. +This can provide improved performance by avoiding branching and can be useful for generating specialized kernels: + +.. 
code:: python + + import warp as wp + + available_colors = {"red", "green", "blue"} + + @wp.kernel + def my_kernel(): + if wp.static("red" in available_colors): + print("red is available") + else: + print("red is not available") + +The global variable ``available_colors`` is known at the time of declaring the kernel and the generated code will contain only the branch that is taken: + +.. code:: cpp + + const wp::str var_1 = "red is available"; + wp::print(var_1); + +Example: Static Loop Unrolling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Static expressions can be used to unroll for-loops during code generation. We place ``wp.static()`` expressions inside the loop's ``range`` to yield static for-loops that can be unrolled. The iteration variable becomes a constant and can therefore be accessed from within a static expression in the loop body: + +.. code:: python + + import warp as wp + + def loop_limit(): + return 3 + + @wp.kernel + def my_kernel(): + for i in range(wp.static(loop_limit())): + static_i = wp.static(i) + wp.printf("i = %i\n", static_i) + + wp.launch(my_kernel, 1) + +The generated code will not contain the for-loop but instead the loop body will be repeated three times: + +.. code:: cpp + + const wp::int32 var_0 = 3; + const wp::int32 var_1 = 0; + const wp::int32 var_2 = 0; + const wp::str var_3 = "i = %i\n"; + const wp::int32 var_4 = 1; + const wp::int32 var_5 = 1; + const wp::str var_6 = "i = %i\n"; + const wp::int32 var_7 = 2; + const wp::int32 var_8 = 2; + const wp::str var_9 = "i = %i\n"; + printf(var_3, var_2); + printf(var_6, var_5); + printf(var_9, var_8); + +Example: Function Pointers +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``wp.static(...)`` may also return a Warp function. This can be useful to specialize a kernel or function based on information available at the time of declaring the Warp function or kernel, or to automatically generate overloads for different types. + +.. code:: python + + import warp as wp + + @wp.func + def do_add(a: float, b: float): + return a + b + + @wp.func + def do_sub(a: float, b: float): + return a - b + + @wp.func + def do_mul(a: float, b: float): + return a * b + + op_handlers = { + "add": do_add, + "sub": do_sub, + "mul": do_mul, + } + + inputs = wp.array([[1, 2], [3, 0]], dtype=wp.float32) + outputs = wp.empty(2, dtype=wp.float32) + + for op in op_handlers.keys(): + + @wp.kernel + def operate(input: wp.array(dtype=inputs.dtype, ndim=2), output: wp.array(dtype=wp.float32)): + tid = wp.tid() + a, b = input[tid, 0], input[tid, 1] + # retrieve the right function to use for the captured dtype variable + output[tid] = wp.static(op_handlers[op])(a, b) + + wp.launch(operate, dim=2, inputs=[inputs], outputs=[outputs]) + print(outputs.numpy()) + +The above program uses a static expression to select the right function given the captured ``op`` variable and prints the following output while compiling the module containing the ``operate`` kernel three times: + +.. code:: text + + [3. 3.] + [-1. 3.] + [2. 0.] + + +.. _dynamic_generation: + +Dynamic Kernel Creation +----------------------- + +It is often desirable to dynamically customize kernels with different constants, types, or functions. We can achieve this through runtime kernel specialization using Python closures. + +Kernel Closures +~~~~~~~~~~~~~~~ + +Constants +^^^^^^^^^ + +Warp allows references to external constants in kernels: + +.. 
code:: python + + def create_kernel_with_constant(constant): + @wp.kernel + def k(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] += constant + return k + + k1 = create_kernel_with_constant(17.0) + k2 = create_kernel_with_constant(42.0) + + a = wp.zeros(5, dtype=float) + + wp.launch(k1, dim=a.shape, inputs=[a]) + wp.launch(k2, dim=a.shape, inputs=[a]) + + print(a) + +Output: + +.. code:: text + + [59. 59. 59. 59. 59.] + + +Data Types +^^^^^^^^^^ + +Warp data types can also be captured in a closure. Here is an example of creating kernels that work with different vector dimensions: + +.. code:: python + + def create_kernel_with_dtype(vec_type): + @wp.kernel + def k(a: wp.array(dtype=vec_type)): + tid = wp.tid() + a[tid] += float(tid) * vec_type(1.0) + return k + + k2 = create_kernel_with_dtype(wp.vec2) + k4 = create_kernel_with_dtype(wp.vec4) + + a2 = wp.ones(3, dtype=wp.vec2) + a4 = wp.ones(3, dtype=wp.vec4) + + wp.launch(k2, dim=a2.shape, inputs=[a2]) + wp.launch(k4, dim=a4.shape, inputs=[a4]) + + print(a2) + print(a4) + +Output: + +.. code:: text + + [[1. 1.] + [2. 2.] + [3. 3.]] + [[1. 1. 1. 1.] + [2. 2. 2. 2.] + [3. 3. 3. 3.]] + + +Functions +^^^^^^^^^ + +Here's a kernel generator that's parameterized using different functions: + +.. code:: python + + def create_kernel_with_function(f): + @wp.kernel + def k(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] = f(a[tid]) + return k + + @wp.func + def square(x: float): + return x * x + + @wp.func + def cube(x: float): + return x * x * x + + k1 = create_kernel_with_function(square) + k2 = create_kernel_with_function(cube) + + a1 = wp.array([1, 2, 3, 4, 5], dtype=float) + a2 = wp.array([1, 2, 3, 4, 5], dtype=float) + + wp.launch(k1, dim=a1.shape, inputs=[a1]) + wp.launch(k2, dim=a2.shape, inputs=[a2]) + + print(a1) + print(a2) + +Output: + +.. code:: text + + [ 1. 4. 9. 16. 25.] + [ 1. 8. 27. 64. 125.] + + +Function Closures +~~~~~~~~~~~~~~~~~ + +Warp functions (``@wp.func``) also support closures, just like kernels: + +.. code:: python + + def create_function_with_constant(constant): + @wp.func + def f(x: float): + return constant * x + return f + + f1 = create_function_with_constant(2.0) + f2 = create_function_with_constant(3.0) + + @wp.kernel + def k(a: wp.array(dtype=float)): + tid = wp.tid() + x = float(tid) + a[tid] = f1(x) + f2(x) + + a = wp.ones(5, dtype=float) + + wp.launch(k, dim=a.shape, inputs=[a]) + + print(a) + +Output: + +.. code:: text + + [ 0. 5. 10. 15. 20.] + + +We can also create related function and kernel closures together like this: + +.. code:: python + + def create_fk(a, b): + @wp.func + def f(x: float): + return a * x + + @wp.kernel + def k(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] = f(a[tid]) + b + + return f, k + + # create related function and kernel closures + f1, k1 = create_fk(2.0, 3.0) + f2, k2 = create_fk(4.0, 5.0) + + # use the functions separately in a new kernel + @wp.kernel + def kk(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] = f1(a[tid]) + f2(a[tid]) + + a1 = wp.array([1, 2, 3, 4, 5], dtype=float) + a2 = wp.array([1, 2, 3, 4, 5], dtype=float) + ak = wp.array([1, 2, 3, 4, 5], dtype=float) + + wp.launch(k1, dim=a1.shape, inputs=[a1]) + wp.launch(k2, dim=a2.shape, inputs=[a2]) + wp.launch(kk, dim=ak.shape, inputs=[ak]) + + print(a1) + print(a2) + print(ak) + +Output: + +.. code:: text + + [ 5. 7. 9. 11. 13.] + [ 9. 13. 17. 21. 25.] + [ 6. 12. 18. 24. 30.] + + +Dynamic Structs +~~~~~~~~~~~~~~~ + +Sometimes it's useful to customize Warp structs with different data types. 
+ +Customize Precision +^^^^^^^^^^^^^^^^^^^ + +For example, we can create structs with different floating point precision: + +.. code:: python + + def create_struct_with_precision(dtype): + @wp.struct + class S: + a: dtype + b: dtype + return S + + # create structs with different floating point precision + S16 = create_struct_with_precision(wp.float16) + S32 = create_struct_with_precision(wp.float32) + S64 = create_struct_with_precision(wp.float64) + + s16 = S16() + s32 = S32() + s64 = S64() + + s16.a, s16.b = 2.0001, 3.0000002 + s32.a, s32.b = 2.0001, 3.0000002 + s64.a, s64.b = 2.0001, 3.0000002 + + # create a generic kernel that works with the different types + @wp.kernel + def k(s: Any, output: wp.array(dtype=Any)): + tid = wp.tid() + x = output.dtype(tid) + output[tid] = x * s.a + s.b + + a16 = wp.empty(5, dtype=wp.float16) + a32 = wp.empty(5, dtype=wp.float32) + a64 = wp.empty(5, dtype=wp.float64) + + wp.launch(k, dim=a16.shape, inputs=[s16, a16]) + wp.launch(k, dim=a32.shape, inputs=[s32, a32]) + wp.launch(k, dim=a64.shape, inputs=[s64, a64]) + + print(a16) + print(a32) + print(a64) + +We can see the effect of using different floating point precision in the output: + +.. code:: text + + [ 3. 5. 7. 9. 11.] + [ 3.0000002 5.0001 7.0002003 9.000299 11.0004 ] + [ 3.0000002 5.0001002 7.0002002 9.0003002 11.0004002] + + +Customize Dimensions +^^^^^^^^^^^^^^^^^^^^ + +Another useful application of dynamic structs is the ability to customize dimensionality. Here, we create structs that work with 2D and 3D data: + +.. code:: python + + # create struct with different vectors and matrix dimensions + def create_struct_nd(dim): + @wp.struct + class S: + v: wp.types.vector(dim, float) + m: wp.types.matrix((dim, dim), float) + return S + + S2 = create_struct_nd(2) + S3 = create_struct_nd(3) + + s2 = S2() + s2.v = (1.0, 2.0) + s2.m = ((2.0, 0.0), + (0.0, 0.5)) + + s3 = S3() + s3.v = (1.0, 2.0, 3.0) + s3.m = ((2.0, 0.0, 0.0), + (0.0, 0.5, 0.0), + (0.0, 0.0, 1.0)) + + # create a generic kernel that works with the different types + @wp.kernel + def k(s: Any, output: wp.array(dtype=Any)): + tid = wp.tid() + x = float(tid) + output[tid] = x * s.v * s.m + + a2 = wp.empty(5, dtype=wp.vec2) + a3 = wp.empty(5, dtype=wp.vec3) + + wp.launch(k, dim=a2.shape, inputs=[s2, a2]) + wp.launch(k, dim=a3.shape, inputs=[s3, a3]) + + print(a2) + print(a3) + +Output: + +.. code:: text + + [[0. 0.] + [2. 1.] + [4. 2.] + [6. 3.] + [8. 4.]] + [[ 0. 0. 0.] + [ 2. 1. 3.] + [ 4. 2. 6.] + [ 6. 3. 9.] + [ 8. 4. 12.]] + + +Module Reloading +~~~~~~~~~~~~~~~~ + +Frequent recompilation can add overhead to a program, especially if the program is creating kernels at runtime. Consider this program: + +.. code:: python + + def create_kernel_with_constant(constant): + @wp.kernel + def k(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] += constant + return k + + a = wp.zeros(5, dtype=float) + + k1 = create_kernel_with_constant(17.0) + wp.launch(k1, dim=a.shape, inputs=[a]) + print(a) + + k2 = create_kernel_with_constant(42.0) + wp.launch(k2, dim=a.shape, inputs=[a]) + print(a) + + k3 = create_kernel_with_constant(-9.0) + wp.launch(k3, dim=a.shape, inputs=[a]) + print(a) + +Kernel creation is interspersed with kernel launches, which forces reloading on each kernel launch: + +.. code:: text + + Module __main__ 96db544 load on device 'cuda:0' took 165.46 ms (compiled) + [17. 17. 17. 17. 17.] + Module __main__ 9f609a4 load on device 'cuda:0' took 151.69 ms (compiled) + [59. 59. 59. 59. 59.] 
+ Module __main__ e93fbb9 load on device 'cuda:0' took 167.84 ms (compiled) + [50. 50. 50. 50. 50.] + +To avoid reloading, all kernels should be created before launching them: + +.. code:: python + + def create_kernel_with_constant(constant): + @wp.kernel + def k(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] += constant + return k + + k1 = create_kernel_with_constant(17.0) + k2 = create_kernel_with_constant(42.0) + k3 = create_kernel_with_constant(-9.0) + + a = wp.zeros(5, dtype=float) + + wp.launch(k1, dim=a.shape, inputs=[a]) + print(a) + + wp.launch(k2, dim=a.shape, inputs=[a]) + print(a) + + wp.launch(k3, dim=a.shape, inputs=[a]) + print(a) + +.. code:: text + + Module __main__ e93fbb9 load on device 'cuda:0' took 164.87 ms (compiled) + [17. 17. 17. 17. 17.] + [59. 59. 59. 59. 59.] + [50. 50. 50. 50. 50.] + +Redefining identical kernels, functions, and structs should not cause module reloading, since Warp is able to detect duplicates: + +.. code:: python + + def create_struct(dtype): + @wp.struct + class S: + a: dtype + b: dtype + return S + + def create_function(dtype, S): + @wp.func + def f(s: S): + return s.a * s.b + return f + + def create_kernel(dtype, S, f, C): + @wp.kernel + def k(a: wp.array(dtype=dtype)): + tid = wp.tid() + s = S(a[tid], C) + a[tid] = f(s) + return k + + # create identical struct, function, and kernel in a loop + for i in range(3): + S = create_struct(float) + f = create_function(float, S) + k = create_kernel(float, S, f, 3.0) + + a = wp.array([1, 2, 3, 4, 5], dtype=float) + + wp.launch(k, dim=a.shape, inputs=[a]) + print(a) + +Even though struct ``S``, function ``f``, and kernel ``k`` are re-created in each iteration of the loop, they are duplicates so the module is only loaded once: + +.. code:: text + + Module __main__ 4af2d60 load on device 'cuda:0' took 181.34 ms (compiled) + [ 3. 6. 9. 12. 15.] + [ 3. 6. 9. 12. 15.] + [ 3. 6. 9. 12. 15.] + + +.. _late_binding: + +Late Binding and Static Expressions +----------------------------------- + +Python uses late binding, which means that variables can be referenced in a function before they are defined: + +.. code:: python + + def k(): + # Function f() and constant C are not defined yet. + # They will be resolved when k() is called. + print(f() + C) + + def f(): + return 42 + + C = 17 + + # late binding occurs in this call + k() + +Warp follows this convention by default, because it's the Pythonic way. Here is a similar program written in Warp: + +.. code:: python + + @wp.kernel + def k(): + # Function f() and constant C are not defined yet. + # They will be resolved when k() is called. + print(f() + C) + + @wp.func + def f(): + return 42 + + C = 17 + + # late binding occurs in this launch, when the module is compiled + wp.launch(k, dim=1) + + # wait for the output + wp.synchronize_device() + +Late binding is often convenient, but it can sometimes lead to surprising results. Consider this snippet, which creates kernels in a loop. The kernels reference the loop variable as a constant. + +.. code:: python + + # create a list of kernels that use the loop variable + kernels = [] + for i in range(3): + @wp.kernel + def k(): + print(i) + kernels.append(k) + + # launch the kernels + for k in kernels: + wp.launch(k, dim=1) + + wp.synchronize_device() + +This prints: + +.. code:: text + + 2 + 2 + 2 + +This might be surprising, but creating a similar program in pure Python would lead to the same results. Because of late binding, the captured loop variable ``i`` is not evaluated until the kernels are launched. 
At that moment, the value of ``i`` is 2 and we see the same output from each kernel. + +In Warp, ``wp.static()`` can be used to get around this problem: + +.. code:: python + + # create a list of kernels that use the loop variable + kernels = [] + for i in range(3): + @wp.kernel + def k(): + print(wp.static(i)) # wp.static() for the win + kernels.append(k) + + # launch the kernels + for k in kernels: + wp.launch(k, dim=1) + + wp.synchronize_device() + +Warp replaces the call to ``wp.static()`` with the value of the expression passed as its argument. The expression is evaluated immediately at the time of kernel definition. This is similar to static binding used by languages like C++, which means that all variables referenced by the static expression must already be defined. + +To further illustrate the difference between the default late binding behavior and static expressions, consider this program: + +.. code:: python + + C = 17 + + @wp.kernel + def k1(): + print(C) + + @wp.kernel + def k2(): + print(wp.static(C)) + + # redefine constant + C = 42 + + wp.launch(k1, dim=1) + wp.launch(k2, dim=1) + + wp.synchronize_device() + +Output: + +.. code:: text + + 42 + 17 + +Kernel ``k1`` uses late binding of ``C``. This means that it captures the latest value of ``C``, determined when the module is built during the launch. Kernel ``k2`` consumes ``C`` in a static expression, thus it captures the value of ``C`` when the kernel is defined. + +The same rules apply to resolving Warp functions: + +.. code:: python + + @wp.func + def f(): + return 17 + + @wp.kernel + def k1(): + print(f()) + + @wp.kernel + def k2(): + print(wp.static(f)()) + + # redefine function + @wp.func + def f(): + return 42 + + wp.launch(k1, dim=1) + wp.launch(k2, dim=1) + + wp.synchronize_device() + +Output: + +.. code:: text + + 42 + 17 + +Kernel ``k1`` uses the latest definition of function ``f``, while kernel ``k2`` uses the definition of ``f`` when the kernel was declared. diff --git a/docs/configuration.rst b/docs/configuration.rst index 04ac2d2c..b054d5d8 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -7,6 +7,8 @@ Warp has settings at the global, module, and kernel level that can be used to fi of Warp programs. In cases in which a setting can be changed at multiple levels (e.g.: ``enable_backward``), the setting at the more-specific scope takes precedence. +.. _global-settings: + Global Settings --------------- diff --git a/docs/index.rst b/docs/index.rst index ac324f32..4338cb9f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,7 @@ regular Python functions and JIT compiles them to efficient kernel code that can Warp is designed for `spatial computing `_ and comes with a rich set of primitives that make it easy to write programs for physics simulation, perception, robotics, and geometry processing. In addition, Warp kernels -are differentiable and can be used as part of machine-learning pipelines with frameworks such as PyTorch and JAX. +are differentiable and can be used as part of machine-learning pipelines with frameworks such as PyTorch, JAX and Paddle. Below are some examples of simulations implemented using Warp: @@ -320,8 +320,7 @@ Contributing Contributions and pull requests from the community are welcome and are taken under the terms described in the **Feedback** section of `LICENSE.md `__. -`CONTRIBUTING.md `_ provides additional information on -how to open a pull request for Warp. 
+Please see the :doc:`modules/contribution_guide` for more information on contributing to the development of Warp. Citing ------ @@ -356,12 +355,14 @@ Full Table of Contents configuration debugging limitations + modules/contribution_guide faq .. toctree:: :maxdepth: 2 :caption: Advanced Topics + codegen modules/allocators modules/concurrency profiling diff --git a/docs/installation.rst b/docs/installation.rst index 016109f0..b432a326 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -25,11 +25,11 @@ the ``pip install`` command, e.g. * - Platform - Install Command * - Linux aarch64 - - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.3.3/warp_lang-1.3.3+cu11-py3-none-manylinux2014_aarch64.whl`` + - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_aarch64.whl`` * - Linux x86-64 - - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.3.3/warp_lang-1.3.3+cu11-py3-none-manylinux2014_x86_64.whl`` + - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_x86_64.whl`` * - Windows x86-64 - - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.3.3/warp_lang-1.3.3+cu11-py3-none-win_amd64.whl`` + - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-win_amd64.whl`` The ``--force-reinstall`` option may need to be used to overwrite a previous installation. @@ -76,6 +76,7 @@ The following optional dependencies are required to support certain features: * `usd-core `_: Required for some Warp examples, ``warp.sim.parse_usd()``, and ``warp.render.UsdRenderer``. * `JAX `_: Required for JAX interoperability (see :ref:`jax-interop`). * `PyTorch `_: Required for PyTorch interoperability (see :ref:`pytorch-interop`). +* `Paddle `_: Required for Paddle interoperability (see :ref:`paddle-interop`). * `NVTX for Python `_: Required to use :class:`wp.ScopedTimer(use_nvtx=True) `. Building the Warp documentation requires: diff --git a/docs/modules/contribution_guide.rst b/docs/modules/contribution_guide.rst new file mode 100644 index 00000000..905de2c3 --- /dev/null +++ b/docs/modules/contribution_guide.rst @@ -0,0 +1,333 @@ +Contribution Guide +================== + +Some ways to contribute to the development of Warp include: + +* Reporting bugs and requesting new features on `GitHub `__. +* Asking questions, sharing your work, or participating in discussion threads on + `GitHub `__ (preferred) or + `Discord `__. +* Adding new examples to the Warp repository. +* Documentation improvements. +* Contributing bug fixes or new features. + +Code Contributions +------------------ + +Code contributions from the community are welcome and are taken under the +terms described in the **Feedback** section of `LICENSE.md `__. + +Contributors are encouraged to first open an issue on GitHub to discuss proposed feature contributions and gauge +potential interest. + +Overview +^^^^^^^^ + +#. Create a fork of the Warp GitHub repository by visiting https://github.com/NVIDIA/warp/fork +#. Clone your fork on your local machine, e.g. ``git clone git@github.com:username/warp.git``. +#. Create a branch to develop your contribution on, e.g. ``git checkout -b mmacklin/cuda-bvh-optimizations``. + + Use the following naming conventions for the branch name: + + * New features: ``username/feature-name`` + * Bug fixes: ``bugfix/feature-name`` + +#. Make your desired changes. 
+ + * Please familiarize yourself with the :ref:`coding-guidelines`. + * Ensure that code changes pass :ref:`linting and formatting checks `. + * Test cases should be written to verify correctness (:ref:`testing-warp`). + * Documentation should be added for new features (:ref:`building-docs`). + * Add an entry to the unreleased section at the top of the + `CHANGELOG.md `__ describing the changes. + +#. Push your branch to your GitHub fork, e.g. ``git push origin username/feature-name``. +#. Submit a pull request on GitHub to the ``main`` branch (:ref:`pull-requests`). + Work with reviewers to ensure the pull request is in a state suitable for merging. + +.. _coding-guidelines: + +General Coding Guidelines +^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Follow `PEP 8 `__ as the baseline for coding style, but prioritize matching the + existing style and conventions of the file being modified to maintain consistency. +* Use `snake case `__ for all function names. +* Use `Google-style docstrings `__ + for Python code. +* Include the NVIDIA copyright header on all newly created files, updating the year to current year at the time of + the initial file creation. +* Aim for consistency in variable and function names. + + * Use the existing terminology when possible when naming new functions (e.g. use ``points`` instead of ``vertex_buffer``). + * Don't introduce new abbreviations if one already exists in the code base. + * Also be mindful of consistency and clarity when naming local function variables. + +* Avoid generic function names like ``get_data()``. +* Follow the existing style conventions in any CUDA C++ files being modified. +* Use both ``inputs`` and ``outputs`` parameters in ``wp.launch()`` in functions that are expected to be used in + differentiable programming applications to aid in visualization and debugging tools. + +.. _linting-and-formatting: + +Linting and Formatting +^^^^^^^^^^^^^^^^^^^^^^ + +`Ruff `__ is used as the linter and code formatter for Python code in the Warp repository. +The contents of pull requests will automatically be checked to ensure adherence to our formatting and linting standards. + +We recommend first running Ruff locally on your branch prior to opening a pull request. +From the project root, run: + +.. code-block:: bash + + pip install pre-commit + pre-commit run --all + +This command will attempt to fix any lint violations and then format the code. + +To run Ruff checks at the same time as ``git commit``, pre-commit hooks can be installed by running this command in the project root: + +.. code-block:: bash + + pre-commit install + +.. _building-docs: + +Building the Documentation +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Sphinx documentation can be built by running the following from the project root: + +.. code-block:: bash + + pip install -r docs/requirements.txt + python build_docs.py + +This command also regenerates the stub file (``warp/stubs.py``) and the reStructuredText file for the +:doc:`functions` page. After building the documentation, it is recommended to run a ``git status`` to +check if your changes have modified these files. If so, please commit the modified files to your branch. + +.. note:: In the future, Warp needs to be built at least once prior to building the documentation. + +.. _pull-requests: + +Pull Request Guidelines +^^^^^^^^^^^^^^^^^^^^^^^ + +* Ensure your pull request has a descriptive title that clearly states the purpose of the changes. +* Include a brief description covering: + + * Summary of changes. + * Areas affected by the changes. 
+ * The problem being solved. + * Any limitations or non-handled areas in the changes. + * Any existing GitHub issues being addressed by the changes. + +.. _testing-warp: + +Testing Warp +------------ + +Running the Test Suite +^^^^^^^^^^^^^^^^^^^^^^ + +Warp's test suite uses the `unittest `__ unit testing framework, +along with `unittest-parallel `__ to run tests in parallel. + +The majority of the Warp tests are located in the `warp/tests `__ +directory. As part of the test suite, most examples in the ``warp/examples`` subdirectories are tested via +`test_examples.py `__. + +After building and installing Warp (``pip install -e .`` from the project root), run the test suite using +``python -m warp.tests``. The tests should take 5–10 minutes to run. By default, only the test modules +defined in ``default_suite()`` (in ``warp/tests/unittest_suites.py``) are run. To run the test suite +using `test discovery `__, use +``python -m warp.tests -s autodetect``, which will discover tests in modules matching the path +``warp/tests/test*.py``. + +Running a subset of tests +""""""""""""""""""""""""" + +Instead of running the full test suite, there are two main ways to select a subset of tests to run. +These options must be used with the ``-s autodetect`` option. + +Use ``-p PATTERN`` to define a pattern to match test files. +For example, to run only tests that have ``mesh`` in the file name, use: + +.. code-block:: bash + + python -m warp.tests -s autodetect -p '*mesh*.py' + +Use ``-k TESTNAMEPATTERNS`` to define `wildcard test name patterns `__. +This option can be used multiple times. +For example, to run only tests that have either ``mgpu`` or ``cuda`` in their name, use: + +.. code-block:: bash + + python -m warp.tests -s autodetect -k 'mgpu' -k 'cuda' + +Adding New Tests +^^^^^^^^^^^^^^^^ + +For tests that should be run on multiple devices, e.g. ``"cpu"``, ``"cuda:0"``, and ``"cuda:1"``, we recommend +first defining a test function at the module scope and then using ``add_function_test()`` to add multiple +test methods (a separate method for each device) to a test class. + +.. code-block:: python + + # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + # NVIDIA CORPORATION and its licensors retain all intellectual property + # and proprietary rights in and to this software, related documentation + # and any modifications thereto. Any use, reproduction, disclosure or + # distribution of this software and related documentation without an express + # license agreement from NVIDIA CORPORATION is strictly prohibited. + + import unittest + + import warp as wp + from warp.tests.unittest_utils import * + + + def test_amazing_code_test_one(test, device): + pass + + devices = get_test_devices() + + + class TestAmazingCode(unittest.TestCase): + pass + + add_function_test(TestAmazingCode, "test_amazing_code_test_one", test_amazing_code_test_one, devices=devices) + + + if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) + +If we directly run this module, we get the following output: + +.. code-block:: bash + + python test_amazing_code.py + Warp 1.3.1 initialized: + CUDA Toolkit 12.6, Driver 12.6 + Devices: + "cpu" : "x86_64" + "cuda:0" : "NVIDIA GeForce RTX 3090" (24 GiB, sm_86, mempool enabled) + "cuda:1" : "NVIDIA GeForce RTX 3090" (24 GiB, sm_86, mempool enabled) + CUDA peer access: + Supported fully (all-directional) + Kernel cache: + /home/nvidia/.cache/warp/1.3.1 + test_amazing_code_test_one_cpu (__main__.TestAmazingCode) ... 
ok
+    test_amazing_code_test_one_cuda_0 (__main__.TestAmazingCode) ... ok
+    test_amazing_code_test_one_cuda_1 (__main__.TestAmazingCode) ... ok
+
+    ----------------------------------------------------------------------
+    Ran 3 tests in 0.001s
+
+    OK
+
+Note that the output indicated that three tests were run, even though we only wrote a single test function called
+``test_amazing_code_test_one()``.
+A closer inspection reveals that the test function was run on three separate devices: ``"cpu"``, ``"cuda:0"``, and
+``"cuda:1"``. This is a result of calling ``add_function_test()`` in our test script with the ``devices=devices`` argument.
+``add_function_test()`` is defined in ``warp/tests/unittest_utils.py``.
+
+A caveat of using ``add_function_test()`` is that it is not by itself sufficient to ensure that the registered test
+function (e.g. ``test_amazing_code_test_one()``) is run on different devices. It is up to the body of the test to make use
+of the ``device`` argument to ensure that data is allocated on, and kernels are run on, the intended device for the
+test, e.g.
+
+.. code-block:: python
+
+    def test_amazing_code_test_one(test, device):
+        with wp.ScopedDevice(device):
+            score = wp.zeros(1, dtype=float, requires_grad=True)
+
+or
+
+.. code-block:: python
+
+    def test_amazing_code_test_one(test, device):
+        score = wp.zeros(1, dtype=float, requires_grad=True, device=device)
+
+Checking for Expected Behaviors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Due to the use of the test-registration function ``add_function_test()``, the ``test`` parameter actually refers to the
+instance of the test class, which always subclasses ``unittest.TestCase``.
+
+The ``unittest`` library also provides methods to check that exceptions are raised, since it is also important to test
+code paths that trigger errors. The `assertRaises() `__
+and `assertRaisesRegex() `__
+methods can be used to test that a block of code correctly raises an exception.
+
+Sometimes we need to compare the contents of a Warp array with an expected result.
+Some functions that are helpful include:
+
+* ``assert_np_equal()``: Accepts two NumPy arrays as input parameters along with an optional absolute tolerance ``tol``
+  that defaults to 0. If the tolerance is 0, the arrays are compared using ``np.testing.assert_array_equal()``. Otherwise,
+  both NumPy arrays are flattened and compared with ``np.testing.assert_allclose()``.
+* ``assert_array_equal()``: Accepts two Warp arrays as input parameters, converts each array to a NumPy array on the
+  CPU, and then compares the arrays using ``np.testing.assert_equal()``.
+* ``wp.expect_eq()``: Unlike the previous two functions, the arrays are compared by running a Warp kernel
+  so that the data can remain on the GPU. This is important when an array is so large that an element-wise
+  comparison on the CPU would be prohibitively slow.
+
+Skipping Tests
+^^^^^^^^^^^^^^
+
+Warp needs to be tested on multiple operating systems, including macOS, on which NVIDIA GPUs are not supported.
+When it is not possible for a particular test to be executed on *any* device, there are some mechanisms to mark the
+test as *skipped*.
+
+``unittest`` provides some `methods `__
+to skip a test.
+
+If the test function is added to a test class using ``add_function_test()``, we can pass an empty list as the argument
+to the ``devices`` parameter.
+
+The final common technique is to avoid calling ``add_function_test()`` on a test function in order to skip it, as sketched below.
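A common reason to do this is that the test depends on an optional package that may not be installed. The following is
a minimal sketch of that conditional-registration pattern; the module layout mirrors the ``TestAmazingCode`` example
above, but the ``torch`` dependency check, the test body, and all names are hypothetical rather than copied from the
actual test files:

.. code-block:: python

    import unittest

    import warp as wp
    from warp.tests.unittest_utils import *

    # Hypothetical optional-dependency check: if the import fails, the test below is
    # never registered, so it is neither run nor reported as skipped.
    try:
        import torch  # noqa: F401

        torch_available = True
    except ImportError:
        torch_available = False

    devices = get_test_devices()


    def test_torch_roundtrip(test, device):
        # Hypothetical test body: round-trip a Warp array through a Torch tensor.
        a = wp.array([1.0, 2.0, 3.0], dtype=float, device=device)
        t = wp.to_torch(a)
        assert_np_equal(wp.from_torch(t).numpy(), a.numpy())


    class TestOptionalDependency(unittest.TestCase):
        pass


    if torch_available:
        add_function_test(TestOptionalDependency, "test_torch_roundtrip", test_torch_roundtrip, devices=devices)


    if __name__ == "__main__":
        wp.clear_kernel_cache()
        unittest.main(verbosity=2)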
+Examples are `test_torch.py `__, +`test_jax.py `__, and +`test_dlpack.py `__. +This technique is discouraged because the test is not marked as skipped in the ``unittest`` framework. +Instead, the test is treated as if it does not exist. +This can create a situation in which we are unaware that a test is being skipped because it does not show up under the +skipped tests count (it doesn't show up under the passed tests count, either). + +Besides the situation in which a test requires CUDA, some examples for skipping tests are: + +* ``usd-core`` is not installed in the current environment. +* The installed JAX version is too old. +* Warp was not built with CUTLASS support (e.g. `python build_lib.py --quick`). +* The system does not have at least two CUDA devices available (e.g. required for a multi-GPU test). + +Tests Without a Device +^^^^^^^^^^^^^^^^^^^^^^ + +Recall that we previously discussed the use of ``add_function_test()`` to register a test function so that it can be +run on different devices (e.g. ``"cpu"`` and ``"cuda:0"``). +Sometimes, a test function doesn't make use of a specific device and we only want to run it a single time. + +If we still want to use ``add_function_test()`` to register the test, we can pass ``devices=None`` to indicate that the +function does not make use of devices. In this case, the function will be registered only a single time to the test +class passed to ``add_function_test()``. + +An alternative is to avoid the use of ``add_function_test()`` altogether and define the test function inside the +test class *directly*. +Taking our previous example with ``TestAmazingCode``, instead of the class body simply being +``pass``, we can add a device-agnostic function: + +.. code-block:: python + + class TestAmazingCode(unittest.TestCase): + def test_amazing_code_no_device(self): + self.assertEqual(True, True) + +This technique can be more readable to some developers because it avoids the obfuscation of +``add_function_test(..., device=None)``. +After all, ``add_function_test()`` is used to facilitate the execution of a single test function on different devices +instead of having to define a separate function for each device. diff --git a/docs/modules/differentiability.rst b/docs/modules/differentiability.rst index 81145d8d..3f1b8243 100644 --- a/docs/modules/differentiability.rst +++ b/docs/modules/differentiability.rst @@ -176,6 +176,7 @@ When we run simulations independently in parallel, the Jacobian corresponding to tape.zero() +.. _custom-gradient-functions: Custom Gradient Functions ######################### diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index ffd87dc9..30a9fd80 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -836,29 +836,56 @@ Tile Primitives :returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype -.. py:function:: tile_load(a: Array[Any], x: int32, y: int32, m: int32, n: int32) -> Tile +.. py:function:: tile_load(a: Array[Any], i: int32, n: int32) -> Tile - Loads a tile from a global memory array. + Loads a 1D tile from a global memory array. This method will cooperatively load a tile from global memory using all threads in the block. 
:param a: The source array in global memory - :param x: Offset in the source array measured in multiples of ``m``, i.e.: ``i=x*m`` - :param y: Offset in the source array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param i: Offset in the source array measured in multiples of ``n``, i.e.: ``offset=i*n`` + :param n: The number of elements in the tile + :returns: A tile with ``shape=(1,n)`` and dtype the same as the source array + + +.. py:function:: tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32) -> Tile + :noindex: + :nocontentsentry: + + Loads a 2D tile from a global memory array. + + This method will cooperatively load a tile from global memory using all threads in the block. + + :param a: The source array in global memory + :param i: Offset in the source array measured in multiples of ``m``, i.e.: ``row=i*m`` + :param j: Offset in the source array measured in multiples of ``n``, i.e.; ``col=j*n`` :param m: The size of the tile's first dimension - :param n: The size of the tile's second dimensions + :param n: The size of the tile's second dimension :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array -.. py:function:: tile_store(a: Array[Any], x: int32, y: int32, t: Any) -> None +.. py:function:: tile_store(a: Array[Any], i: int32, t: Any) -> None + + Stores a 1D tile to a global memory array. + + This method will cooperatively store a tile to global memory using all threads in the block. + + :param a: The destination array in global memory + :param i: Offset in the destination array measured in multiples of ``n``, i.e.: ``offset=i*n`` + :param t: The source tile to store data from, must have the same dtype as the destination array + + +.. py:function:: tile_store(a: Array[Any], i: int32, j: int32, t: Any) -> None + :noindex: + :nocontentsentry: Stores a tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. :param a: The destination array in global memory - :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*m`` - :param y: Offset in the destination array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param i: Offset in the destination array measured in multiples of ``m``, i.e.: ``row=i*m`` + :param j: Offset in the destination array measured in multiples of ``n``, i.e.; ``col=j*n`` :param t: The source tile to store data from, must have the same dtype as the destination array @@ -879,8 +906,11 @@ Tile Primitives This function converts values computed using scalar kernel code to a tile representation for input into collective operations. + * If the input value is a scalar then the resulting tile has ``shape=(1, block_dim)`` + * If the input value is a vector then the resulting tile has ``shape=(length(vector), block_dim)`` + :param x: A per-thread local value, e.g.: scalar, vector, or matrix. - :returns: A tile with ``shape=(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch()``. + :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim`` This example shows how to create a linear sequence from thread variables: @@ -898,7 +928,8 @@ Tile Primitives .. code-block:: text - tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] + tile(m=1, n=16, storage=register) = [[0 2 4 6 8 ...]] + @@ -908,6 +939,9 @@ Tile Primitives This function converts a block-wide tile back to per-thread values. 
+ * If the input tile is 1-dimensional then the resulting value will be a per-thread scalar + * If the input tile is 2-dimensional then the the resulting value will be a per-thread vector of length M + :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile @@ -964,6 +998,16 @@ Tile Primitives :returns: Tile with ``shape=(N,M)`` +.. py:function:: tile_broadcast(a: Tile, m: int32, n: int32) -> Tile + + Broadcast a tile. + + This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. + + :param a: Tile to broadcast + :returns: Tile with broadcast ``shape=(m, n)`` + + .. py:function:: tile_sum(a: Tile) -> Tile Cooperatively compute the sum the tile elements using all threads in the block. @@ -1529,6 +1573,8 @@ Utility Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1536,6 +1582,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1543,6 +1591,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1550,6 +1600,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1557,6 +1609,8 @@ Utility Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1564,6 +1618,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1571,6 +1627,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1578,6 +1636,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. 
py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1585,6 +1645,8 @@ Utility Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1592,6 +1654,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1599,6 +1663,8 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1606,11 +1672,15 @@ Utility Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: Array[Any], i: int32, value: Any) -> Any Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1618,6 +1688,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1625,6 +1697,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1632,6 +1706,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1639,6 +1715,8 @@ Utility Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1646,6 +1724,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1653,6 +1733,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1660,6 +1742,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any :noindex: @@ -1667,6 +1751,8 @@ Utility Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any :noindex: @@ -1674,6 +1760,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any :noindex: @@ -1681,6 +1769,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any :noindex: @@ -1688,6 +1778,8 @@ Utility Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + .. py:function:: lerp(a: Float, b: Float, t: Float) -> Float @@ -2581,5 +2673,22 @@ Operators :nocontentsentry: + + +Code Generation +--------------- +.. py:function:: static(expr: Any) -> Any + + Evaluates a static Python expression and replaces it with its result. + + See the `codegen.html#static-expressions
`_ for more details. + + Note: + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions + (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). + + .. rubric:: Footnotes .. [1] Function gradients have not been implemented for backpropagation. diff --git a/docs/modules/interoperability.rst b/docs/modules/interoperability.rst index 800d4e79..ef215f7c 100644 --- a/docs/modules/interoperability.rst +++ b/docs/modules/interoperability.rst @@ -709,6 +709,7 @@ The canonical way to export a Warp array to an external framework is to use the jax_array = jax.dlpack.from_dlpack(warp_array) torch_tensor = torch.utils.dlpack.from_dlpack(warp_array) + paddle_tensor = paddle.utils.dlpack.from_dlpack(warp_array) For CUDA arrays, this will synchronize the current stream of the consumer framework with the current Warp stream on the array's device. Thus it should be safe to use the wrapped array in the consumer framework, even if the array was previously used in a Warp kernel @@ -719,9 +720,11 @@ This approach may be used for older versions of frameworks that do not support t warp_array1 = wp.from_dlpack(jax.dlpack.to_dlpack(jax_array)) warp_array2 = wp.from_dlpack(torch.utils.dlpack.to_dlpack(torch_tensor)) + warp_array3 = wp.from_dlpack(paddle.utils.dlpack.to_dlpack(paddle_tensor)) jax_array = jax.dlpack.from_dlpack(wp.to_dlpack(warp_array)) torch_tensor = torch.utils.dlpack.from_dlpack(wp.to_dlpack(warp_array)) + paddle_tensor = paddle.utils.dlpack.from_dlpack(wp.to_dlpack(warp_array)) This approach is generally faster because it skips any stream synchronization, but another solution must be used to ensure correct ordering of operations. In situations where no synchronization is required, using this approach can yield better performance. @@ -733,3 +736,181 @@ This may be a good choice in situations like these: .. autofunction:: warp.from_dlpack .. autofunction:: warp.to_dlpack + +.. _paddle-interop: + +Paddle +------ + +Warp provides helper functions to convert arrays to/from Paddle:: + + w = wp.array([1.0, 2.0, 3.0], dtype=float, device="cpu") + + # convert to Paddle tensor + t = wp.to_paddle(w) + + # convert from Paddle tensor + w = wp.from_paddle(t) + +These helper functions allow the conversion of Warp arrays to/from Paddle tensors without copying the underlying data. +At the same time, if available, gradient arrays and tensors are converted to/from Paddle autograd tensors, allowing the use of Warp arrays +in Paddle autograd computations. + +.. autofunction:: warp.from_paddle +.. autofunction:: warp.to_paddle +.. autofunction:: warp.device_from_paddle +.. autofunction:: warp.device_to_paddle +.. autofunction:: warp.dtype_from_paddle +.. autofunction:: warp.dtype_to_paddle + +To convert a Paddle CUDA stream to a Warp CUDA stream and vice versa, Warp provides the following functions: + +.. 
autofunction:: warp.stream_from_paddle + +Example: Optimization using ``warp.from_paddle()`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An example usage of minimizing a loss function over an array of 2D points written in Warp via Paddle's Adam optimizer +using :func:`warp.from_paddle` is as follows:: + + import warp as wp + import paddle + + # init warp context at beginning + wp.context.init() + + @wp.kernel() + def loss(xs: wp.array(dtype=float, ndim=2), l: wp.array(dtype=float)): + tid = wp.tid() + wp.atomic_add(l, 0, xs[tid, 0] ** 2.0 + xs[tid, 1] ** 2.0) + + # indicate requires_grad so that Warp can accumulate gradients in the grad buffers + xs = paddle.randn([100, 2]) + xs.stop_gradient = False + l = paddle.zeros([1]) + l.stop_gradient = False + opt = paddle.optimizer.Adam(learning_rate=0.1, parameters=[xs]) + + wp_xs = wp.from_paddle(xs) + wp_l = wp.from_paddle(l) + + tape = wp.Tape() + with tape: + # record the loss function kernel launch on the tape + wp.launch(loss, dim=len(xs), inputs=[wp_xs], outputs=[wp_l], device=wp_xs.device) + + for i in range(500): + tape.zero() + tape.backward(loss=wp_l) # compute gradients + # now xs.grad will be populated with the gradients computed by Warp + opt.step() # update xs (and thereby wp_xs) + + # these lines are only needed for evaluating the loss + # (the optimization just needs the gradient, not the loss value) + wp_l.zero_() + wp.launch(loss, dim=len(xs), inputs=[wp_xs], outputs=[wp_l], device=wp_xs.device) + print(f"{i}\tloss: {l.item()}") + +Example: Optimization using ``warp.to_paddle`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Less code is needed when we declare the optimization variables directly in Warp and use :func:`warp.to_paddle` to convert them to Paddle tensors. +Here, we revisit the same example from above where now only a single conversion to a paddle tensor is needed to supply Adam with the optimization variables:: + + import warp as wp + import numpy as np + import paddle + + # init warp context at beginning + wp.context.init() + + @wp.kernel() + def loss(xs: wp.array(dtype=float, ndim=2), l: wp.array(dtype=float)): + tid = wp.tid() + wp.atomic_add(l, 0, xs[tid, 0] ** 2.0 + xs[tid, 1] ** 2.0) + + # initialize the optimization variables in Warp + xs = wp.array(np.random.randn(100, 2), dtype=wp.float32, requires_grad=True) + l = wp.zeros(1, dtype=wp.float32, requires_grad=True) + # just a single wp.to_paddle call is needed, Adam optimizes using the Warp array gradients + opt = paddle.optimizer.Adam(learning_rate=0.1, parameters=[wp.to_paddle(xs)]) + + tape = wp.Tape() + with tape: + wp.launch(loss, dim=len(xs), inputs=[xs], outputs=[l], device=xs.device) + + for i in range(500): + tape.zero() + tape.backward(loss=l) + opt.step() + + l.zero_() + wp.launch(loss, dim=len(xs), inputs=[xs], outputs=[l], device=xs.device) + print(f"{i}\tloss: {l.numpy()[0]}") + +Performance Notes +^^^^^^^^^^^^^^^^^ + +The ``wp.from_paddle()`` function creates a Warp array object that shares data with a Paddle tensor. Although this function does not copy the data, there is always some CPU overhead during the conversion. If these conversions happen frequently, the overall program performance may suffer. As a general rule, it's good to avoid repeated conversions of the same tensor. Instead of: + +.. 
code:: python + + x_t = paddle.arange(n, dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + y_t = paddle.ones([n], dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + + for i in range(10): + x_w = wp.from_paddle(x_t) + y_w = wp.from_paddle(y_t) + wp.launch(saxpy, dim=n, inputs=[x_w, y_w, 1.0], device=device) + +Try converting the arrays only once and reuse them: + +.. code:: python + + x_t = paddle.arange(n, dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + y_t = paddle.ones([n], dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + + x_w = wp.from_paddle(x_t) + y_w = wp.from_paddle(y_t) + + for i in range(10): + wp.launch(saxpy, dim=n, inputs=[x_w, y_w, 1.0], device=device) + +If reusing arrays is not possible (e.g., a new Paddle tensor is constructed on every iteration), passing ``return_ctype=True`` to ``wp.from_paddle()`` should yield faster performance. Setting this argument to True avoids constructing a ``wp.array`` object and instead returns a low-level array descriptor. This descriptor is a simple C structure that can be passed to Warp kernels instead of a ``wp.array``, but cannot be used in other places that require a ``wp.array``. + +.. code:: python + + for n in range(1, 10): + # get Paddle tensors for this iteration + x_t = paddle.arange(n, dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + y_t = paddle.ones([n], dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + + # get Warp array descriptors + x_ctype = wp.from_paddle(x_t, return_ctype=True) + y_ctype = wp.from_paddle(y_t, return_ctype=True) + + wp.launch(saxpy, dim=n, inputs=[x_ctype, y_ctype, 1.0], device=device) + +An alternative approach is to pass the Paddle tensors to Warp kernels directly. This avoids constructing temporary Warp arrays by leveraging standard array interfaces (like ``__cuda_array_interface__``) supported by both Paddle and Warp. The main advantage of this approach is convenience, since there is no need to call any conversion functions. The main limitation is that it does not handle gradients, because gradient information is not included in the standard array interfaces. This technique is therefore most suitable for algorithms that do not involve differentiation. + +.. code:: python + + x = paddle.arange(n, dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + y = paddle.ones([n], dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + + for i in range(10): + wp.launch(saxpy, dim=n, inputs=[x, y, 1.0], device=device) + +.. code:: shell + + python -m warp.examples.benchmarks.benchmark_interop_paddle + +Sample output: + +.. code:: + + 13990 ms from_paddle(...) + 5990 ms from_paddle(..., return_ctype=True) + 35167 ms direct from paddle + +The default ``wp.from_paddle()`` conversion is the slowest. Passing ``return_ctype=True`` is the fastest, because it skips creating temporary Warp array objects. Passing Paddle tensors to Warp kernels directly falls somewhere in between. It skips creating temporary Warp arrays, but accessing the ``__cuda_array_interface__`` attributes of Paddle tensors adds overhead because they are initialized on-demand. 
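For reference, the snippets in this section launch a ``saxpy`` kernel that is not defined here. A minimal sketch
consistent with the ``wp.launch(saxpy, dim=n, inputs=[x, y, 1.0], device=device)`` calls above (and not taken from the
actual benchmark source) could look like:

.. code:: python

    import warp as wp


    @wp.kernel
    def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), a: float):
        # y = a*x + y, computed element-wise with one thread per element
        i = wp.tid()
        y[i] = a * x[i] + y[i]

The same kernel can be launched with regular Warp arrays, with the low-level descriptors returned by
``wp.from_paddle(..., return_ctype=True)``, or with Paddle tensors passed directly, as shown in the examples above.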
diff --git a/docs/modules/runtime.rst b/docs/modules/runtime.rst index 05c63d43..aa628608 100644 --- a/docs/modules/runtime.rst +++ b/docs/modules/runtime.rst @@ -47,67 +47,7 @@ generated compilation artifacts as Warp does not automatically try to keep the c Runtime Kernel Creation ####################### -It is often desirable to specialize kernels for different types, constants, or functions at runtime. -We can achieve this through the use of runtime kernel specialization using Python closures. - -For example, we might require a variety of kernels that execute particular functions for each item in an array. -We might also want this function call to be valid for a variety of data types. Making use of closure and generics, we can generate -these kernels using a single kernel definition:: - - def make_kernel(func, dtype): - def closure_kernel_fn(data: wp.array(dtype=dtype), out: wp.array(dtype=dtype)): - tid = wp.tid() - out[tid] = func(data[tid]) - - return wp.Kernel(closure_kernel_fn) - -In practice, we might use our kernel generator, ``make_kernel()`` as follows:: - - @wp.func - def sqr(x: Any) -> Any: - return x * x - - @wp.func - def cube(x: Any) -> Any: - return sqr(x) * x - - sqr_float = make_kernel(sqr, wp.float32) - cube_double = make_kernel(cube, wp.float64) - - arr = [1.0, 2.0, 3.0] - N = len(arr) - - data_float = wp.array(arr, dtype=wp.float32, device=device) - data_double = wp.array(arr, dtype=wp.float64, device=device) - - out_float = wp.zeros(N, dtype=wp.float32, device=device) - out_double = wp.zeros(N, dtype=wp.float64, device=device) - - wp.launch(sqr_float, dim=N, inputs=[data_float], outputs=[out_float], device=device) - wp.launch(cube_double, dim=N, inputs=[data_double], outputs=[out_double], device=device) - -We can specialize kernel definitions over Warp constants similarly. The following generates kernels that add a specified constant -to a generic-typed array value:: - - def make_add_kernel(key, constant): - def closure_kernel_fn(data: wp.array(dtype=Any), out: wp.array(dtype=Any)): - tid = wp.tid() - out[tid] = data[tid] + constant - - return wp.Kernel(closure_kernel_fn, key=key) - - add_ones_int = make_add_kernel("add_one", wp.constant(1)) - add_ones_vec3 = make_add_kernel("add_ones_vec3", wp.constant(wp.vec3(1.0, 1.0, 1.0))) - - a = wp.zeros(2, dtype=int) - b = wp.zeros(2, dtype=wp.vec3) - - a_out = wp.zeros_like(a) - b_out = wp.zeros_like(b) - - wp.launch(add_ones_int, dim=a.size, inputs=[a], outputs=[a_out], device=device) - wp.launch(add_ones_vec3, dim=b.size, inputs=[b], outputs=[b_out], device=device) - +Warp allows generating kernels on-the-fly with various customizations, including closure support. Refer to the :ref:`Code Generation` section for the latest features. .. _Arrays: @@ -684,12 +624,15 @@ This can be surprising for users that are accustomed to C-style conversions but Users should explicitly cast variables to compatible types using constructors like ``int()``, ``float()``, ``wp.float16()``, ``wp.uint8()``, etc. +.. note:: + For performance reasons, Warp relies on native compilers to perform numeric conversions (e.g., LLVM for CPU and NVRTC for CUDA). This is generally not a problem, but in some cases the results may vary on different devices. For example, the conversion ``wp.uint8(-1.0)`` results in undefined behavior, since the floating point value -1.0 is out of range for unsigned integer types. C++ compilers are free to handle such cases as they see fit. 
Numeric conversions are only guaranteed to produce correct results when the value being converted is in the range supported by the target data type. + Constants --------- -In general, Warp kernels cannot access variables in the global Python interpreter state. One exception to this is for compile-time constants, which may be declared globally (or as class attributes) and folded into the kernel definition. +A Warp kernel can access Python variables defined outside of the kernel, which are treated as compile-time constants inside of the kernel. -Constants are defined using the ``wp.constant()`` function. An example is shown below:: +.. code:: python TYPE_SPHERE = wp.constant(0) TYPE_CUBE = wp.constant(1) @@ -700,15 +643,16 @@ Constants are defined using the ``wp.constant()`` function. An example is shown t = geometry[wp.tid()] - if (t == TYPE_SPHERE): + if t == TYPE_SPHERE: print("sphere") - if (t == TYPE_CUBE): + elif t == TYPE_CUBE: print("cube") - if (t == TYPE_CAPSULE): + elif t == TYPE_CAPSULE: print("capsule") +Note that using ``wp.constant()`` is no longer required, but it performs some type checking and can serve as a reminder that the variables are meant to be used as Warp constants. -.. autoclass:: constant +The behavior is simple and intuitive when the referenced Python variables never change. For details and more complex scenarios, refer to :ref:`External References and Constants`. The :ref:`Code Generation` section contains additional information and tips for advanced usage. Predefined Constants #################### diff --git a/docs/modules/sim.rst b/docs/modules/sim.rst index f6d6ce06..973401ad 100644 --- a/docs/modules/sim.rst +++ b/docs/modules/sim.rst @@ -163,6 +163,9 @@ Integrators .. autoclass:: FeatherstoneIntegrator :members: +.. autoclass:: VBDIntegrator + :members: + Importers --------- diff --git a/docs/requirements.txt b/docs/requirements.txt index d63c016a..b8b6bd59 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ -furo==2024.7.18 -sphinx==7.4.7 +furo==2024.8.6 +sphinx==8.0.2 sphinx_copybutton==0.5.2 -numpy==1.26.4 -ruff==0.5.5 +numpy==2.1.1 +ruff==0.6.8 diff --git a/exts/omni.warp.core/config/extension.toml b/exts/omni.warp.core/config/extension.toml index 8b80e91f..841caf50 100644 --- a/exts/omni.warp.core/config/extension.toml +++ b/exts/omni.warp.core/config/extension.toml @@ -1,6 +1,6 @@ [package] # Semantic Versioning is used: https://semver.org/ -version = "1.3.3" +version = "1.4.0" authors = ["NVIDIA"] title = "Warp Core" description="The core Warp Python module" @@ -38,6 +38,7 @@ pyCoverageOmit = [ "warp/stubs.py", "warp/jax.py", "warp/torch.py", + "warp/paddle.py", "warp/build.py", "warp/build_dll.py", "warp/sim/**", diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md index 73426dca..82fb2e73 100644 --- a/exts/omni.warp.core/docs/CHANGELOG.md +++ b/exts/omni.warp.core/docs/CHANGELOG.md @@ -1,5 +1,73 @@ # CHANGELOG +## [1.4.0] - 2024-10-01 + +### Added + +- Support for a new `wp.static(expr)` function that allows arbitrary Python expressions to be evaluated at the time of + function/kernel definition ([docs](https://nvidia.github.io/warp/codegen.html#static-expressions)). +- Support for stream priorities to hint to the device that it should process pending work + in high-priority streams over pending work in low-priority streams when possible + ([docs](https://nvidia.github.io/warp/modules/concurrency.html#stream-priorities)). 
+- Adaptive sparse grid geometry to `warp.fem` ([docs](https://nvidia.github.io/warp/modules/fem.html#adaptivity)). +- Support for defining `wp.kernel` and `wp.func` objects from within closures. +- Support for defining multiple versions of kernels, functions, and structs without manually assigning unique keys. +- Support for default argument values for user functions decorated with `wp.func`. +- Allow passing custom launch dimensions to `jax_kernel()` ([GH-310](https://github.com/NVIDIA/warp/pull/310)). +- JAX interoperability examples for sharding and matrix multiplication ([docs](https://nvidia.github.io/warp/modules/interoperability.html#using-shardmap-for-distributed-computation)). +- Interoperability support for the PaddlePaddle ML framework ([GH-318](https://github.com/NVIDIA/warp/pull/318)). +- Support `wp.mod()` for vector types ([GH-282](https://github.com/NVIDIA/warp/issues/282)). +- Expose the modulo operator `%` to Python's runtime scalar and vector types. +- Support for fp64 `atomic_add`, `atomic_max`, and `atomic_min` ([GH-284](https://github.com/NVIDIA/warp/issues/284)). +- Support for quaternion indexing (e.g. `q.w`). +- Support shadowing builtin functions ([GH-308](https://github.com/NVIDIA/warp/issues/308)). +- Support for redefining function overloads. +- Add an ocean sample to the `omni.warp` extension. +- `warp.sim.VBDIntegrator` now supports body-particle collision. +- Add a [contributing guide](https://nvidia.github.io/warp/modules/contribution_guide.html) to the Sphinx docs . +- Add documentation for dynamic code generation ([docs](https://nvidia.github.io/warp/codegen.html#dynamic-kernel-creation)). + +### Changed + +- `wp.sim.Model.edge_indices` now includes boundary edges. +- Unexposed `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` from the Python scope. +- Skip unused functions in module code generation, improving performance. +- Avoid reloading modules if their content does not change, improving performance. +- `wp.Mesh.points` is now a property instead of a raw data member, its reference can be changed after the mesh is initialized. +- Improve error message when invalid objects are referenced in a Warp kernel. +- `if`/`else`/`elif` statements with constant conditions are resolved at compile time with no branches being inserted in the generated code. +- Include all non-hidden builtins in the stub file. +- Improve accuracy of symmetric eigenvalues routine in `warp.fem`. + +### Fixed + +- Fix for `wp.func` erroring out when defining a `Tuple` as a return type hint ([GH-302](https://github.com/NVIDIA/warp/issues/302)). +- Fix array in-place op (`+=`, `-=`) adjoints to compute gradients correctly in the backwards pass +- Fix vector, matrix in-place assignment adjoints to compute gradients correctly in the backwards pass, e.g.: `v[1] = x` +- Fix a bug in which Python docstrings would be created as local function variables in generated code. +- Fix a bug with autograd array access validation in functions from different modules. +- Fix a rare crash during error reporting on some systems due to glibc mismatches. +- Handle `--num_tiles 1` in `example_render_opengl.py` ([GH-306](https://github.com/NVIDIA/warp/issues/306)). +- Fix the computation of body contact forces in `FeatherstoneIntegrator` when bodies and particles collide. +- Fix bug in `FeatherstoneIntegrator` where `eval_rigid_jacobian` could give incorrect results or reach an infinite + loop when the body and joint indices were not in the same order. 
Added `Model.joint_ancestor` to fix the indexing + from a joint to its parent joint in the articulation. +- Fix wrong vertex index passed to `add_edges()` called from `ModelBuilder.add_cloth_mesh()` ([GH-319](https://github.com/NVIDIA/warp/issues/319)). +- Add a workaround for uninitialized memory read warning in the `compute-sanitizer` initcheck tool when using `wp.Mesh`. +- Fix name clashes when Warp functions and structs are returned from Python functions multiple times. +- Fix name clashes between Warp functions and structs defined in different modules. +- Fix code generation errors when overloading generic kernels defined in a Python function. +- Fix issues with unrelated functions being treated as overloads (e.g., closures). +- Fix handling of `stream` argument in `array.__dlpack__()`. +- Fix a bug related to reloading CPU modules. +- Fix a crash when kernel functions are not found in CPU modules. +- Fix conditions not being evaluated as expected in `while` statements. +- Fix printing Boolean and 8-bit integer values. +- Fix array interface type strings used for Boolean and 8-bit integer values. +- Fix initialization error when setting struct members. +- Fix Warp not being initialized upon entering a `wp.Tape` context. +- Use `kDLBool` instead of `kDLUInt` for DLPack interop of Booleans. + ## [1.3.3] - 2024-09-04 - Bug fixes diff --git a/exts/omni.warp/config/extension.toml b/exts/omni.warp/config/extension.toml index 8e80c45d..cfebd3b6 100644 --- a/exts/omni.warp/config/extension.toml +++ b/exts/omni.warp/config/extension.toml @@ -1,6 +1,6 @@ [package] # Semantic Versioning is used: https://semver.org/ -version = "1.3.3" +version = "1.4.0" authors = ["NVIDIA"] title = "Warp" description="Warp OmniGraph Nodes and Sample Scenes" @@ -35,7 +35,7 @@ exclude = ["Ogn*Database.py", "*/ogn*"] "omni.timeline" = {} "omni.ui" = {optional = true} "omni.usd" = {} -"omni.warp.core" = {version = "1.3.3", exact = true} +"omni.warp.core" = {version = "1.4.0", exact = true} [[python.module]] name = "omni.warp._extension" diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md index 73426dca..82fb2e73 100644 --- a/exts/omni.warp/docs/CHANGELOG.md +++ b/exts/omni.warp/docs/CHANGELOG.md @@ -1,5 +1,73 @@ # CHANGELOG +## [1.4.0] - 2024-10-01 + +### Added + +- Support for a new `wp.static(expr)` function that allows arbitrary Python expressions to be evaluated at the time of + function/kernel definition ([docs](https://nvidia.github.io/warp/codegen.html#static-expressions)). +- Support for stream priorities to hint to the device that it should process pending work + in high-priority streams over pending work in low-priority streams when possible + ([docs](https://nvidia.github.io/warp/modules/concurrency.html#stream-priorities)). +- Adaptive sparse grid geometry to `warp.fem` ([docs](https://nvidia.github.io/warp/modules/fem.html#adaptivity)). +- Support for defining `wp.kernel` and `wp.func` objects from within closures. +- Support for defining multiple versions of kernels, functions, and structs without manually assigning unique keys. +- Support for default argument values for user functions decorated with `wp.func`. +- Allow passing custom launch dimensions to `jax_kernel()` ([GH-310](https://github.com/NVIDIA/warp/pull/310)). +- JAX interoperability examples for sharding and matrix multiplication ([docs](https://nvidia.github.io/warp/modules/interoperability.html#using-shardmap-for-distributed-computation)). 
+- Interoperability support for the PaddlePaddle ML framework ([GH-318](https://github.com/NVIDIA/warp/pull/318)). +- Support `wp.mod()` for vector types ([GH-282](https://github.com/NVIDIA/warp/issues/282)). +- Expose the modulo operator `%` to Python's runtime scalar and vector types. +- Support for fp64 `atomic_add`, `atomic_max`, and `atomic_min` ([GH-284](https://github.com/NVIDIA/warp/issues/284)). +- Support for quaternion indexing (e.g. `q.w`). +- Support shadowing builtin functions ([GH-308](https://github.com/NVIDIA/warp/issues/308)). +- Support for redefining function overloads. +- Add an ocean sample to the `omni.warp` extension. +- `warp.sim.VBDIntegrator` now supports body-particle collision. +- Add a [contributing guide](https://nvidia.github.io/warp/modules/contribution_guide.html) to the Sphinx docs . +- Add documentation for dynamic code generation ([docs](https://nvidia.github.io/warp/codegen.html#dynamic-kernel-creation)). + +### Changed + +- `wp.sim.Model.edge_indices` now includes boundary edges. +- Unexposed `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` from the Python scope. +- Skip unused functions in module code generation, improving performance. +- Avoid reloading modules if their content does not change, improving performance. +- `wp.Mesh.points` is now a property instead of a raw data member, its reference can be changed after the mesh is initialized. +- Improve error message when invalid objects are referenced in a Warp kernel. +- `if`/`else`/`elif` statements with constant conditions are resolved at compile time with no branches being inserted in the generated code. +- Include all non-hidden builtins in the stub file. +- Improve accuracy of symmetric eigenvalues routine in `warp.fem`. + +### Fixed + +- Fix for `wp.func` erroring out when defining a `Tuple` as a return type hint ([GH-302](https://github.com/NVIDIA/warp/issues/302)). +- Fix array in-place op (`+=`, `-=`) adjoints to compute gradients correctly in the backwards pass +- Fix vector, matrix in-place assignment adjoints to compute gradients correctly in the backwards pass, e.g.: `v[1] = x` +- Fix a bug in which Python docstrings would be created as local function variables in generated code. +- Fix a bug with autograd array access validation in functions from different modules. +- Fix a rare crash during error reporting on some systems due to glibc mismatches. +- Handle `--num_tiles 1` in `example_render_opengl.py` ([GH-306](https://github.com/NVIDIA/warp/issues/306)). +- Fix the computation of body contact forces in `FeatherstoneIntegrator` when bodies and particles collide. +- Fix bug in `FeatherstoneIntegrator` where `eval_rigid_jacobian` could give incorrect results or reach an infinite + loop when the body and joint indices were not in the same order. Added `Model.joint_ancestor` to fix the indexing + from a joint to its parent joint in the articulation. +- Fix wrong vertex index passed to `add_edges()` called from `ModelBuilder.add_cloth_mesh()` ([GH-319](https://github.com/NVIDIA/warp/issues/319)). +- Add a workaround for uninitialized memory read warning in the `compute-sanitizer` initcheck tool when using `wp.Mesh`. +- Fix name clashes when Warp functions and structs are returned from Python functions multiple times. +- Fix name clashes between Warp functions and structs defined in different modules. +- Fix code generation errors when overloading generic kernels defined in a Python function. +- Fix issues with unrelated functions being treated as overloads (e.g., closures). 
+- Fix handling of `stream` argument in `array.__dlpack__()`. +- Fix a bug related to reloading CPU modules. +- Fix a crash when kernel functions are not found in CPU modules. +- Fix conditions not being evaluated as expected in `while` statements. +- Fix printing Boolean and 8-bit integer values. +- Fix array interface type strings used for Boolean and 8-bit integer values. +- Fix initialization error when setting struct members. +- Fix Warp not being initialized upon entering a `wp.Tape` context. +- Use `kDLBool` instead of `kDLUInt` for DLPack interop of Booleans. + ## [1.3.3] - 2024-09-04 - Bug fixes diff --git a/warp/__init__.py b/warp/__init__.py index 8ecda0c1..b051f837 100644 --- a/warp/__init__.py +++ b/warp/__init__.py @@ -100,11 +100,17 @@ from warp.dlpack import from_dlpack, to_dlpack +from warp.paddle import from_paddle, to_paddle +from warp.paddle import dtype_from_paddle, dtype_to_paddle +from warp.paddle import device_from_paddle, device_to_paddle +from warp.paddle import stream_from_paddle + from warp.build import clear_kernel_cache from warp.constants import * from . import builtins +from warp.builtins import static import warp.config as config diff --git a/warp/builtins.py b/warp/builtins.py index 19b1254c..1a940161 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1883,6 +1883,7 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a export=False, ) + def tile_load_1d_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -1892,7 +1893,9 @@ def tile_load_1d_value_func(arg_types, arg_values): raise RuntimeError("tile_load() argument 0 must be an array") if arg_types["a"].ndim != 1: - raise RuntimeError("tile_load() argument 0 must be 1-dimensional if using the ``wp.tile_load(array, i, n)`` syntax.") + raise RuntimeError( + "tile_load() argument 0 must be 1-dimensional if using the ``wp.tile_load(array, i, n)`` syntax." + ) if not type_is_int(arg_types["i"]): raise RuntimeError("tile_load() argument 1 must be an integer") @@ -1901,7 +1904,7 @@ def tile_load_1d_value_func(arg_types, arg_values): raise RuntimeError("'n' keyword argument must be specified when calling tile_load() function") a = arg_types["a"] - m, n = 1, arg_values["n"] + _m, n = 1, arg_values["n"] return TileLoad(a, 1, n) @@ -1918,6 +1921,7 @@ def tile_load_1d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, return ((array, i), template_args) + add_builtin( "tile_load", input_types={"a": array(dtype=Any), "i": int, "n": int}, @@ -1946,7 +1950,9 @@ def tile_load_2d_value_func(arg_types, arg_values): raise RuntimeError("tile_load() argument 0 must be an array") if arg_types["a"].ndim != 2: - raise RuntimeError("tile_load() argument 0 must be 2-dimensional if using the ``wp.tile_load(array, i, j, m, n)`` syntax.") + raise RuntimeError( + "tile_load() argument 0 must be 2-dimensional if using the ``wp.tile_load(array, i, j, m, n)`` syntax." + ) if not type_is_int(arg_types["i"]): raise RuntimeError("tile_load() argument 1 must be an integer") @@ -2013,7 +2019,9 @@ def tile_store_1d_value_func(arg_types, arg_values): raise RuntimeError("tile_store() argument 0 must be an array") if arg_types["a"].ndim != 1: - raise RuntimeError("tile_load() argument 0 must be a 1-dimensional array if using the ``wp.tile_store(array, i, t)`` syntax.") + raise RuntimeError( + "tile_load() argument 0 must be a 1-dimensional array if using the ``wp.tile_store(array, i, t)`` syntax." 
+ ) if not type_is_int(arg_types["i"]): raise RuntimeError("tile_store() argument 1 must be an integer") @@ -2044,6 +2052,7 @@ def tile_store_1d_value_func(arg_types, arg_values): export=False, ) + def tile_store_2d_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -2056,7 +2065,9 @@ def tile_store_2d_value_func(arg_types, arg_values): raise RuntimeError("tile_store() argument 0 must be an array") if arg_types["a"].ndim != 2: - raise RuntimeError("tile_load() argument 0 must be a 2-dimensional array if using the ``wp.tile_store(array, i, j, t)`` syntax.") + raise RuntimeError( + "tile_load() argument 0 must be a 2-dimensional array if using the ``wp.tile_store(array, i, j, t)`` syntax." + ) if not type_is_int(arg_types["i"]): raise RuntimeError("tile_store() argument 1 must be an integer") @@ -2343,6 +2354,7 @@ def tile_transpose_value_func(arg_types, arg_values): export=False, ) + def tile_broadcast_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -2364,7 +2376,9 @@ def tile_broadcast_value_func(arg_types, arg_values): elif t.N == n: stride_n = t.strides[1] else: - raise RuntimeError(f"Broadcast dimension must be 1 or match destination, shape(src) = {t.m, t.n}, shape(dest) = {m, n}") + raise RuntimeError( + f"Broadcast dimension must be 1 or match destination, shape(src) = {t.m, t.n}, shape(dest) = {m, n}" + ) # try to broadcast first dimension if t.M == 1: @@ -2372,7 +2386,9 @@ def tile_broadcast_value_func(arg_types, arg_values): elif t.M == m: stride_m = t.strides[0] else: - raise RuntimeError(f"Broadcast dimension must be 1 or match destination, shape(src) = {t.m, t.n}, shape(dest) = {m, n}") + raise RuntimeError( + f"Broadcast dimension must be 1 or match destination, shape(src) = {t.m, t.n}, shape(dest) = {m, n}" + ) # force the input tile to shared memory t.storage = "shared" @@ -2382,8 +2398,8 @@ def tile_broadcast_value_func(arg_types, arg_values): return tile_type -def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): +def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): tile = arg_values["a"] template_args = [] @@ -2412,8 +2428,6 @@ def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any ) - - def tile_matmul_value_func(arg_types, arg_values): # return generic type (for doc builds) if arg_types is None: @@ -4579,7 +4593,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""", + doc="""Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4588,7 +4604,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""", + doc="""Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4597,7 +4615,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""", + doc="""Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4606,7 +4626,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""", + doc="""Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4616,7 +4638,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""", + doc="""Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4625,7 +4649,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""", + doc="""Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4634,7 +4660,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""", + doc="""Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4643,7 +4671,9 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, hidden=hidden, input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, value_func=atomic_op_value_func, - doc="""Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""", + doc="""Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -5504,7 +5534,7 @@ def tile_matmul_generic_value_func(arg_types, arg_values): raise RuntimeError("tile_matmul() argument 1 must be an tile") # out = wp.tile_matmul(a, b) - if len(arg_types) == 2: + if len(arg_types) == 2: return Tile(dtype=a.dtype, M=a.M, N=b.N, storage="shared") # wp.tile_matmul(a, b, out) @@ -5673,7 +5703,11 @@ def tile_flip_layout(layout): add_builtin( "tile_matmul", - input_types={"a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any), "out": Tile(dtype=Any, M=Any, N=Any)}, + input_types={ + "a": Tile(dtype=Any, M=Any, N=Any), + "b": Tile(dtype=Any, M=Any, N=Any), + "out": Tile(dtype=Any, M=Any, N=Any), + }, value_func=tile_matmul_generic_value_func, lto_dispatch_func=tile_matmul_generic_lto_dispatch_func, variadic=False, @@ -5864,3 +5898,36 @@ def tile_fft_generic_lto_dispatch_func( export=False, namespace="", ) + +# --------------------------------- +# Code Generation + +add_builtin( + "static", + input_types={"expr": Any}, + value_type=Any, + doc="""Evaluates a static Python expression and replaces it with its result. + + See the `codegen.html#static-expressions
`_ for more details. + + Note: + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions + (excluding Warp arrays since they cannot be created in a Warp kernel at the moment).""", + group="Code Generation", +) + + +def static(expr): + """ + Evaluates a static expression and replaces the expression with its result. + + Args: + expr: A Python expression to evaluate. Must return a non-null value which must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). + + Note: + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + """ + return expr diff --git a/warp/codegen.py b/warp/codegen.py index f347c2fc..50288e05 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -237,8 +237,11 @@ def __init__(self, cls: Struct, ctype): def __getattribute__(self, name): cls = super().__getattribute__("_cls") - if name in cls.vars: - var = cls.vars[name] + if name == "native_name": + return cls.native_name + + var = cls.vars.get(name) + if var is not None: if isinstance(var.type, type) and issubclass(var.type, ctypes.Array): # Each field stored in a `StructInstance` is exposed as # a standard Python attribute but also has a `ctypes` @@ -413,6 +416,9 @@ def __init__(self, cls, key, module): elif issubclass(var.type, ctypes.Array): fields.append((label, var.type)) else: + # HACK: fp16 requires conversion functions from warp.so + if var.type is warp.float16: + warp.init() fields.append((label, var.type._type_)) class StructType(ctypes.Structure): @@ -490,6 +496,10 @@ class NewStructInstance(self.cls, StructInstance): def __init__(inst): StructInstance.__init__(inst, self, None) + # make sure warp.types.get_type_code works with this StructInstance + NewStructInstance.cls = self.cls + NewStructInstance.native_name = self.native_name + return NewStructInstance() def initializer(self): @@ -635,6 +645,9 @@ def type_to_ctype(t, value_type=False): return t.ctype() elif isinstance(t, Struct): return t.native_name + elif isinstance(t, type) and issubclass(t, StructInstance): + # ensure the actual Struct name is used instead of "NewStructInstance" + return t.native_name elif is_reference(t): if not value_type: return Var.type_to_ctype(t.value_type) + "*" @@ -890,6 +903,12 @@ def __init__( # this is to avoid registering false references to overshadowed modules adj.symbols[name] = arg + # try to replace static expressions by their constant result if the + # expression can be evaluated at declaration time + adj.static_expressions: Dict[str, Any] = {} + if "static" in adj.source: + adj.replace_static_expressions() + # There are cases where a same module might be rebuilt multiple times, # for example when kernels are nested inside of functions, or when # a kernel's launch raises an exception. 
Ideally we'd always want to @@ -929,6 +948,7 @@ def build(adj, builder, default_builder_options=None): adj.return_var = None # return type for function or kernel adj.loop_symbols = [] # symbols at the start of each loop + adj.loop_const_iter_symbols = set() # iteration variables (constant) for static loops # blocks adj.blocks = [Block()] @@ -1268,7 +1288,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): # immediately allocate output variables so we can pass them into the dispatch method if return_type is None: - # void function + # void function output = None output_list = [] elif not isinstance(return_type, Sequence) or len(return_type) == 1: @@ -1282,7 +1302,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): output = [adj.add_var(v) for v in return_type] output_list = output - # If we have a built-in that requires special handling to dispatch # the arguments to the underlying C++ function, then we can resolve # these using the `dispatch_func`. Since this is only called from @@ -1575,6 +1594,16 @@ def emit_If(adj, node): # eval condition cond = adj.eval(node.test) + if cond.constant is not None: + # resolve constant condition + if cond.constant: + for stmt in node.body: + adj.eval(stmt) + else: + for stmt in node.orelse: + adj.eval(stmt) + return None + # save symbol map symbols_prev = adj.symbols.copy() @@ -1670,7 +1699,7 @@ def emit_Name(adj, node): if isinstance(obj, types.ModuleType): return obj - raise RuntimeError("Cannot reference a global variable from a kernel unless `wp.constant()` is being used") + raise TypeError(f"Invalid external reference type: {type(obj)}") @staticmethod def resolve_type_attribute(var_type: type, attr: str): @@ -1784,7 +1813,7 @@ def emit_Ellipsis(adj, node): def emit_NameConstant(adj, node): if node.value: - return adj.add_constant(True) + return adj.add_constant(node.value) elif node.value is None: raise WarpCodegenTypeError("None type unsupported") else: @@ -1798,7 +1827,7 @@ def emit_Constant(adj, node): elif isinstance(node, ast.Ellipsis): return adj.emit_Ellipsis(node) else: - assert isinstance(node, ast.NameConstant) + assert isinstance(node, ast.NameConstant) or isinstance(node, ast.Constant) return adj.emit_NameConstant(node) def emit_BinOp(adj, node): @@ -1839,6 +1868,11 @@ def materialize_redefinitions(adj, symbols): # detect symbols with conflicting definitions (assigned inside the for loop) for items in symbols.items(): sym = items[0] + if adj.loop_const_iter_symbols is not None and sym in adj.loop_const_iter_symbols: + # ignore constant overwriting in for-loops if it is a loop iterator + # (it is no problem to unroll static loops multiple times in sequence) + continue + var1 = items[1] var2 = adj.symbols[sym] @@ -1985,15 +2019,27 @@ def get_unroll_range(adj, loop): ) return range_call + def begin_record_constant_iter_symbols(adj): + if adj.loop_const_iter_symbols is None: + adj.loop_const_iter_symbols = set() + + def end_record_constant_iter_symbols(adj): + adj.loop_const_iter_symbols = None + def emit_For(adj, node): # try and unroll simple range() statements that use constant args unroll_range = adj.get_unroll_range(node) if isinstance(unroll_range, range): + const_iter_sym = node.target.id + if adj.loop_const_iter_symbols is not None: + # prevent constant conflicts in `materialize_redefinitions()` + adj.loop_const_iter_symbols.add(const_iter_sym) + + # unroll static for-loop for i in unroll_range: const_iter = adj.add_constant(i) - var_iter = adj.add_builtin_call("int", [const_iter]) - 
adj.symbols[node.target.id] = var_iter + adj.symbols[const_iter_sym] = const_iter # eval body for s in node.body: @@ -2009,6 +2055,7 @@ def emit_For(adj, node): iter = adj.eval(node.iter) adj.symbols[node.target.id] = adj.begin_for(iter) + adj.begin_record_constant_iter_symbols() # for loops should be side-effect free, here we store a copy adj.loop_symbols.append(adj.symbols.copy()) @@ -2019,6 +2066,7 @@ def emit_For(adj, node): adj.materialize_redefinitions(adj.loop_symbols[-1]) adj.loop_symbols.pop() + adj.end_record_constant_iter_symbols() adj.end_for(iter) @@ -2075,13 +2123,28 @@ def emit_Call(adj, node): # try and lookup function in globals by # resolving path (e.g.: module.submodule.attr) - func, path = adj.resolve_static_expression(node.func) + if hasattr(node.func, "warp_func"): + func = node.func.warp_func + path = [] + else: + func, path = adj.resolve_static_expression(node.func) if func is None: func = adj.eval(node.func) + if adj.is_static_expression(func): + # try to evaluate wp.static() expressions + obj, _ = adj.evaluate_static_expression(node) + if obj is not None: + if isinstance(obj, warp.context.Function): + # special handling for wp.static() evaluating to a function + return obj + else: + out = adj.add_constant(obj) + return out + type_args = {} - if not isinstance(func, warp.context.Function): + if len(path) > 0 and not isinstance(func, warp.context.Function): attr = path[-1] caller = func func = None @@ -2610,6 +2673,190 @@ def resolve_path(adj, path): return expr + # retrieves a dictionary of all closure and global variables and their values + # to be used in the evaluation context of wp.static() expressions + def get_static_evaluation_context(adj): + closure_vars = dict( + zip( + adj.func.__code__.co_freevars, + [c.cell_contents for c in (adj.func.__closure__ or [])], + ) + ) + + vars_dict = {} + vars_dict.update(adj.func.__globals__) + # variables captured in closure have precedence over global vars + vars_dict.update(closure_vars) + + return vars_dict + + def is_static_expression(adj, func): + return ( + isinstance(func, types.FunctionType) + and func.__module__ == "warp.builtins" + and func.__qualname__ == "static" + ) + + # verify the return type of a wp.static() expression is supported inside a Warp kernel + def verify_static_return_value(adj, value): + if value is None: + raise ValueError("None is returned") + if warp.types.is_value(value): + return True + if warp.types.is_array(value): + # more useful explanation for the common case of creating a Warp array + raise ValueError("a Warp array cannot be created inside Warp kernels") + if isinstance(value, str): + # we want to support cases such as `print(wp.static("test"))` + return True + if isinstance(value, warp.context.Function): + return True + + def verify_struct(s: StructInstance, attr_path: List[str]): + for key in s._cls.vars.keys(): + v = getattr(s, key) + if issubclass(type(v), StructInstance): + verify_struct(v, attr_path + [key]) + else: + try: + adj.verify_static_return_value(v) + except ValueError as e: + raise ValueError( + f"the returned Warp struct contains a data type that cannot be constructed inside Warp kernels: {e} at {value._cls.key}.{'.'.join(attr_path)}" + ) from e + + if issubclass(type(value), StructInstance): + return verify_struct(value, []) + + raise ValueError(f"value of type {type(value)} cannot be constructed inside Warp kernels") + + # find the source code string of an AST node + def extract_node_source(adj, node) -> Optional[str]: + if not hasattr(node, "lineno") or not 
hasattr(node, "col_offset"): + return None + + start_line = node.lineno - 1 # line numbers start at 1 + start_col = node.col_offset + + if hasattr(node, "end_lineno") and hasattr(node, "end_col_offset"): + end_line = node.end_lineno - 1 + end_col = node.end_col_offset + else: + # fallback for Python versions before 3.8 + # we have to find the end line and column manually + end_line = start_line + end_col = start_col + parenthesis_count = 1 + for lineno in range(start_line, len(adj.source_lines)): + if lineno == start_line: + c_start = start_col + else: + c_start = 0 + line = adj.source_lines[lineno] + for i in range(c_start, len(line)): + c = line[i] + if c == "(": + parenthesis_count += 1 + elif c == ")": + parenthesis_count -= 1 + if parenthesis_count == 0: + end_col = i + end_line = lineno + break + if parenthesis_count == 0: + break + + if start_line == end_line: + # single-line expression + return adj.source_lines[start_line][start_col:end_col] + else: + # multi-line expression + lines = [] + # first line (from start_col to the end) + lines.append(adj.source_lines[start_line][start_col:]) + # middle lines (entire lines) + lines.extend(adj.source_lines[start_line + 1 : end_line]) + # last line (from the start to end_col) + lines.append(adj.source_lines[end_line][:end_col]) + return "\n".join(lines).strip() + + # handles a wp.static() expression and returns the resulting object and a string representing the code + # of the static expression + def evaluate_static_expression(adj, node) -> Tuple[Any, str]: + if len(node.args) == 1: + static_code = adj.extract_node_source(node.args[0]) + elif len(node.keywords) == 1: + static_code = adj.extract_node_source(node.keywords[0]) + else: + raise WarpCodegenError("warp.static() requires a single argument or keyword") + if static_code is None: + raise WarpCodegenError("Error extracting source code from wp.static() expression") + + vars_dict = adj.get_static_evaluation_context() + # add constant variables to the static call context + constant_vars = {k: v.constant for k, v in adj.symbols.items() if isinstance(v, Var) and v.constant is not None} + vars_dict.update(constant_vars) + + try: + value = eval(static_code, vars_dict) + if warp.config.verbose: + print(f"Evaluated static command: {static_code} = {value}") + except NameError as e: + raise WarpCodegenError( + f"Error evaluating static expression: {e}. Make sure all variables used in the static expression are constant." 
+ ) from e + except Exception as e: + raise WarpCodegenError( + f"Error evaluating static expression: {e} while evaluating the following code generated from the static expression:\n{static_code}" + ) from e + + try: + adj.verify_static_return_value(value) + except ValueError as e: + raise WarpCodegenError( + f"Static expression returns an unsupported value: {e} while evaluating the following code generated from the static expression:\n{static_code}" + ) from e + + return value, static_code + + # try to replace wp.static() expressions by their evaluated value if the + # expression can be evaluated + def replace_static_expressions(adj): + class StaticExpressionReplacer(ast.NodeTransformer): + def visit_Call(self, node): + func, _ = adj.resolve_static_expression(node.func, eval_types=False) + if adj.is_static_expression(func): + try: + # the static expression will execute as long as the static expression is valid and + # only depends on global or captured variables + obj, code = adj.evaluate_static_expression(node) + if code is not None: + adj.static_expressions[code] = obj + if isinstance(obj, warp.context.Function): + name_node = ast.Name("__warp_func__") + # we add a pointer to the Warp function here so that we can refer to it later at + # codegen time (note that the function key itself is not sufficient to uniquely + # identify the function, as the function may be redefined between the current time + # of wp.static() declaration and the time of codegen during module building) + name_node.warp_func = obj + return ast.copy_location(name_node, node) + else: + return ast.copy_location(ast.Constant(value=obj), node) + except Exception: + # Ignoring failing static expressions should generally not be an issue because only + # one of these cases should be possible: + # 1) the static expression itself is invalid code, in which case the module cannot be + # built all, + # 2) the static expression contains a reference to a local (even if constant) variable + # (and is therefore not executable and raises this exception), in which + # case changing the constant, or the code affecting this constant, would lead to + # a different module hash anyway. + pass + + return self.generic_visit(node) + + adj.tree = StaticExpressionReplacer().visit(adj.tree) + # Evaluates a static expression that does not depend on runtime values # if eval_types is True, try resolving the path using evaluated type information as well def resolve_static_expression(adj, root_node, eval_types=True): @@ -2684,7 +2931,7 @@ def get_node_source(adj, node): # return the Python code corresponding to the given AST node return ast.get_source_segment(adj.source, node) - def get_references(adj) -> Dict[str, Any]: + def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.context.Function, Any]]: """Traverses ``adj.tree`` and returns referenced constants, types, and user-defined functions.""" local_variables = set() # Track local variables appearing on the LHS so we know when variables are shadowed @@ -2976,6 +3223,15 @@ def scalar_value(x): # make sure we emit the value of objects, e.g. 
uint32 return str(value.value) + elif issubclass(value_type, warp.codegen.StructInstance): + # constant struct instance + arg_strs = [] + for key, var in value._cls.vars.items(): + attr = getattr(value, key) + arg_strs.append(f"{Var.type_to_ctype(var.type)}({constant_str(attr)})") + arg_str = ", ".join(arg_strs) + return f"{value.native_name}({arg_str})" + elif value == math.inf: return "INFINITY" diff --git a/warp/config.py b/warp/config.py index e732f71b..49df51ea 100644 --- a/warp/config.py +++ b/warp/config.py @@ -7,7 +7,7 @@ from typing import Optional -version: str = "1.3.3" +version: str = "1.4.0" """Warp version string""" verify_fp: bool = False diff --git a/warp/context.py b/warp/context.py index a66577fd..281a6009 100644 --- a/warp/context.py +++ b/warp/context.py @@ -1450,9 +1450,9 @@ def hash_function(self, func): # custom bits if ovl.custom_grad_func: - ch.update(bytes(ovl.custom_grad_func.adj.source, "utf-8")) + ch.update(self.hash_adjoint(ovl.custom_grad_func.adj)) if ovl.custom_replay_func: - ch.update(bytes(ovl.custom_replay_func.adj.source, "utf-8")) + ch.update(self.hash_adjoint(ovl.custom_replay_func.adj)) if ovl.replay_snippet: ch.update(bytes(ovl.replay_snippet, "utf-8")) if ovl.native_snippet: @@ -1516,6 +1516,10 @@ def hash_adjoint(self, adj): else: raise RuntimeError(f"Invalid constant type: {type(value)}") + # hash wp.static() expressions that were evaluated at declaration time + for k, v in adj.static_expressions.items(): + ch.update(bytes(f"{k} = {v}", "utf-8")) + # hash referenced types for t in types.keys(): ch.update(bytes(warp.types.get_type_code(t), "utf-8")) @@ -1541,8 +1545,8 @@ def __init__(self, module, options, hasher=None): self.options = options self.module = module self.deferred_functions = [] - self.ltoirs = {} # map from lto symbol to lto binary - self.ltoirs_decl = {} # map from lto symbol to lto forward declaration + self.ltoirs = {} # map from lto symbol to lto binary + self.ltoirs_decl = {} # map from lto symbol to lto forward declaration if hasher is None: hasher = ModuleHasher(module) @@ -1617,7 +1621,7 @@ def codegen(self, device): source += 'extern "C" {\n' for fwd in self.ltoirs_decl.values(): source += fwd + "\n" - source += '}\n' + source += "}\n" # code-gen structs visited_structs = set() diff --git a/warp/dlpack.py b/warp/dlpack.py index 34de4264..20860c6e 100644 --- a/warp/dlpack.py +++ b/warp/dlpack.py @@ -124,6 +124,8 @@ def device_to_dlpack(wp_device) -> DLDevice: def dtype_to_dlpack(wp_dtype) -> DLDataType: + if wp_dtype == warp.bool: + return (DLDataTypeCode.kDLBool, 8, 1) if wp_dtype == warp.int8: return (DLDataTypeCode.kDLInt, 8, 1) elif wp_dtype == warp.uint8: diff --git a/warp/examples/benchmarks/benchmark.bat b/warp/examples/benchmarks/benchmark.bat index 9edec17d..66a5dab3 100644 --- a/warp/examples/benchmarks/benchmark.bat +++ b/warp/examples/benchmarks/benchmark.bat @@ -11,3 +11,5 @@ python benchmark_cloth.py numpy @REM python benchmark_cloth.py numba @REM python benchmark_cloth.py jax_cpu @REM python benchmark_cloth.py jax_gpu +@REM python benchmark_cloth.py paddle_cpu +@REM python benchmark_cloth.py paddle_gpu diff --git a/warp/examples/benchmarks/benchmark.sh b/warp/examples/benchmarks/benchmark.sh index f82289a6..a4d54386 100755 --- a/warp/examples/benchmarks/benchmark.sh +++ b/warp/examples/benchmarks/benchmark.sh @@ -11,3 +11,5 @@ python3 benchmark_cloth.py numpy # python3 benchmark_cloth.py jax_cpu # python3 benchmark_cloth.py jax_gpu # python3 benchmark_cloth.py numba +# python3 benchmark_cloth.py paddle_cpu +# 
python3 benchmark_cloth.py paddle_gpu diff --git a/warp/examples/benchmarks/benchmark_cloth.py b/warp/examples/benchmarks/benchmark_cloth.py index d28213da..3fc6a740 100644 --- a/warp/examples/benchmarks/benchmark_cloth.py +++ b/warp/examples/benchmarks/benchmark_cloth.py @@ -219,6 +219,16 @@ def run_benchmark(mode, dim, timers, render=False): integrator = benchmark_cloth_jax.JxIntegrator(cloth) + elif mode == "paddle_cpu": + import benchmark_cloth_paddle + + integrator = benchmark_cloth_paddle.TrIntegrator(cloth, "cpu") + + elif mode == "paddle_gpu": + import benchmark_cloth_paddle + + integrator = benchmark_cloth_paddle.TrIntegrator(cloth, "gpu") + else: raise RuntimeError("Unknown simulation backend") diff --git a/warp/examples/sim/example_cloth.py b/warp/examples/sim/example_cloth.py index 13ea6860..1a93763d 100644 --- a/warp/examples/sim/example_cloth.py +++ b/warp/examples/sim/example_cloth.py @@ -26,9 +26,33 @@ import warp.sim.render +def color_lattice_grid(num_x, num_y): + colors = [] + for _i in range(4): + colors.append([]) + + for xi in range(num_x + 1): + for yi in range(num_y + 1): + vId = xi * (num_y + 1) + yi + + a = 1 if xi % 2 else 0 + b = 1 if yi % 2 else 0 + + c = a * 2 + b + + colors[c].append(vId) + + colors_wp = [] + for i_color in range(len(colors)): + colors_wp.append(wp.array(colors[i_color], dtype=wp.int32)) + + return colors_wp + + class IntegratorType(Enum): EULER = "euler" XPBD = "xpbd" + VBD = "vbd" def __str__(self): return self.value @@ -67,7 +91,7 @@ def __init__( tri_ka=1.0e3, tri_kd=1.0e1, ) - else: + elif self.integrator_type == IntegratorType.XPBD: builder.add_cloth_grid( pos=wp.vec3(0.0, 4.0, 0.0), rot=wp.quat_from_axis_angle(wp.vec3(1.0, 0.0, 0.0), math.pi * 0.5), @@ -83,6 +107,22 @@ def __init__( spring_ke=1.0e3, spring_kd=0.0, ) + else: + # VBD + builder.add_cloth_grid( + pos=wp.vec3(0.0, 4.0, 0.0), + rot=wp.quat_from_axis_angle(wp.vec3(1.0, 0.0, 0.0), math.pi * 0.5), + vel=wp.vec3(0.0, 0.0, 0.0), + dim_x=self.sim_width, + dim_y=self.sim_height, + cell_x=0.1, + cell_y=0.1, + mass=0.1, + fix_left=True, + tri_ke=1e4, + tri_ka=1e4, + tri_kd=1e-5, + ) usd_stage = Usd.Stage.Open(os.path.join(warp.examples.get_asset_directory(), "bunny.usd")) usd_geom = UsdGeom.Mesh(usd_stage.GetPrimAtPath("/root/bunny")) @@ -103,16 +143,20 @@ def __init__( kf=1.0e1, ) - if self.integrator_type == IntegratorType.EULER: - self.integrator = wp.sim.SemiImplicitIntegrator() - else: - self.integrator = wp.sim.XPBDIntegrator(iterations=1) - self.model = builder.finalize() self.model.ground = True self.model.soft_contact_ke = 1.0e4 self.model.soft_contact_kd = 1.0e2 + if self.integrator_type == IntegratorType.EULER: + self.integrator = wp.sim.SemiImplicitIntegrator() + elif self.integrator_type == IntegratorType.XPBD: + self.integrator = wp.sim.XPBDIntegrator(iterations=1) + else: + self.integrator = wp.sim.VBDIntegrator(self.model, iterations=1) + # we need to give VBD coloring information + self.model.particle_coloring = color_lattice_grid(width, height) + self.state_0 = self.model.state() self.state_1 = self.model.state() diff --git a/warp/native/mat.h b/warp/native/mat.h index 0327a07a..ee084d60 100644 --- a/warp/native/mat.h +++ b/warp/native/mat.h @@ -210,6 +210,12 @@ inline CUDA_CALLABLE mat_t identity() return m; } +template +inline CUDA_CALLABLE void adj_identity(const mat_t& adj_ret) +{ + // nop +} + template inline CUDA_CALLABLE bool operator==(const mat_t& a, const mat_t& b) { diff --git a/warp/native/quat.h b/warp/native/quat.h index 3896c029..90f9c556 100644 --- 
a/warp/native/quat.h +++ b/warp/native/quat.h @@ -29,6 +29,14 @@ struct quat_t w = static_cast(other.w); } + inline CUDA_CALLABLE quat_t(const initializer_array<4, Type> &l) + { + x = l[0]; + y = l[1]; + z = l[2]; + w = l[3]; + } + // imaginary part Type x; Type y; diff --git a/warp/native/spatial.h b/warp/native/spatial.h index 6e0d27da..48261536 100644 --- a/warp/native/spatial.h +++ b/warp/native/spatial.h @@ -127,6 +127,12 @@ struct transform_t CUDA_CALLABLE inline transform_t(vec_t<3,Type> p=vec_t<3,Type>(), quat_t q=quat_t()) : p(p), q(q) {} CUDA_CALLABLE inline transform_t(Type) {} // helps uniform initialization + CUDA_CALLABLE inline transform_t(const initializer_array<7, Type> &l) + { + p = vec_t<3,Type>(l[0], l[1], l[2]); + q = quat_t(l[3], l[4], l[5], l[6]); + } + CUDA_CALLABLE inline Type operator[](int index) const { assert(index < 7); diff --git a/warp/paddle.py b/warp/paddle.py new file mode 100644 index 00000000..65dcf17f --- /dev/null +++ b/warp/paddle.py @@ -0,0 +1,382 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from __future__ import annotations + +import ctypes +from typing import TYPE_CHECKING, Optional, Union + +import numpy + +import warp +import warp.context + +if TYPE_CHECKING: + import paddle + + +# return the warp device corresponding to a paddle device +def device_from_paddle(paddle_device: Union[paddle.base.libpaddle.Place, str]) -> warp.context.Device: + """Return the Warp device corresponding to a Paddle device. + + Args: + paddle_device (`paddle.base.libpaddle.Place` or `str`): Paddle device identifier + + Raises: + RuntimeError: Paddle device does not have a corresponding Warp device + """ + if type(paddle_device) is str: + warp_device = warp.context.runtime.device_map.get(paddle_device) + if warp_device is not None: + return warp_device + elif paddle_device.startswith("gpu"): + return warp.context.runtime.get_current_cuda_device() + else: + raise RuntimeError(f"Unsupported Paddle device {paddle_device}") + else: + import paddle + + try: + if paddle_device.is_gpu_place(): + return warp.context.runtime.cuda_devices[paddle_device.gpu_device_id()] + elif paddle_device.is_cpu_place(): + return warp.context.runtime.cpu_device + else: + raise RuntimeError(f"Unsupported Paddle device type {paddle_device}") + except Exception as e: + import paddle + + if not isinstance(paddle_device, paddle.base.libpaddle.Place): + raise ValueError("Argument must be a paddle.base.libpaddle.Place object or a string") from e + raise + + +def device_to_paddle(warp_device: warp.context.Devicelike) -> str: + """Return the Paddle device string corresponding to a Warp device. + + Args: + warp_device: An identifier that can be resolved to a :class:`warp.context.Device`. + + Raises: + RuntimeError: The Warp device is not compatible with PyPaddle. 
+ """ + device = warp.get_device(warp_device) + if device.is_cpu or device.is_primary: + return str(device).replace("cuda", "gpu") + elif device.is_cuda and device.is_uva: + # it's not a primary context, but paddle can access the data ptr directly thanks to UVA + return f"gpu:{device.ordinal}" + raise RuntimeError(f"Warp device {device} is not compatible with paddle") + + +def dtype_to_paddle(warp_dtype): + """Return the Paddle dtype corresponding to a Warp dtype. + + Args: + warp_dtype: A Warp data type that has a corresponding ``paddle.dtype``. + ``warp.uint16``, ``warp.uint32``, and ``warp.uint64`` are mapped + to the signed integer ``paddle.dtype`` of the same width. + Raises: + TypeError: Unable to find a corresponding PyPaddle data type. + """ + # initialize lookup table on first call to defer paddle import + if dtype_to_paddle.type_map is None: + import paddle + + dtype_to_paddle.type_map = { + warp.float16: paddle.float16, + warp.float32: paddle.float32, + warp.float64: paddle.float64, + warp.int8: paddle.int8, + warp.int16: paddle.int16, + warp.int32: paddle.int32, + warp.int64: paddle.int64, + warp.uint8: paddle.uint8, + warp.bool: paddle.bool, + # paddle doesn't support unsigned ints bigger than 8 bits + warp.uint16: paddle.int16, + warp.uint32: paddle.int32, + warp.uint64: paddle.int64, + } + + paddle_dtype = dtype_to_paddle.type_map.get(warp_dtype) + if paddle_dtype is not None: + return paddle_dtype + else: + raise TypeError(f"Cannot convert {warp_dtype} to a Paddle type") + + +def dtype_from_paddle(paddle_dtype): + """Return the Warp dtype corresponding to a Paddle dtype. + + Args: + paddle_dtype: A ``paddle.dtype`` that has a corresponding Warp data type. + Currently ``paddle.bfloat16``, ``paddle.complex64``, and + ``paddle.complex128`` are not supported. + + Raises: + TypeError: Unable to find a corresponding Warp data type. 
+ """ + # initialize lookup table on first call to defer paddle import + if dtype_from_paddle.type_map is None: + import paddle + + dtype_from_paddle.type_map = { + paddle.float16: warp.float16, + paddle.float32: warp.float32, + paddle.float64: warp.float64, + paddle.int8: warp.int8, + paddle.int16: warp.int16, + paddle.int32: warp.int32, + paddle.int64: warp.int64, + paddle.uint8: warp.uint8, + paddle.bool: warp.bool, + # currently unsupported by Warp + # paddle.bfloat16: + # paddle.complex64: + # paddle.complex128: + } + + warp_dtype = dtype_from_paddle.type_map.get(paddle_dtype) + + if warp_dtype is not None: + return warp_dtype + else: + raise TypeError(f"Cannot convert {paddle_dtype} to a Warp type") + + +def dtype_is_compatible(paddle_dtype: paddle.dtype, warp_dtype) -> bool: + """Evaluates whether the given paddle dtype is compatible with the given Warp dtype.""" + # initialize lookup table on first call to defer paddle import + if dtype_is_compatible.compatible_sets is None: + import paddle + + dtype_is_compatible.compatible_sets = { + paddle.float64: {warp.float64}, + paddle.float32: {warp.float32}, + paddle.float16: {warp.float16}, + # allow aliasing integer tensors as signed or unsigned integer arrays + paddle.int64: {warp.int64, warp.uint64}, + paddle.int32: {warp.int32, warp.uint32}, + paddle.int16: {warp.int16, warp.uint16}, + paddle.int8: {warp.int8, warp.uint8}, + paddle.uint8: {warp.uint8, warp.int8}, + paddle.bool: {warp.bool, warp.uint8, warp.int8}, + # currently unsupported by Warp + # paddle.bfloat16: + # paddle.complex64: + # paddle.complex128: + } + + compatible_set = dtype_is_compatible.compatible_sets.get(paddle_dtype) + + if compatible_set is not None: + if warp_dtype in compatible_set: + return True + # check if it's a vector or matrix type + if hasattr(warp_dtype, "_wp_scalar_type_"): + return warp_dtype._wp_scalar_type_ in compatible_set + + return False + + +# lookup tables initialized when needed +dtype_from_paddle.type_map = None +dtype_to_paddle.type_map = None +dtype_is_compatible.compatible_sets = None + + +# wrap a paddle tensor to a wp array, data is not copied +def from_paddle( + t: paddle.Tensor, + dtype: Optional[paddle.dtype] = None, + requires_grad: Optional[bool] = None, + grad: Optional[paddle.Tensor] = None, + return_ctype: bool = False, +) -> warp.array: + """Convert a Paddle tensor to a Warp array without copying the data. + + Args: + t (paddle.Tensor): The paddle tensor to wrap. + dtype (warp.dtype, optional): The target data type of the resulting Warp array. Defaults to the tensor value type mapped to a Warp array value type. + requires_grad (bool, optional): Whether the resulting array should wrap the tensor's gradient, if it exists (the grad tensor will be allocated otherwise). Defaults to the tensor's `requires_grad` value. + grad (paddle.Tensor, optional): The grad attached to given tensor. Defaults to None. + return_ctype (bool, optional): Whether to return a low-level array descriptor instead of a ``wp.array`` object (faster). The descriptor can be passed to Warp kernels. + + Returns: + warp.array: The wrapped array or array descriptor. 
+ """ + if dtype is None: + dtype = dtype_from_paddle(t.dtype) + elif not dtype_is_compatible(t.dtype, dtype): + raise RuntimeError(f"Cannot convert Paddle type {t.dtype} to Warp type {dtype}") + + # get size of underlying data type to compute strides + ctype_size = ctypes.sizeof(dtype._type_) + + shape = tuple(t.shape) + strides = tuple(s * ctype_size for s in t.strides) + + # if target is a vector or matrix type + # then check if trailing dimensions match + # the target type and update the shape + if hasattr(dtype, "_shape_"): + dtype_shape = dtype._shape_ + dtype_dims = len(dtype._shape_) + # ensure inner shape matches + if dtype_dims > len(shape) or dtype_shape != shape[-dtype_dims:]: + raise RuntimeError( + f"Could not convert Paddle tensor with shape {shape} to Warp array with dtype={dtype}, ensure that source inner shape is {dtype_shape}" + ) + # ensure inner strides are contiguous + if strides[-1] != ctype_size or (dtype_dims > 1 and strides[-2] != ctype_size * dtype_shape[-1]): + raise RuntimeError( + f"Could not convert Paddle tensor with shape {shape} to Warp array with dtype={dtype}, because the source inner strides are not contiguous" + ) + # trim shape and strides + shape = tuple(shape[:-dtype_dims]) or (1,) + strides = tuple(strides[:-dtype_dims]) or (ctype_size,) + + # gradient + # - if return_ctype is False, we set `grad` to a wp.array or None + # - if return_ctype is True, we set `grad_ptr` and set `grad` as the owner (wp.array or paddle.Tensor) + requires_grad = (not t.stop_gradient) if requires_grad is None else requires_grad + grad_ptr = 0 + if grad is not None: + if isinstance(grad, warp.array): + if return_ctype: + if grad.strides != strides: + raise RuntimeError( + f"Gradient strides must match array strides, expected {strides} but got {grad.strides}" + ) + grad_ptr = grad.ptr + else: + # assume grad is a paddle.Tensor + if return_ctype: + if t.strides != grad.strides: + raise RuntimeError( + f"Gradient strides must match array strides, expected {t.strides} but got {grad.strides}" + ) + grad_ptr = grad.data_ptr() + else: + grad = from_paddle(grad, dtype=dtype, requires_grad=False) + elif requires_grad: + # wrap the tensor gradient, allocate if necessary + if t.grad is not None: + if return_ctype: + grad = t.grad + if t.strides != grad.strides: + raise RuntimeError( + f"Gradient strides must match array strides, expected {t.strides} but got {grad.strides}" + ) + grad_ptr = grad.data_ptr() + else: + grad = from_paddle(t.grad, dtype=dtype, requires_grad=False) + else: + # allocate a zero-filled gradient if it doesn't exist + # Note: we use Warp to allocate the shared gradient with compatible strides + grad = warp.zeros(dtype=dtype, shape=shape, strides=strides, device=device_from_paddle(t.place)) + # use .grad_ for zero-copy + t.grad_ = to_paddle(grad, requires_grad=False) + grad_ptr = grad.ptr + + if return_ctype: + ptr = t.data_ptr() + + # create array descriptor + array_ctype = warp.types.array_t(ptr, grad_ptr, len(shape), shape, strides) + + # keep data and gradient alive + array_ctype._ref = t + array_ctype._gradref = grad + + return array_ctype + + else: + a = warp.array( + ptr=t.data_ptr(), + dtype=dtype, + shape=shape, + strides=strides, + device=device_from_paddle(t.place), + copy=False, + grad=grad, + requires_grad=requires_grad, + ) + + # save a reference to the source tensor, otherwise it may get deallocated + a._tensor = t + + return a + + +def to_paddle(a: warp.array, requires_grad: bool = None) -> paddle.Tensor: + """ + Convert a Warp array to a Paddle 
tensor without copying the data. + + Args: + a (warp.array): The Warp array to convert. + requires_grad (bool, optional): Whether the resulting tensor should convert the array's gradient, if it exists, to a grad tensor. Defaults to the array's `requires_grad` value. + + Returns: + paddle.Tensor: The converted tensor. + """ + import paddle + import paddle.utils.dlpack + + if requires_grad is None: + requires_grad = a.requires_grad + + # Paddle does not support structured arrays + if isinstance(a.dtype, warp.codegen.Struct): + raise RuntimeError("Cannot convert structured Warp arrays to Paddle.") + + if a.device.is_cpu: + # Paddle has an issue wrapping CPU objects + # that support the __array_interface__ protocol + # in this case we need to workaround by going + # to an ndarray first, see https://pearu.github.io/array_interface_pypaddle.html + t = paddle.to_tensor(numpy.asarray(a), place="cpu") + t.stop_gradient = not requires_grad + if requires_grad and a.requires_grad: + # use .grad_ for zero-copy + t.grad_ = paddle.to_tensor(numpy.asarray(a.grad), place="cpu") + return t + + elif a.device.is_cuda: + # Paddle does support the __cuda_array_interface__ + # correctly, but we must be sure to maintain a reference + # to the owning object to prevent memory allocs going out of scope + t = paddle.utils.dlpack.from_dlpack(warp.to_dlpack(a)).to(device=device_to_paddle(a.device)) + t.stop_gradient = not requires_grad + if requires_grad and a.requires_grad: + # use .grad_ for zero-copy + t.grad_ = paddle.utils.dlpack.from_dlpack(warp.to_dlpack(a.grad)).to(device=device_to_paddle(a.device)) + return t + + else: + raise RuntimeError("Unsupported device") + + +def stream_from_paddle(stream_or_device=None): + """Convert from a Paddle CUDA stream to a Warp CUDA stream.""" + import paddle + + if isinstance(stream_or_device, paddle.device.Stream): + stream = stream_or_device + else: + # assume arg is a paddle device + stream = paddle.device.current_stream(stream_or_device) + + device = device_from_paddle(stream.device) + + warp_stream = warp.Stream(device, cuda_stream=stream.stream_base.cuda_stream) + + # save a reference to the source stream, otherwise it may be destroyed + warp_stream._paddle_stream = stream + + return warp_stream diff --git a/warp/sim/integrator_euler.py b/warp/sim/integrator_euler.py index 40db2813..624e644a 100644 --- a/warp/sim/integrator_euler.py +++ b/warp/sim/integrator_euler.py @@ -761,6 +761,7 @@ def eval_particle_contacts( contact_body_vel: wp.array(dtype=wp.vec3), contact_normal: wp.array(dtype=wp.vec3), contact_max: int, + body_f_in_world_frame: bool, # outputs particle_f: wp.array(dtype=wp.vec3), body_f: wp.array(dtype=wp.spatial_vector), @@ -809,7 +810,11 @@ def eval_particle_contacts( body_v = wp.spatial_bottom(body_v_s) # compute the body velocity at the particle position - bv = body_v + wp.cross(body_w, r) + wp.transform_vector(X_wb, contact_body_vel[tid]) + bv = body_v + wp.transform_vector(X_wb, contact_body_vel[tid]) + if body_f_in_world_frame: + bv += wp.cross(body_w, bx) + else: + bv += wp.cross(body_w, r) # relative velocity v = pv - bv @@ -840,12 +845,14 @@ def eval_particle_contacts( ft = wp.normalize(vt) * wp.min(kf * wp.length(vt), abs(mu * c * ke)) f_total = fn + (fd + ft) - t_total = wp.cross(r, f_total) wp.atomic_sub(particle_f, particle_index, f_total) if body_index >= 0: - wp.atomic_add(body_f, body_index, wp.spatial_vector(t_total, f_total)) + if body_f_in_world_frame: + wp.atomic_sub(body_f, body_index, wp.spatial_vector(wp.cross(bx, f_total), f_total)) + 
else: + wp.atomic_add(body_f, body_index, wp.spatial_vector(wp.cross(r, f_total), f_total)) @@ -1814,7 +1821,9 @@ def eval_body_joint_forces(model: Model, state: State, control: Control, body_f: ) -def eval_particle_body_contact_forces(model: Model, state: State, particle_f: wp.array, body_f: wp.array): +def eval_particle_body_contact_forces( + model: Model, state: State, particle_f: wp.array, body_f: wp.array, body_f_in_world_frame: bool = False +): if model.particle_count and model.shape_count > 1: wp.launch( kernel=eval_particle_contacts, @@ -1841,6 +1850,7 @@ def eval_particle_body_contact_forces(model: Model, state: State, particle_f: wp model.soft_contact_body_vel, model.soft_contact_normal, model.soft_contact_max, + body_f_in_world_frame, ], # outputs outputs=[particle_f, body_f], @@ -1897,7 +1907,7 @@ def compute_forces(model: Model, state: State, control: Control, particle_f: wp. eval_body_contact_forces(model, state, particle_f) # particle shape contact - eval_particle_body_contact_forces(model, state, particle_f, body_f) + eval_particle_body_contact_forces(model, state, particle_f, body_f, body_f_in_world_frame=False) # muscles if False: diff --git a/warp/sim/integrator_featherstone.py b/warp/sim/integrator_featherstone.py index 2bc3e61f..6b240870 100644 --- a/warp/sim/integrator_featherstone.py +++ b/warp/sim/integrator_featherstone.py @@ -592,7 +592,7 @@ def jcalc_integrate( p_s = wp.vec3(joint_q[coord_start + 0], joint_q[coord_start + 1], joint_q[coord_start + 2]) # linear vel of origin (note q/qd switch order of linear angular elements) - # note we are converting the body twist in the space frame (w_s, v_s) to compute center of mass velcity + # note we are converting the body twist in the space frame (w_s, v_s) to compute center of mass velocity dpdt_s = v_s + wp.cross(w_s, p_s) # quat and quat derivative @@ -1621,7 +1621,7 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c eval_particle_ground_contact_forces(model, state_in, particle_f) # particle shape contact - eval_particle_body_contact_forces(model, state_in, particle_f, body_f) + eval_particle_body_contact_forces(model, state_in, particle_f, body_f, body_f_in_world_frame=True) # muscles if False: diff --git a/warp/sim/integrator_vbd.py b/warp/sim/integrator_vbd.py index 23ea7631..f6ced884 100644 --- a/warp/sim/integrator_vbd.py +++ b/warp/sim/integrator_vbd.py @@ -10,7 +10,7 @@ from ..types import float32, matrix from .integrator import Integrator -from .model import PARTICLE_FLAG_ACTIVE, Control, Model, State +from .model import PARTICLE_FLAG_ACTIVE, Control, Model, ModelShapeMaterials, State class mat66(matrix(shape=(6, 6), dtype=float32)): @@ -110,6 +110,39 @@ def _test_compute_force_element_adjacency( ) +@wp.func +def build_orthonormal_basis(n: wp.vec3): + """ + Builds an orthonormal basis given a normal vector `n`. Returns the two axes that are perpendicular to `n`. 
+ + :param n: A 3D vector (list or array-like) representing the normal vector + """ + b1 = wp.vec3() + b2 = wp.vec3() + if n[2] < 0.0: + a = 1.0 / (1.0 - n[2]) + b = n[0] * n[1] * a + b1[0] = 1.0 - n[0] * n[0] * a + b1[1] = -b + b1[2] = n[0] + + b2[0] = b + b2[1] = n[1] * n[1] * a - 1.0 + b2[2] = -n[1] + else: + a = 1.0 / (1.0 + n[2]) + b = -n[0] * n[1] * a + b1[0] = 1.0 - n[0] * n[0] * a + b1[1] = b + b1[2] = -n[0] + + b2[0] = b + b2[1] = 1.0 - n[1] * n[1] * a + b2[2] = -n[1] + + return b1, b2 + + @wp.func def calculate_triangle_deformation_gradient( face: int, tri_indices: wp.array(dtype=wp.int32, ndim=2), pos: wp.array(dtype=wp.vec3), tri_pose: wp.mat22 @@ -288,6 +321,157 @@ def evaluate_stvk_force_hessian( return f, h +@wp.func +def evaluate_ground_contact_force_hessian( + vertex_pos: wp.vec3, + vertex_prev_pos: wp.vec3, + particle_radius: float, + ground_normal: wp.vec3, + ground_level: float, + soft_contact_ke: float, + friction_mu: float, + friction_epsilon: float, + dt: float, +): + penetration_depth = -(wp.dot(ground_normal, vertex_pos) + ground_level - particle_radius) + + if penetration_depth > 0: + ground_contact_force_norm = penetration_depth * soft_contact_ke + ground_contact_force = ground_normal * ground_contact_force_norm + ground_contact_hessian = soft_contact_ke * wp.outer(ground_normal, ground_normal) + + dx = vertex_pos - vertex_prev_pos + + # friction + e0, e1 = build_orthonormal_basis(ground_normal) + + T = mat32(e0[0], e1[0], e0[1], e1[1], e0[2], e1[2]) + + relative_translation = dx + u = wp.transpose(T) * relative_translation + eps_u = friction_epsilon * dt + + friction_force, friction_hessian = compute_friction(friction_mu, ground_contact_force_norm, T, u, eps_u) + ground_contact_force = ground_contact_force + friction_force + ground_contact_hessian = ground_contact_hessian + friction_hessian + else: + ground_contact_force = wp.vec3(0.0, 0.0, 0.0) + ground_contact_hessian = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + + return ground_contact_force, ground_contact_hessian + + +@wp.func +def evaluate_body_particle_contact( + particle_index: int, + particle_pos: wp.vec3, + particle_prev_pos: wp.vec3, + contact_index: int, + soft_contact_ke: float, + friction_mu: float, + friction_epsilon: float, + particle_radius: wp.array(dtype=float), + shape_materials: ModelShapeMaterials, + shape_body: wp.array(dtype=int), + body_q: wp.array(dtype=wp.transform), + body_qd: wp.array(dtype=wp.spatial_vector), + body_com: wp.array(dtype=wp.vec3), + contact_shape: wp.array(dtype=int), + contact_body_pos: wp.array(dtype=wp.vec3), + contact_body_vel: wp.array(dtype=wp.vec3), + contact_normal: wp.array(dtype=wp.vec3), + dt: float, +): + shape_index = contact_shape[contact_index] + body_index = shape_body[shape_index] + + X_wb = wp.transform_identity() + X_com = wp.vec3() + if body_index >= 0: + X_wb = body_q[body_index] + X_com = body_com[body_index] + + # body position in world space + bx = wp.transform_point(X_wb, contact_body_pos[contact_index]) + r = bx - wp.transform_point(X_wb, X_com) + + n = contact_normal[contact_index] + + penetration_depth = -(wp.dot(n, particle_pos - bx) - particle_radius[particle_index]) + if penetration_depth > 0: + body_contact_force_norm = penetration_depth * soft_contact_ke + body_contact_force = n * body_contact_force_norm + body_contact_hessian = soft_contact_ke * wp.outer(n, n) + + mu = 0.5 * (friction_mu + shape_materials.mu[shape_index]) + + dx = particle_pos - particle_prev_pos + + # body velocity + body_v_s = wp.spatial_vector() + if 
body_index >= 0: + body_v_s = body_qd[body_index] + + body_w = wp.spatial_top(body_v_s) + body_v = wp.spatial_bottom(body_v_s) + + # compute the body velocity at the particle position + bv = body_v + wp.cross(body_w, r) + wp.transform_vector(X_wb, contact_body_vel[contact_index]) + + relative_translation = dx - bv * dt + + # friction + e0, e1 = build_orthonormal_basis(n) + + T = mat32(e0[0], e1[0], e0[1], e1[1], e0[2], e1[2]) + + u = wp.transpose(T) * relative_translation + eps_u = friction_epsilon * dt + + friction_force, friction_hessian = compute_friction(mu, body_contact_force_norm, T, u, eps_u) + body_contact_force = body_contact_force + friction_force + body_contact_hessian = body_contact_hessian + friction_hessian + else: + body_contact_force = wp.vec3(0.0, 0.0, 0.0) + body_contact_hessian = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + + return body_contact_force, body_contact_hessian + + +@wp.func +def compute_friction(mu: float, normal_contact_force: float, T: mat32, u: wp.vec2, eps_u: float): + """ + Returns the friction force and hessian. + Args: + mu: Friction coefficient. + normal_contact_force: normal contact force. + T: Transformation matrix (3x2 matrix). + u: 2D displacement vector. + """ + # Friction + u_norm = wp.length(u) + + if u_norm > 0.0: + # IPC friction + if u_norm > eps_u: + # constant stage + f1_SF_over_x = 1.0 / u_norm + else: + # smooth transition + f1_SF_over_x = (-u_norm / eps_u + 2.0) / eps_u + + force = -mu * normal_contact_force * T * (f1_SF_over_x * u) + + # Different from IPC, we treat the contact normal as constant + # this significantly improves the stability + hessian = mu * normal_contact_force * T * (f1_SF_over_x * wp.identity(2, float)) * wp.transpose(T) + else: + force = wp.vec3(0.0, 0.0, 0.0) + hessian = wp.mat33(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + + return force, hessian + + @wp.kernel def forward_step( dt: float, @@ -300,21 +484,21 @@ def forward_step( particle_flags: wp.array(dtype=wp.uint32), inertia: wp.array(dtype=wp.vec3), ): - vertex = wp.tid() + particle = wp.tid() - prev_pos[vertex] = pos[vertex] - if not particle_flags[vertex] & PARTICLE_FLAG_ACTIVE: - inertia[vertex] = prev_pos[vertex] + prev_pos[particle] = pos[particle] + if not particle_flags[particle] & PARTICLE_FLAG_ACTIVE: + inertia[particle] = prev_pos[particle] return - vel_new = vel[vertex] + (gravity + external_force[vertex] * inv_mass[vertex]) * dt - pos[vertex] = pos[vertex] + vel_new * dt - inertia[vertex] = pos[vertex] + vel_new = vel[particle] + (gravity + external_force[particle] * inv_mass[particle]) * dt + pos[particle] = pos[particle] + vel_new * dt + inertia[particle] = pos[particle] @wp.kernel def VBD_solve_trimesh( dt: float, - vertex_ids_in_color: wp.array(dtype=wp.int32), + particle_ids_in_color: wp.array(dtype=wp.int32), prev_pos: wp.array(dtype=wp.vec3), pos: wp.array(dtype=wp.vec3), pos_new: wp.array(dtype=wp.vec3), @@ -328,32 +512,57 @@ def VBD_solve_trimesh( tri_areas: wp.array(dtype=float), edge_indices: wp.array(dtype=wp.int32, ndim=2), adjacency: ForceElementAdjacencyInfo, + # contact info + # self contact + soft_contact_ke: float, + friction_mu: float, + friction_epsilon: float, + # body-particle contact + particle_radius: wp.array(dtype=float), + body_particle_contact_buffer_pre_alloc: int, + body_particle_contact_buffer: wp.array(dtype=int), + body_particle_contact_count: wp.array(dtype=int), + shape_materials: ModelShapeMaterials, + shape_body: wp.array(dtype=int), + body_q: wp.array(dtype=wp.transform), + body_qd: 
wp.array(dtype=wp.spatial_vector), + body_com: wp.array(dtype=wp.vec3), + contact_shape: wp.array(dtype=int), + contact_body_pos: wp.array(dtype=wp.vec3), + contact_body_vel: wp.array(dtype=wp.vec3), + contact_normal: wp.array(dtype=wp.vec3), + # ground-particle contact + has_ground: bool, + ground: wp.array(dtype=float), ): - t_id = wp.tid() + tid = wp.tid() - vertex = vertex_ids_in_color[t_id] - # wp.printf("vId: %d\n", vertex) + particle_index = particle_ids_in_color[tid] + # wp.printf("vId: %d\n", particle) - if not particle_flags[vertex] & PARTICLE_FLAG_ACTIVE: + if not particle_flags[particle_index] & PARTICLE_FLAG_ACTIVE: return - dtSqrReciprocal = 1.0 / (dt * dt) + particle_pos = pos[particle_index] + particle_prev_pos = pos[particle_index] + + dt_sqr_reciprocal = 1.0 / (dt * dt) # inertia force and hessian - f = mass[vertex] * (inertia[vertex] - pos[vertex]) * (dtSqrReciprocal) - h = mass[vertex] * dtSqrReciprocal * wp.identity(n=3, dtype=float) + f = mass[particle_index] * (inertia[particle_index] - pos[particle_index]) * (dt_sqr_reciprocal) + h = mass[particle_index] * dt_sqr_reciprocal * wp.identity(n=3, dtype=float) # elastic force and hessian - for i_adj_tri in range(get_vertex_num_adjacent_faces(vertex, adjacency)): - # wp.printf("vertex: %d | num_adj_faces: %d | ", vertex, get_vertex_num_adjacent_faces(vertex, adjacency)) - tri_id, vertex_order = get_vertex_adjacent_face_id_order(vertex, i_adj_tri, adjacency) + for i_adj_tri in range(get_vertex_num_adjacent_faces(particle_index, adjacency)): + # wp.printf("particle: %d | num_adj_faces: %d | ", particle, get_particle_num_adjacent_faces(particle, adjacency)) + tri_id, particle_order = get_vertex_adjacent_face_id_order(particle_index, i_adj_tri, adjacency) - # wp.printf("i_face: %d | face id: %d | v_order: %d | ", i_adj_tri, tri_id, vertex_order) + # wp.printf("i_face: %d | face id: %d | v_order: %d | ", i_adj_tri, tri_id, particle_order) # wp.printf("face: %d %d %d\n", tri_indices[tri_id, 0], tri_indices[tri_id, 1], tri_indices[tri_id, 2], ) f_tri, h_tri = evaluate_stvk_force_hessian( tri_id, - vertex_order, + particle_order, pos, tri_indices, tri_poses[tri_id], @@ -366,46 +575,152 @@ def VBD_solve_trimesh( k_d = tri_materials[tri_id, 2] h_d = h_tri * (k_d / dt) - f_d = h_d * (prev_pos[vertex] - pos[vertex]) + f_d = h_d * (prev_pos[particle_index] - pos[particle_index]) f = f + f_tri + f_d h = h + h_tri + h_d - # wp.printf("vertex: %d, i_adj_tri: %d, vertex_order: %d, \nforce:\n %f %f %f, \nhessian:, \n%f %f %f, \n%f %f %f, \n%f %f %f\n", - # vertex, i_adj_tri, vertex_order, + # wp.printf("particle: %d, i_adj_tri: %d, particle_order: %d, \nforce:\n %f %f %f, \nhessian:, \n%f %f %f, \n%f %f %f, \n%f %f %f\n", + # particle, i_adj_tri, particle_order, # f[0], f[1], f[2], # h[0, 0], h[0, 1], h[0, 2], # h[1, 0], h[1, 1], h[1, 2], # h[2, 0], h[2, 1], h[2, 2], # ) + # body-particle contact + particle_contact_count = min(body_particle_contact_count[particle_index], body_particle_contact_buffer_pre_alloc) + + offset = body_particle_contact_buffer_pre_alloc * particle_index + for contact_counter in range(particle_contact_count): + # the index to access body-particle data, which is size-variable and only contains active contact + contact_index = body_particle_contact_buffer[offset + contact_counter] + + body_contact_force, body_contact_hessian = evaluate_body_particle_contact( + particle_index, + particle_pos, + particle_prev_pos, + contact_index, + soft_contact_ke, + friction_mu, + friction_epsilon, + particle_radius, + 
shape_materials, + shape_body, + body_q, + body_qd, + body_com, + contact_shape, + contact_body_pos, + contact_body_vel, + contact_normal, + dt, + ) + + f = f + body_contact_force + h = h + body_contact_hessian + + if has_ground: + ground_normal = wp.vec3(ground[0], ground[1], ground[2]) + ground_level = ground[3] + ground_contact_force, ground_contact_hessian = evaluate_ground_contact_force_hessian( + particle_pos, + particle_prev_pos, + particle_radius[particle_index], + ground_normal, + ground_level, + soft_contact_ke, + friction_mu, + friction_epsilon, + dt, + ) + + f = f + ground_contact_force + h = h + ground_contact_hessian + if abs(wp.determinant(h)) > 1e-5: hInv = wp.inverse(h) - pos_new[vertex] = pos[vertex] + hInv * f + pos_new[particle_index] = particle_pos + hInv * f @wp.kernel def VBD_copy_particle_positions_back( - vertex_ids_in_color: wp.array(dtype=wp.int32), + particle_ids_in_color: wp.array(dtype=wp.int32), pos: wp.array(dtype=wp.vec3), pos_new: wp.array(dtype=wp.vec3), ): - t_id = wp.tid() - vertex = vertex_ids_in_color[t_id] + tid = wp.tid() + particle = particle_ids_in_color[tid] - pos[vertex] = pos_new[vertex] + pos[particle] = pos_new[particle] @wp.kernel def update_velocity( dt: float, prev_pos: wp.array(dtype=wp.vec3), pos: wp.array(dtype=wp.vec3), vel: wp.array(dtype=wp.vec3) ): - vertex = wp.tid() - vel[vertex] = (pos[vertex] - prev_pos[vertex]) / dt + particle = wp.tid() + vel[particle] = (pos[particle] - prev_pos[particle]) / dt + + +@wp.kernel +def convert_body_particle_contact_data_kernel( + # inputs + body_particle_contact_buffer_pre_alloc: int, + soft_contact_particle: wp.array(dtype=int), + contact_count: wp.array(dtype=int), + contact_max: int, + # outputs + body_particle_contact_buffer: wp.array(dtype=int), + body_particle_contact_count: wp.array(dtype=int), +): + contact_index = wp.tid() + count = min(contact_max, contact_count[0]) + if contact_index >= count: + return + + particle_index = soft_contact_particle[contact_index] + offset = particle_index * body_particle_contact_buffer_pre_alloc + + contact_counter = wp.atomic_add(body_particle_contact_count, particle_index, 1) + if contact_counter < body_particle_contact_buffer_pre_alloc: + body_particle_contact_buffer[offset + contact_counter] = contact_index class VBDIntegrator(Integrator): - def __init__(self, model: Model, iterations=10): + """An implicit integrator using Vertex Block Descent (VBD) for cloth simulation. + + References: + - Anka He Chen, Ziheng Liu, Yin Yang, and Cem Yuksel. 2024. Vertex Block Descent. ACM Trans. Graph. 43, 4, Article 116 (July 2024), 16 pages. https://doi.org/10.1145/3658179 + + Note that VBDIntegrator's constructor requires a :class:`Model` object as input, so that it can do some precomputation and preallocate the space. + After construction, you must provide the same :class:`Model` object that you used that was used during construction. + Currently, you must manually provide particle coloring and assign it to `model.particle_coloring` to make VBD work. + + VBDIntegrator.simulate accepts three arguments: class:`Model`, :class:`State`, and :class:`Control` (optional) objects, this time-integrator + may be used to advance the simulation state forward in time. + + Example + ------- + + .. 
code-block:: python + + model.particle_coloring = # load or generate particle coloring + integrator = wp.VBDIntegrator(model) + + # simulation loop + for i in range(100): + state = integrator.simulate(model, state_in, state_out, dt, control) + + """ + + def __init__( + self, + model: Model, + iterations=10, + body_particle_contact_buffer_pre_alloc=4, + friction_epsilon=1e-2, + ): self.device = model.device self.model = model self.iterations = iterations @@ -416,6 +731,15 @@ def __init__(self, model: Model, iterations=10): self.adjacency = self.compute_force_element_adjacency(model).to(self.device) + self.body_particle_contact_buffer_pre_alloc = body_particle_contact_buffer_pre_alloc + self.body_particle_contact_buffer = wp.zeros( + (self.body_particle_contact_buffer_pre_alloc * model.particle_count,), + dtype=wp.int32, + device=self.device, + ) + self.body_particle_contact_count = wp.zeros((model.particle_count,), dtype=wp.int32, device=self.device) + self.friction_epsilon = friction_epsilon + # tests # wp.launch(kernel=_test_compute_force_element_adjacency, # inputs=[self.adjacency, model.edge_indices, model.tri_indices], @@ -507,6 +831,8 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c if model is not self.model: raise ValueError("model must be the one used to initialize VBDIntegrator") + self.convert_body_particle_contact_data() + wp.launch( kernel=forward_step, inputs=[ @@ -525,12 +851,12 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c ) for _iter in range(self.iterations): - for i_color in range(len(self.model.coloring)): + for color_counter in range(len(self.model.particle_coloring)): wp.launch( kernel=VBD_solve_trimesh, inputs=[ dt, - self.model.coloring[i_color], + self.model.particle_coloring[color_counter], self.particle_q_prev, state_in.particle_q, state_out.particle_q, @@ -544,15 +870,34 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c self.model.tri_areas, self.model.edge_indices, self.adjacency, + self.model.soft_contact_ke, + self.model.soft_contact_mu, + self.friction_epsilon, + # body-particle contact + self.model.particle_radius, + self.body_particle_contact_buffer_pre_alloc, + self.body_particle_contact_buffer, + self.body_particle_contact_count, + self.model.shape_materials, + self.model.shape_body, + self.model.body_q, + self.model.body_qd, + self.model.body_com, + self.model.soft_contact_shape, + self.model.soft_contact_body_pos, + self.model.soft_contact_body_vel, + self.model.soft_contact_normal, + self.model.ground, + self.model.ground_plane, ], - dim=self.model.coloring[i_color].size, + dim=self.model.particle_coloring[color_counter].size, device=self.device, ) wp.launch( kernel=VBD_copy_particle_positions_back, - inputs=[self.model.coloring[i_color], state_in.particle_q, state_out.particle_q], - dim=self.model.coloring[i_color].size, + inputs=[self.model.particle_coloring[color_counter], state_in.particle_q, state_out.particle_q], + dim=self.model.particle_coloring[color_counter].size, device=self.device, ) @@ -563,6 +908,22 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c device=self.device, ) + def convert_body_particle_contact_data(self): + self.body_particle_contact_count.zero_() + + wp.launch( + kernel=convert_body_particle_contact_data_kernel, + inputs=[ + self.body_particle_contact_buffer_pre_alloc, + self.model.soft_contact_particle, + self.model.soft_contact_count, + self.model.soft_contact_max, + ], + 
outputs=[self.body_particle_contact_buffer, self.body_particle_contact_count], + dim=self.model.soft_contact_max, + device=self.device, + ) + @wp.kernel def count_num_adjacent_edges( edges_array: wp.array(dtype=wp.int32, ndim=2), num_vertex_adjacent_edges: wp.array(dtype=wp.int32) diff --git a/warp/sim/model.py b/warp/sim/model.py index 2842bef7..4d9df0fb 100644 --- a/warp/sim/model.py +++ b/warp/sim/model.py @@ -641,6 +641,8 @@ class Model: joint_dof_count (int): Total number of velocity degrees of freedom of all joints in the system joint_coord_count (int): Total number of position degrees of freedom of all joints in the system + particle_coloring (list of array): The coloring of all the particles, used for VBD's Gauss-Seidel interation. + device (wp.Device): Device on which the Model was allocated Note: @@ -810,6 +812,8 @@ def __init__(self, device=None): self.joint_dof_count = 0 self.joint_coord_count = 0 + self.particle_coloring = [] + self.device = wp.get_device(device) def state(self, requires_grad=None) -> State: @@ -3858,16 +3862,22 @@ def grid_index(x, y, dim_x): p = wp.quat_rotate(rot, g) + pos m = mass + particle_flag = PARTICLE_FLAG_ACTIVE + if x == 0 and fix_left: m = 0.0 + particle_flag = wp.uint32(int(particle_flag) & ~int(PARTICLE_FLAG_ACTIVE)) elif x == dim_x and fix_right: m = 0.0 + particle_flag = wp.uint32(int(particle_flag) & ~int(PARTICLE_FLAG_ACTIVE)) elif y == 0 and fix_bottom: m = 0.0 + particle_flag = wp.uint32(int(particle_flag) & ~int(PARTICLE_FLAG_ACTIVE)) elif y == dim_y and fix_top: m = 0.0 + particle_flag = wp.uint32(int(particle_flag) & ~int(PARTICLE_FLAG_ACTIVE)) - self.add_particle(p, vel, m) + self.add_particle(p, vel, m, flags=particle_flag) if x > 0 and y > 0: if reverse_winding: @@ -4015,7 +4025,7 @@ def add_cloth_mesh( edgeinds[:, 0], edgeinds[:, 1], edgeinds[:, 2], - edgeinds[:, 0], + edgeinds[:, 3], edge_ke=[edge_ke] * len(edgeinds), edge_kd=[edge_kd] * len(edgeinds), ) diff --git a/warp/stubs.py b/warp/stubs.py index 301b056b..01c8234d 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -109,11 +109,17 @@ from warp.dlpack import from_dlpack, to_dlpack +from warp.paddle import from_paddle, to_paddle +from warp.paddle import dtype_from_paddle, dtype_to_paddle +from warp.paddle import device_from_paddle, device_to_paddle +from warp.paddle import stream_from_paddle + from warp.build import clear_kernel_cache from warp.constants import * from . import builtins +from warp.builtins import static import warp.config as config @@ -928,30 +934,57 @@ def tile_arange(*args: Scalar, dtype: Scalar) -> Tile: @over -def tile_load(a: Array[Any], x: int32, y: int32, m: int32, n: int32) -> Tile: - """Loads a tile from a global memory array. +def tile_load(a: Array[Any], i: int32, n: int32) -> Tile: + """Loads a 1D tile from a global memory array. This method will cooperatively load a tile from global memory using all threads in the block. :param a: The source array in global memory - :param x: Offset in the source array measured in multiples of ``m``, i.e.: ``i=x*m`` - :param y: Offset in the source array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param i: Offset in the source array measured in multiples of ``n``, i.e.: ``offset=i*n`` + :param n: The number of elements in the tile + :returns: A tile with ``shape=(1,n)`` and dtype the same as the source array + """ + ... + + +@over +def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32) -> Tile: + """Loads a 2D tile from a global memory array. 
+ + This method will cooperatively load a tile from global memory using all threads in the block. + + :param a: The source array in global memory + :param i: Offset in the source array measured in multiples of ``m``, i.e.: ``row=i*m`` + :param j: Offset in the source array measured in multiples of ``n``, i.e.; ``col=j*n`` :param m: The size of the tile's first dimension - :param n: The size of the tile's second dimensions + :param n: The size of the tile's second dimension :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array """ ... @over -def tile_store(a: Array[Any], x: int32, y: int32, t: Any): +def tile_store(a: Array[Any], i: int32, t: Any): + """Stores a 1D tile to a global memory array. + + This method will cooperatively store a tile to global memory using all threads in the block. + + :param a: The destination array in global memory + :param i: Offset in the destination array measured in multiples of ``n``, i.e.: ``offset=i*n`` + :param t: The source tile to store data from, must have the same dtype as the destination array + """ + ... + + +@over +def tile_store(a: Array[Any], i: int32, j: int32, t: Any): """Stores a tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. :param a: The destination array in global memory - :param x: Offset in the destination array measured in multiples of ``m``, i.e.: ``i=x*m`` - :param y: Offset in the destination array measured in multiples of ``n``, i.e.; ``j=y*n`` + :param i: Offset in the destination array measured in multiples of ``m``, i.e.: ``row=i*m`` + :param j: Offset in the destination array measured in multiples of ``n``, i.e.; ``col=j*n`` :param t: The source tile to store data from, must have the same dtype as the destination array """ ... @@ -976,8 +1009,11 @@ def tile(x: Any) -> Tile: This function converts values computed using scalar kernel code to a tile representation for input into collective operations. + * If the input value is a scalar then the resulting tile has ``shape=(1, block_dim)`` + * If the input value is a vector then the resulting tile has ``shape=(length(vector), block_dim)`` + :param x: A per-thread local value, e.g.: scalar, vector, or matrix. - :returns: A tile with ``shape=(1, block_dim)`` where ``block_dim`` is the number of threads specified in ``wp.launch()``. + :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim`` This example shows how to create a linear sequence from thread variables: @@ -996,7 +1032,8 @@ def compute(): .. code-block:: text - tile(m=1, n=16, storage=register) = [[0 2 4 6 8 10 12 14...]] + tile(m=1, n=16, storage=register) = [[0 2 4 6 8 ...]] + """ ... @@ -1008,6 +1045,9 @@ def untile(a: Any) -> Scalar: This function converts a block-wide tile back to per-thread values. + * If the input tile is 1-dimensional then the resulting value will be a per-thread scalar + * If the input tile is 2-dimensional then the the resulting value will be a per-thread vector of length M + :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile @@ -1071,6 +1111,18 @@ def tile_transpose(a: Tile) -> Tile: ... +@over +def tile_broadcast(a: Tile, m: int32, n: int32) -> Tile: + """Broadcast a tile. + + This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. 
+ + :param a: Tile to broadcast + :returns: Tile with broadcast ``shape=(m, n)`` + """ + ... + + @over def tile_sum(a: Tile) -> Tile: """Cooperatively compute the sum the tile elements using all threads in the block. @@ -2163,145 +2215,217 @@ def atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: in @over def atomic_min(arr: Array[Any], i: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: FabricArray[Any], i: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... 
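# Illustrative sketch (hypothetical example, not taken from the Warp test suite or this patch):
# a minimal demonstration of the per-component behaviour described in the notes above.
# wp.atomic_min() on an array of wp.vec3 updates each component atomically and
# independently; the update is not atomic across the whole vector.
import warp as wp

@wp.kernel
def vec_min_kernel(dest: wp.array(dtype=wp.vec3)):
    tid = wp.tid()
    v = wp.vec3(float(tid), float(1024 - tid), 1.0)
    # component-wise atomic minimum against dest[0]
    wp.atomic_min(dest, 0, v)

dest = wp.array([wp.vec3(1.0e9, 1.0e9, 1.0e9)], dtype=wp.vec3)
wp.launch(vec_min_kernel, dim=1024, inputs=[dest])
# expected: dest.numpy()[0] == [0.0, 1.0, 1.0]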
@over def atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" + """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: Array[Any], i: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... 
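# Illustrative sketch (hypothetical example, not taken from the Warp test suite or this patch):
# a parallel max-reduction that also makes use of the returned old value.
# wp.atomic_max() returns the value stored before this thread's update, so a thread
# can check whether its own element raised the running maximum at that moment.
import numpy as np
import warp as wp

@wp.kernel
def max_reduce_kernel(values: wp.array(dtype=float), max_val: wp.array(dtype=float), raised: wp.array(dtype=int)):
    tid = wp.tid()
    old = wp.atomic_max(max_val, 0, values[tid])
    if values[tid] > old:
        # this thread observed a smaller previous maximum
        wp.atomic_add(raised, 0, 1)

values = wp.array(np.random.default_rng(0).random(1024), dtype=float)
max_val = wp.full(1, -1.0e9, dtype=float)
raised = wp.zeros(1, dtype=int)
wp.launch(max_reduce_kernel, dim=values.size, inputs=[values, max_val, raised])
# max_val.numpy()[0] now equals values.numpy().max()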
@over def atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: FabricArray[Any], i: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @over def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... 
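# Illustrative sketch (hypothetical example, not taken from the Warp test suite or this patch):
# the multi-index overloads documented above also apply to regular 2D arrays, e.g.
# computing column-wise maxima with the (i, j) form of wp.atomic_max().
import numpy as np
import warp as wp

@wp.kernel
def column_max_kernel(data: wp.array2d(dtype=float), col_max: wp.array2d(dtype=float)):
    i, j = wp.tid()
    # reduce element (i, j) into row 0 of col_max, one atomic update per column
    wp.atomic_max(col_max, 0, j, data[i, j])

data_np = np.random.default_rng(1).random((128, 8)).astype(np.float32)
data = wp.array(data_np, dtype=float)
col_max = wp.full((1, 8), -1.0e9, dtype=float)
wp.launch(column_max_kernel, dim=(128, 8), inputs=[data, col_max])
# col_max.numpy()[0] matches data_np.max(axis=0)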
@over def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: - """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value.""" + """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. + + .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + """ ... @@ -2803,3 +2927,18 @@ def tile_ifft(inout: Tile) -> Tile: :param inout: The input/output tile """ ... + + +@over +def static(expr: Any) -> Any: + """Evaluates a static Python expression and replaces it with its result. + + See the `codegen.html#static-expressions
`_ for more details. + + Note: + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions + (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). + """ + ... diff --git a/warp/tape.py b/warp/tape.py index 6df7c21b..67e9bc9f 100644 --- a/warp/tape.py +++ b/warp/tape.py @@ -50,6 +50,8 @@ def __init__(self): self.loss = None def __enter__(self): + wp.context.init() + if wp.context.runtime.tape is not None: raise RuntimeError("Warp: Error, entering a tape while one is already active") diff --git a/warp/tests/test_array.py b/warp/tests/test_array.py index 97bcf208..77721ca5 100644 --- a/warp/tests/test_array.py +++ b/warp/tests/test_array.py @@ -2590,6 +2590,25 @@ def test_array_from_int64_domain(test, device): wp.zeros(np.array([1504, 1080, 520], dtype=np.int64), dtype=wp.float32, device=device) +def test_numpy_array_interface(test, device): + # We should be able to convert between NumPy and Warp arrays using __array_interface__ on CPU. + # This tests all scalar types supported by both. + + n = 10 + + scalar_types = wp.types.scalar_types + + for dtype in scalar_types: + # test round trip + a1 = wp.zeros(n, dtype=dtype, device="cpu") + na = np.array(a1) + a2 = wp.array(na, device="cpu") + + assert a1.dtype == a2.dtype + assert a1.shape == a2.shape + assert a1.strides == a2.strides + + devices = get_test_devices() @@ -2648,6 +2667,7 @@ def test_array_new_del(self): add_function_test(TestArray, "test_array_of_structs_roundtrip", test_array_of_structs_roundtrip, devices=devices) add_function_test(TestArray, "test_array_from_numpy", test_array_from_numpy, devices=devices) add_function_test(TestArray, "test_array_aliasing_from_numpy", test_array_aliasing_from_numpy, devices=["cpu"]) +add_function_test(TestArray, "test_numpy_array_interface", test_numpy_array_interface, devices=["cpu"]) add_function_test(TestArray, "test_array_inplace_ops", test_array_inplace_ops, devices=devices) add_function_test(TestArray, "test_direct_from_numpy", test_direct_from_numpy, devices=["cpu"]) diff --git a/warp/tests/test_codegen.py b/warp/tests/test_codegen.py index beb0cf03..e3552ad2 100644 --- a/warp/tests/test_codegen.py +++ b/warp/tests/test_codegen.py @@ -405,22 +405,22 @@ def kernel_3_fn( kernel = wp.Kernel(func=kernel_1_fn) with test.assertRaisesRegex( - RuntimeError, - r"Cannot reference a global variable from a kernel unless `wp.constant\(\)` is being used", + TypeError, + r"Invalid external reference type: ", ): wp.launch(kernel, dim=out.shape, inputs=(), outputs=(out,), device=device) kernel = wp.Kernel(func=kernel_2_fn) with test.assertRaisesRegex( - RuntimeError, - r"Cannot reference a global variable from a kernel unless `wp.constant\(\)` is being used", + TypeError, + r"Invalid external reference type: ", ): wp.launch(kernel, dim=out.shape, inputs=(), outputs=(out,), device=device) kernel = wp.Kernel(func=kernel_3_fn) with test.assertRaisesRegex( - RuntimeError, - r"Cannot reference a global variable from a kernel unless `wp.constant\(\)` is being used", + TypeError, + r"Invalid external reference type: ", ): wp.launch(kernel, dim=out.shape, inputs=(), outputs=(out,), device=device) @@ -489,6 +489,21 
@@ def kernel_2_fn(): wp.launch(kernel, dim=1, device=device) +def test_error_mutating_constant_in_dynamic_loop(test, device): + @wp.kernel + def dynamic_loop_kernel(n: int, input: wp.array(dtype=float)): + my_constant = 0.0 + for i in range(n): + my_constant += input[i] + + inputs = wp.array([1.0, 2.0, 3.0], dtype=float, device=device) + with test.assertRaisesRegex( + wp.codegen.WarpCodegenError, + r"Error mutating a constant my_constant inside a dynamic loop, use the following syntax\: pi = float\(3\.141\) to declare a dynamic variable", + ): + wp.launch(dynamic_loop_kernel, dim=1, inputs=[3, inputs], device=device) + + @wp.kernel def test_call_syntax(): expected_pow = 16.0 @@ -667,6 +682,12 @@ class TestCodeGen(unittest.TestCase): add_function_test( TestCodeGen, func=test_error_unmatched_arguments, name="test_error_unmatched_arguments", devices=devices ) +add_function_test( + TestCodeGen, + func=test_error_mutating_constant_in_dynamic_loop, + name="test_error_mutating_constant_in_dynamic_loop", + devices=devices, +) add_kernel_test(TestCodeGen, name="test_call_syntax", kernel=test_call_syntax, dim=1, devices=devices) add_kernel_test(TestCodeGen, name="test_shadow_builtin", kernel=test_shadow_builtin, dim=1, devices=devices) diff --git a/warp/tests/test_dlpack.py b/warp/tests/test_dlpack.py index 45fbef13..30ef693a 100644 --- a/warp/tests/test_dlpack.py +++ b/warp/tests/test_dlpack.py @@ -350,6 +350,34 @@ def test_dlpack_torch_to_warp_v2(test, device): assert_np_equal(a.numpy(), t.cpu().numpy()) +def test_dlpack_paddle_to_warp(test, device): + import paddle + import paddle.utils.dlpack + + t = paddle.arange(N, dtype=paddle.float32).to(device=wp.device_to_paddle(device)) + + # paddle do not implement __dlpack__ yet, so only test to_dlpack here + a = wp.from_dlpack(paddle.utils.dlpack.to_dlpack(t)) + + item_size = wp.types.type_size_in_bytes(a.dtype) + + test.assertEqual(a.ptr, t.data_ptr()) + test.assertEqual(a.device, wp.device_from_paddle(t.place)) + test.assertEqual(a.dtype, wp.dtype_from_paddle(t.dtype)) + test.assertEqual(a.shape, tuple(t.shape)) + test.assertEqual(a.strides, tuple(s * item_size for s in t.strides)) + + assert_np_equal(a.numpy(), t.numpy()) + + wp.launch(inc, dim=a.size, inputs=[a], device=device) + + assert_np_equal(a.numpy(), t.numpy()) + + paddle.assign(t + 1, t) + + assert_np_equal(a.numpy(), t.numpy()) + + def test_dlpack_warp_to_jax(test, device): import jax import jax.dlpack @@ -421,6 +449,61 @@ def test_dlpack_warp_to_jax_v2(test, device): assert_np_equal(a.numpy(), np.asarray(j2)) +def test_dlpack_warp_to_paddle(test, device): + import paddle.utils.dlpack + + a = wp.array(data=np.arange(N, dtype=np.float32), device=device) + + t = paddle.utils.dlpack.from_dlpack(wp.to_dlpack(a)) + + item_size = wp.types.type_size_in_bytes(a.dtype) + + test.assertEqual(a.ptr, t.data_ptr()) + test.assertEqual(a.device, wp.device_from_paddle(t.place)) + test.assertEqual(a.dtype, wp.dtype_from_paddle(t.dtype)) + test.assertEqual(a.shape, tuple(t.shape)) + test.assertEqual(a.strides, tuple(s * item_size for s in t.strides)) + + assert_np_equal(a.numpy(), t.cpu().numpy()) + + wp.launch(inc, dim=a.size, inputs=[a], device=device) + + assert_np_equal(a.numpy(), t.cpu().numpy()) + + paddle.assign(t + 1, t) + + assert_np_equal(a.numpy(), t.cpu().numpy()) + + +def test_dlpack_warp_to_paddle_v2(test, device): + # same as original test, but uses newer __dlpack__() method + + import paddle.utils.dlpack + + a = wp.array(data=np.arange(N, dtype=np.float32), device=device) + + # pass the 
array directly + t = paddle.utils.dlpack.from_dlpack(a) + + item_size = wp.types.type_size_in_bytes(a.dtype) + + test.assertEqual(a.ptr, t.data_ptr()) + test.assertEqual(a.device, wp.device_from_paddle(t.place)) + test.assertEqual(a.dtype, wp.dtype_from_paddle(t.dtype)) + test.assertEqual(a.shape, tuple(t.shape)) + test.assertEqual(a.strides, tuple(s * item_size for s in t.strides)) + + assert_np_equal(a.numpy(), t.numpy()) + + wp.launch(inc, dim=a.size, inputs=[a], device=device) + + assert_np_equal(a.numpy(), t.numpy()) + + paddle.assign(t + 1, t) + + assert_np_equal(a.numpy(), t.numpy()) + + def test_dlpack_jax_to_warp(test, device): import jax import jax.dlpack @@ -575,6 +658,41 @@ class TestDLPack(unittest.TestCase): print(f"Skipping Jax DLPack tests due to exception: {e}") +# paddle interop via dlpack +try: + import paddle + import paddle.utils.dlpack + + # check which Warp devices work with paddle + # CUDA devices may fail if paddle was not compiled with CUDA support + test_devices = get_test_devices() + paddle_compatible_devices = [] + for d in test_devices: + try: + t = paddle.arange(10).to(device=wp.device_to_paddle(d)) + paddle.assign(t + 1, t) + paddle_compatible_devices.append(d) + except Exception as e: + print(f"Skipping paddle DLPack tests on device '{d}' due to exception: {e}") + + if paddle_compatible_devices: + add_function_test( + TestDLPack, "test_dlpack_warp_to_paddle", test_dlpack_warp_to_paddle, devices=paddle_compatible_devices + ) + add_function_test( + TestDLPack, + "test_dlpack_warp_to_paddle_v2", + test_dlpack_warp_to_paddle_v2, + devices=paddle_compatible_devices, + ) + add_function_test( + TestDLPack, "test_dlpack_paddle_to_warp", test_dlpack_paddle_to_warp, devices=paddle_compatible_devices + ) + +except Exception as e: + print(f"Skipping Paddle DLPack tests due to exception: {e}") + + if __name__ == "__main__": wp.clear_kernel_cache() unittest.main(verbosity=2) diff --git a/warp/tests/test_implicit_init.py b/warp/tests/test_implicit_init.py index e9daef58..e926397d 100644 --- a/warp/tests/test_implicit_init.py +++ b/warp/tests/test_implicit_init.py @@ -347,6 +347,55 @@ class TestImplicitInitIsPeerAccessSupported(unittest.TestCase): ) +# Structs +# ------------------------------------------------------------------------------ + + +def test_struct_member_init(test, device): + @wp.struct + class S: + # fp16 requires conversion functions from warp.so + x: wp.float16 + v: wp.vec3h + + s = S() + s.x = 42.0 + s.v = wp.vec3h(1.0, 2.0, 3.0) + + +class TestImplicitInitStructMemberInit(unittest.TestCase): + pass + + +add_function_test( + TestImplicitInitStructMemberInit, + "test_struct_member_init", + test_struct_member_init, + check_output=False, +) + + +# Tape +# ------------------------------------------------------------------------------ + + +def test_tape(test, device): + with wp.Tape(): + pass + + +class TestImplicitInitTape(unittest.TestCase): + pass + + +add_function_test( + TestImplicitInitTape, + "test_tape", + test_tape, + check_output=False, +) + + if __name__ == "__main__": # Do not clear the kernel cache or call anything that would initialize Warp # since these tests are specifically aiming to catch issues where Warp isn't diff --git a/warp/tests/test_paddle.py b/warp/tests/test_paddle.py new file mode 100644 index 00000000..53db028e --- /dev/null +++ b/warp/tests/test_paddle.py @@ -0,0 +1,852 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. 
+# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import unittest + +import numpy as np + +import warp as wp +from warp.tests.unittest_utils import * + + +@wp.kernel +def op_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)): + tid = wp.tid() + y[tid] = 0.5 - x[tid] * 2.0 + + +@wp.kernel +def inc(a: wp.array(dtype=float)): + tid = wp.tid() + a[tid] = a[tid] + 1.0 + + +@wp.kernel +def inc_vector(a: wp.array(dtype=wp.vec3f)): + tid = wp.tid() + a[tid] = a[tid] + wp.vec3f(1.0) + + +@wp.kernel +def inc_matrix(a: wp.array(dtype=wp.mat22f)): + tid = wp.tid() + a[tid] = a[tid] + wp.mat22f(1.0) + + +@wp.kernel +def arange(start: int, step: int, a: wp.array(dtype=int)): + tid = wp.tid() + a[tid] = start + step * tid + + +# copy elements between non-contiguous 1d arrays of float +@wp.kernel +def copy1d_float_kernel(dst: wp.array(dtype=float), src: wp.array(dtype=float)): + i = wp.tid() + dst[i] = src[i] + + +# copy elements between non-contiguous 2d arrays of float +@wp.kernel +def copy2d_float_kernel(dst: wp.array2d(dtype=float), src: wp.array2d(dtype=float)): + i, j = wp.tid() + dst[i, j] = src[i, j] + + +# copy elements between non-contiguous 3d arrays of float +@wp.kernel +def copy3d_float_kernel(dst: wp.array3d(dtype=float), src: wp.array3d(dtype=float)): + i, j, k = wp.tid() + dst[i, j, k] = src[i, j, k] + + +# copy elements between non-contiguous 2d arrays of vec3 +@wp.kernel +def copy2d_vec3_kernel(dst: wp.array2d(dtype=wp.vec3), src: wp.array2d(dtype=wp.vec3)): + i, j = wp.tid() + dst[i, j] = src[i, j] + + +# copy elements between non-contiguous 2d arrays of mat22 +@wp.kernel +def copy2d_mat22_kernel(dst: wp.array2d(dtype=wp.mat22), src: wp.array2d(dtype=wp.mat22)): + i, j = wp.tid() + dst[i, j] = src[i, j] + + +def test_dtype_from_paddle(test, device): + import paddle + + def test_conversions(paddle_type, warp_type): + test.assertEqual(wp.dtype_from_paddle(paddle_type), warp_type) + + test_conversions(paddle.float16, wp.float16) + test_conversions(paddle.float32, wp.float32) + test_conversions(paddle.float64, wp.float64) + test_conversions(paddle.int8, wp.int8) + test_conversions(paddle.int16, wp.int16) + test_conversions(paddle.int32, wp.int32) + test_conversions(paddle.int64, wp.int64) + test_conversions(paddle.uint8, wp.uint8) + test_conversions(paddle.bool, wp.bool) + + +def test_dtype_to_paddle(test, device): + import paddle + + def test_conversions(warp_type, paddle_type): + test.assertEqual(wp.dtype_to_paddle(warp_type), paddle_type) + + test_conversions(wp.float16, paddle.float16) + test_conversions(wp.float32, paddle.float32) + test_conversions(wp.float64, paddle.float64) + test_conversions(wp.int8, paddle.int8) + test_conversions(wp.int16, paddle.int16) + test_conversions(wp.int32, paddle.int32) + test_conversions(wp.int64, paddle.int64) + test_conversions(wp.uint8, paddle.uint8) + test_conversions(wp.uint16, paddle.int16) + test_conversions(wp.uint32, paddle.int32) + test_conversions(wp.uint64, paddle.int64) + test_conversions(wp.bool, paddle.bool) + + +def test_device_conversion(test, device): + paddle_device = wp.device_to_paddle(device) + warp_device = wp.device_from_paddle(paddle_device) + test.assertEqual(warp_device, device) + + +def 
test_paddle_zerocopy(test, device): + import paddle + + a = wp.zeros(10, dtype=wp.float32, device=device) + t = wp.to_paddle(a) + assert a.ptr == t.data_ptr() + + paddle_device = wp.device_to_paddle(device) + + t = paddle.zeros([10], dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + + +def test_from_paddle(test, device): + import paddle + + paddle_device = wp.device_to_paddle(device) + + # automatically determine warp dtype + def wrap_scalar_tensor_implicit(paddle_dtype, expected_warp_dtype): + t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device) + a = wp.from_paddle(t) + assert a.dtype == expected_warp_dtype + assert a.shape == tuple(t.shape) + + wrap_scalar_tensor_implicit(paddle.float64, wp.float64) + wrap_scalar_tensor_implicit(paddle.float32, wp.float32) + wrap_scalar_tensor_implicit(paddle.float16, wp.float16) + wrap_scalar_tensor_implicit(paddle.int64, wp.int64) + wrap_scalar_tensor_implicit(paddle.int32, wp.int32) + wrap_scalar_tensor_implicit(paddle.int16, wp.int16) + wrap_scalar_tensor_implicit(paddle.int8, wp.int8) + wrap_scalar_tensor_implicit(paddle.uint8, wp.uint8) + wrap_scalar_tensor_implicit(paddle.bool, wp.bool) + + # explicitly specify warp dtype + def wrap_scalar_tensor_explicit(paddle_dtype, expected_warp_dtype): + t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device) + a = wp.from_paddle(t, expected_warp_dtype) + assert a.dtype == expected_warp_dtype + assert a.shape == tuple(t.shape) + + wrap_scalar_tensor_explicit(paddle.float64, wp.float64) + wrap_scalar_tensor_explicit(paddle.float32, wp.float32) + wrap_scalar_tensor_explicit(paddle.float16, wp.float16) + wrap_scalar_tensor_explicit(paddle.int64, wp.int64) + wrap_scalar_tensor_explicit(paddle.int64, wp.uint64) + wrap_scalar_tensor_explicit(paddle.int32, wp.int32) + wrap_scalar_tensor_explicit(paddle.int32, wp.uint32) + wrap_scalar_tensor_explicit(paddle.int16, wp.int16) + wrap_scalar_tensor_explicit(paddle.int16, wp.uint16) + wrap_scalar_tensor_explicit(paddle.int8, wp.int8) + wrap_scalar_tensor_explicit(paddle.int8, wp.uint8) + wrap_scalar_tensor_explicit(paddle.uint8, wp.uint8) + wrap_scalar_tensor_explicit(paddle.uint8, wp.int8) + wrap_scalar_tensor_explicit(paddle.bool, wp.uint8) + wrap_scalar_tensor_explicit(paddle.bool, wp.int8) + wrap_scalar_tensor_explicit(paddle.bool, wp.bool) + + def wrap_vec_tensor(n, desired_warp_dtype): + t = paddle.zeros((10, n), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, desired_warp_dtype) + assert a.dtype == desired_warp_dtype + assert a.shape == (10,) + + wrap_vec_tensor(2, wp.vec2) + wrap_vec_tensor(3, wp.vec3) + wrap_vec_tensor(4, wp.vec4) + wrap_vec_tensor(6, wp.spatial_vector) + wrap_vec_tensor(7, wp.transform) + + def wrap_mat_tensor(n, m, desired_warp_dtype): + t = paddle.zeros((10, n, m), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, desired_warp_dtype) + assert a.dtype == desired_warp_dtype + assert a.shape == (10,) + + wrap_mat_tensor(2, 2, wp.mat22) + wrap_mat_tensor(3, 3, wp.mat33) + wrap_mat_tensor(4, 4, wp.mat44) + wrap_mat_tensor(6, 6, wp.spatial_matrix) + + def wrap_vec_tensor_with_grad(n, desired_warp_dtype): + t = paddle.zeros((10, n), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, desired_warp_dtype) + a.reuqires_grad = True + assert a.dtype == desired_warp_dtype + assert a.shape == (10,) + + wrap_vec_tensor_with_grad(2, wp.vec2) + wrap_vec_tensor_with_grad(3, wp.vec3) + wrap_vec_tensor_with_grad(4, 
wp.vec4) + wrap_vec_tensor_with_grad(6, wp.spatial_vector) + wrap_vec_tensor_with_grad(7, wp.transform) + + def wrap_mat_tensor_with_grad(n, m, desired_warp_dtype): + t = paddle.zeros((10, n, m), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, desired_warp_dtype, requires_grad=True) + assert a.dtype == desired_warp_dtype + assert a.shape == (10,) + + wrap_mat_tensor_with_grad(2, 2, wp.mat22) + wrap_mat_tensor_with_grad(3, 3, wp.mat33) + wrap_mat_tensor_with_grad(4, 4, wp.mat44) + wrap_mat_tensor_with_grad(6, 6, wp.spatial_matrix) + + +def test_array_ctype_from_paddle(test, device): + import paddle + + paddle_device = wp.device_to_paddle(device) + + # automatically determine warp dtype + def wrap_scalar_tensor_implicit(paddle_dtype): + t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device) + a = wp.from_paddle(t, return_ctype=True) + warp_dtype = wp.dtype_from_paddle(paddle_dtype) + ctype_size = ctypes.sizeof(warp_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == 0 + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_scalar_tensor_implicit(paddle.float64) + wrap_scalar_tensor_implicit(paddle.float32) + wrap_scalar_tensor_implicit(paddle.float16) + wrap_scalar_tensor_implicit(paddle.int64) + wrap_scalar_tensor_implicit(paddle.int32) + wrap_scalar_tensor_implicit(paddle.int16) + wrap_scalar_tensor_implicit(paddle.int8) + wrap_scalar_tensor_implicit(paddle.uint8) + wrap_scalar_tensor_implicit(paddle.bool) + + # explicitly specify warp dtype + def wrap_scalar_tensor_explicit(paddle_dtype, warp_dtype): + t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device) + a = wp.from_paddle(t, dtype=warp_dtype, return_ctype=True) + ctype_size = ctypes.sizeof(warp_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == 0 + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_scalar_tensor_explicit(paddle.float64, wp.float64) + wrap_scalar_tensor_explicit(paddle.float32, wp.float32) + wrap_scalar_tensor_explicit(paddle.float16, wp.float16) + wrap_scalar_tensor_explicit(paddle.int64, wp.int64) + wrap_scalar_tensor_explicit(paddle.int64, wp.uint64) + wrap_scalar_tensor_explicit(paddle.int32, wp.int32) + wrap_scalar_tensor_explicit(paddle.int32, wp.uint32) + wrap_scalar_tensor_explicit(paddle.int16, wp.int16) + wrap_scalar_tensor_explicit(paddle.int16, wp.uint16) + wrap_scalar_tensor_explicit(paddle.int8, wp.int8) + wrap_scalar_tensor_explicit(paddle.int8, wp.uint8) + wrap_scalar_tensor_explicit(paddle.uint8, wp.uint8) + wrap_scalar_tensor_explicit(paddle.uint8, wp.int8) + wrap_scalar_tensor_explicit(paddle.bool, wp.uint8) + wrap_scalar_tensor_explicit(paddle.bool, wp.int8) + wrap_scalar_tensor_explicit(paddle.bool, wp.bool) + + def wrap_vec_tensor(vec_dtype): + t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, dtype=vec_dtype, return_ctype=True) + ctype_size = ctypes.sizeof(vec_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == 0 + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_vec_tensor(wp.vec2) + wrap_vec_tensor(wp.vec3) + wrap_vec_tensor(wp.vec4) + wrap_vec_tensor(wp.spatial_vector) + wrap_vec_tensor(wp.transform) + + def wrap_mat_tensor(mat_dtype): + t = paddle.zeros((10, *mat_dtype._shape_), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, 
dtype=mat_dtype, return_ctype=True) + ctype_size = ctypes.sizeof(mat_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == 0 + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_mat_tensor(wp.mat22) + wrap_mat_tensor(wp.mat33) + wrap_mat_tensor(wp.mat44) + wrap_mat_tensor(wp.spatial_matrix) + + def wrap_vec_tensor_with_existing_grad(vec_dtype): + t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + t.stop_gradient = False + t.grad_ = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, dtype=vec_dtype, return_ctype=True) + ctype_size = ctypes.sizeof(vec_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == t.grad.data_ptr() + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_vec_tensor_with_existing_grad(wp.vec2) + wrap_vec_tensor_with_existing_grad(wp.vec3) + wrap_vec_tensor_with_existing_grad(wp.vec4) + wrap_vec_tensor_with_existing_grad(wp.spatial_vector) + wrap_vec_tensor_with_existing_grad(wp.transform) + + def wrap_vec_tensor_with_new_grad(vec_dtype): + t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, dtype=vec_dtype, requires_grad=True, return_ctype=True) + ctype_size = ctypes.sizeof(vec_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == t.grad.data_ptr() + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_vec_tensor_with_new_grad(wp.vec2) + wrap_vec_tensor_with_new_grad(wp.vec3) + wrap_vec_tensor_with_new_grad(wp.vec4) + wrap_vec_tensor_with_new_grad(wp.spatial_vector) + wrap_vec_tensor_with_new_grad(wp.transform) + + def wrap_vec_tensor_with_paddle_grad(vec_dtype): + t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + grad = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t, dtype=vec_dtype, grad=grad, return_ctype=True) + ctype_size = ctypes.sizeof(vec_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == grad.data_ptr() + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_vec_tensor_with_paddle_grad(wp.vec2) + wrap_vec_tensor_with_paddle_grad(wp.vec3) + wrap_vec_tensor_with_paddle_grad(wp.vec4) + wrap_vec_tensor_with_paddle_grad(wp.spatial_vector) + wrap_vec_tensor_with_paddle_grad(wp.transform) + + def wrap_vec_tensor_with_warp_grad(vec_dtype): + t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device) + grad = wp.zeros(10, dtype=vec_dtype, device=device) + a = wp.from_paddle(t, dtype=vec_dtype, grad=grad, return_ctype=True) + ctype_size = ctypes.sizeof(vec_dtype._type_) + assert a.data == t.data_ptr() + assert a.grad == grad.ptr + assert a.ndim == 1 + assert a.shape[0] == t.shape[0] + assert a.strides[0] == t.strides[0] * ctype_size + + wrap_vec_tensor_with_warp_grad(wp.vec2) + wrap_vec_tensor_with_warp_grad(wp.vec3) + wrap_vec_tensor_with_warp_grad(wp.vec4) + wrap_vec_tensor_with_warp_grad(wp.spatial_vector) + wrap_vec_tensor_with_warp_grad(wp.transform) + + +def test_to_paddle(test, device): + import paddle + + def wrap_scalar_array(warp_dtype, expected_paddle_dtype): + a = wp.zeros(10, dtype=warp_dtype, device=device) + t = wp.to_paddle(a) + assert t.dtype == expected_paddle_dtype + assert 
tuple(t.shape) == a.shape + + wrap_scalar_array(wp.float64, paddle.float64) + wrap_scalar_array(wp.float32, paddle.float32) + wrap_scalar_array(wp.float16, paddle.float16) + wrap_scalar_array(wp.int64, paddle.int64) + wrap_scalar_array(wp.int32, paddle.int32) + wrap_scalar_array(wp.int16, paddle.int16) + wrap_scalar_array(wp.int8, paddle.int8) + wrap_scalar_array(wp.uint8, paddle.uint8) + wrap_scalar_array(wp.bool, paddle.bool) + + # not supported by paddle + # wrap_scalar_array(wp.uint64, paddle.int64) + # wrap_scalar_array(wp.uint32, paddle.int32) + # wrap_scalar_array(wp.uint16, paddle.int16) + + def wrap_vec_array(n, warp_dtype): + a = wp.zeros(10, dtype=warp_dtype, device=device) + t = wp.to_paddle(a) + assert t.dtype == paddle.float32 + assert tuple(t.shape) == (10, n) + + wrap_vec_array(2, wp.vec2) + wrap_vec_array(3, wp.vec3) + wrap_vec_array(4, wp.vec4) + wrap_vec_array(6, wp.spatial_vector) + wrap_vec_array(7, wp.transform) + + def wrap_mat_array(n, m, warp_dtype): + a = wp.zeros(10, dtype=warp_dtype, device=device) + t = wp.to_paddle(a) + assert t.dtype == paddle.float32 + assert tuple(t.shape) == (10, n, m) + + wrap_mat_array(2, 2, wp.mat22) + wrap_mat_array(3, 3, wp.mat33) + wrap_mat_array(4, 4, wp.mat44) + wrap_mat_array(6, 6, wp.spatial_matrix) + + +def test_from_paddle_slices(test, device): + import paddle + + paddle_device = wp.device_to_paddle(device) + + # 1D slice, contiguous + t_base = paddle.arange(10, dtype=paddle.float32).to(device=paddle_device) + t = t_base[2:9] + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert a.is_contiguous + assert a.shape == tuple(t.shape) + assert_np_equal(a.numpy(), t.cpu().numpy()) + + # 1D slice with non-contiguous stride + t_base = paddle.arange(10, dtype=paddle.float32).to(device=paddle_device) + t = t_base[2:9:2] + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert not a.is_contiguous + assert a.shape == tuple(t.shape) + # copy contents to contiguous array + a_contiguous = wp.empty_like(a) + wp.launch(copy1d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + # 2D slices (non-contiguous) + t_base = paddle.arange(24, dtype=paddle.float32).to(device=paddle_device).reshape((4, 6)) + t = t_base[1:3, 2:5] + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert not a.is_contiguous + assert a.shape == tuple(t.shape) + # copy contents to contiguous array + a_contiguous = wp.empty_like(a) + wp.launch(copy2d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + # 3D slices (non-contiguous) + t_base = paddle.arange(36, dtype=paddle.float32).to(device=paddle_device).reshape((4, 3, 3)) + t = t_base[::2, 0:1, 1:2] + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert not a.is_contiguous + assert a.shape == tuple(t.shape) + # copy contents to contiguous array + a_contiguous = wp.empty_like(a) + wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + # 2D slices of vec3 (inner contiguous, outer non-contiguous) + t_base = paddle.arange(150, dtype=paddle.float32).to(device=paddle_device).reshape((10, 5, 3)) + t = t_base[1:7:2, 2:5] + a = wp.from_paddle(t, dtype=wp.vec3) + assert a.ptr == t.data_ptr() + assert not a.is_contiguous + assert a.shape == tuple(t.shape[:-1]) + # copy contents to contiguous array + a_contiguous = wp.empty_like(a) + wp.launch(copy2d_vec3_kernel, 
dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + # 2D slices of mat22 (inner contiguous, outer non-contiguous) + t_base = paddle.arange(200, dtype=paddle.float32).to(device=paddle_device).reshape((10, 5, 2, 2)) + t = t_base[1:7:2, 2:5] + a = wp.from_paddle(t, dtype=wp.mat22) + assert a.ptr == t.data_ptr() + assert not a.is_contiguous + assert a.shape == tuple(t.shape[:-2]) + # copy contents to contiguous array + a_contiguous = wp.empty_like(a) + wp.launch(copy2d_mat22_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + +def test_from_paddle_zero_strides(test, device): + import paddle + + paddle_device = wp.device_to_paddle(device) + + t_base = paddle.arange(9, dtype=paddle.float32).to(device=paddle_device).reshape((3, 3)) + + # expand outermost dimension + t = t_base.unsqueeze(0).expand([3, -1, -1]) + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert a.is_contiguous + assert a.shape == tuple(t.shape) + a_contiguous = wp.empty_like(a) + wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + # expand middle dimension + t = t_base.unsqueeze(1).expand([-1, 3, -1]) + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert a.is_contiguous + assert a.shape == tuple(t.shape) + a_contiguous = wp.empty_like(a) + wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + # expand innermost dimension + t = t_base.unsqueeze(2).expand([-1, -1, 3]) + a = wp.from_paddle(t) + assert a.ptr == t.data_ptr() + assert a.is_contiguous + assert a.shape == tuple(t.shape) + a_contiguous = wp.empty_like(a) + wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device) + assert_np_equal(a_contiguous.numpy(), t.cpu().numpy()) + + +def test_paddle_mgpu_from_paddle(test, device): + import paddle + + n = 32 + + t0 = paddle.arange(0, n, 1, dtype=paddle.int32).to(device="gpu:0") + t1 = paddle.arange(0, n * 2, 2, dtype=paddle.int32).to(device="gpu:1") + + a0 = wp.from_paddle(t0, dtype=wp.int32) + a1 = wp.from_paddle(t1, dtype=wp.int32) + + assert a0.device == "gpu:0" + assert a1.device == "gpu:1" + + expected0 = np.arange(0, n, 1) + expected1 = np.arange(0, n * 2, 2) + + assert_np_equal(a0.numpy(), expected0) + assert_np_equal(a1.numpy(), expected1) + + +def test_paddle_mgpu_to_paddle(test, device): + n = 32 + + with wp.ScopedDevice("gpu:0"): + a0 = wp.empty(n, dtype=wp.int32) + wp.launch(arange, dim=a0.size, inputs=[0, 1, a0]) + + with wp.ScopedDevice("gpu:1"): + a1 = wp.empty(n, dtype=wp.int32) + wp.launch(arange, dim=a1.size, inputs=[0, 2, a1]) + + t0 = wp.to_paddle(a0) + t1 = wp.to_paddle(a1) + + assert str(t0.device) == "gpu:0" + assert str(t1.device) == "gpu:1" + + expected0 = np.arange(0, n, 1, dtype=np.int32) + expected1 = np.arange(0, n * 2, 2, dtype=np.int32) + + assert_np_equal(t0.cpu().numpy(), expected0) + assert_np_equal(t1.cpu().numpy(), expected1) + + +def test_paddle_mgpu_interop(test, device): + import paddle + + n = 1024 * 1024 + + with paddle.cuda.device(0): + t0 = paddle.arange(n, dtype=paddle.float32).to(device="gpu") + a0 = wp.from_paddle(t0) + wp.launch(inc, dim=a0.size, inputs=[a0], stream=wp.stream_from_paddle()) + + with paddle.cuda.device(1): + t1 = paddle.arange(n, dtype=paddle.float32).to(device="gpu") + a1 = wp.from_paddle(t1) + wp.launch(inc, 
dim=a1.size, inputs=[a1], stream=wp.stream_from_paddle()) + + assert a0.device == "gpu:0" + assert a1.device == "gpu:1" + + expected = np.arange(n, dtype=int) + 1 + + # ensure the paddle tensors were modified by warp + assert_np_equal(t0.cpu().numpy(), expected) + assert_np_equal(t1.cpu().numpy(), expected) + + +def test_paddle_autograd(test, device): + """Test paddle autograd with a custom Warp op""" + + import paddle + + # custom autograd op + class TestFunc(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, x): + # allocate output array + y = paddle.empty_like(x) + + ctx.x = x + ctx.y = y + + wp.launch(kernel=op_kernel, dim=len(x), inputs=[wp.from_paddle(x)], outputs=[wp.from_paddle(y)]) + + return y + + @staticmethod + def backward(ctx, adj_y): + # adjoints should be allocated as zero initialized + adj_x = paddle.zeros_like(ctx.x).contiguous() + adj_y = adj_y.contiguous() + + wp_x = wp.from_paddle(ctx.x, grad=adj_x) + wp_y = wp.from_paddle(ctx.y, grad=adj_y) + + wp.launch( + kernel=op_kernel, + dim=len(ctx.x), + # fwd inputs + inputs=[wp_x], + outputs=[wp_y], + # adj inputs (already stored in input/output arrays, passing null pointers) + adj_inputs=[None], + adj_outputs=[None], + adjoint=True, + ) + + return adj_x + + # run autograd on given device + with wp.ScopedDevice(device): + paddle_device = wp.device_to_paddle(device) + + # input data + x = paddle.ones(16, dtype=paddle.float32).to(device=paddle_device) + x.stop_gradient = False + + # execute op + y = TestFunc.apply(x) + + # compute grads + l = y.sum() + l.backward() + + passed = (x.grad == -2.0).all() + assert passed.item() + + +def test_warp_graph_warp_stream(test, device): + """Capture Warp graph on Warp stream""" + + import paddle + + paddle_device = wp.device_to_paddle(device) + + n = 1024 * 1024 + t = paddle.zeros(n, dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t) + + # make paddle use the warp stream from the given device + paddle_stream = wp.stream_to_paddle(device) + + # capture graph + with wp.ScopedDevice(device), paddle.device.stream(paddle_stream): + wp.capture_begin(force_module_load=False) + try: + t += 1.0 + wp.launch(inc, dim=n, inputs=[a]) + t += 1.0 + wp.launch(inc, dim=n, inputs=[a]) + finally: + g = wp.capture_end() + + # replay graph + num_iters = 10 + for _i in range(num_iters): + wp.capture_launch(g) + + passed = (t == num_iters * 4.0).all() + assert passed.item() + + +def test_warp_graph_paddle_stream(test, device): + """Capture Warp graph on Paddle stream""" + + wp.load_module(device=device) + + import paddle + + paddle_device = wp.device_to_paddle(device) + + n = 1024 * 1024 + t = paddle.zeros(n, dtype=paddle.float32).to(device=paddle_device) + a = wp.from_paddle(t) + + # create a device-specific paddle stream to use for capture + # (the default paddle stream is not suitable for graph capture) + paddle_stream = paddle.device.Stream(device=paddle_device) + + # make warp use the same stream + warp_stream = wp.stream_from_paddle(paddle_stream) + + # capture graph + with wp.ScopedStream(warp_stream): + wp.capture_begin(force_module_load=False) + try: + t += 1.0 + wp.launch(inc, dim=n, inputs=[a]) + t += 1.0 + wp.launch(inc, dim=n, inputs=[a]) + finally: + g = wp.capture_end() + + # replay graph + num_iters = 10 + for _i in range(num_iters): + wp.capture_launch(g) + + passed = (t == num_iters * 4.0).all() + assert passed.item() + + +def test_direct(test, device): + """Pass Paddle tensors to Warp kernels directly""" + + import paddle + + paddle_device = 
wp.device_to_paddle(device) + n = 12 + + s = paddle.arange(n, dtype=paddle.float32).to(device=paddle_device) + v = paddle.arange(n, dtype=paddle.float32).to(device=paddle_device).reshape((n // 3, 3)) + m = paddle.arange(n, dtype=paddle.float32).to(device=paddle_device).reshape((n // 4, 2, 2)) + + wp.launch(inc, dim=n, inputs=[s], device=device) + wp.launch(inc_vector, dim=n // 3, inputs=[v], device=device) + wp.launch(inc_matrix, dim=n // 4, inputs=[m], device=device) + + expected = paddle.arange(1, n + 1, dtype=paddle.float32).to(device=paddle_device) + + assert paddle.equal_all(s, expected).item() + assert paddle.equal_all(v.reshape([n]), expected).item() + assert paddle.equal_all(m.reshape([n]), expected).item() + + +class TestPaddle(unittest.TestCase): + pass + + +test_devices = get_test_devices() + +try: + import paddle + + # check which Warp devices work with Paddle + # CUDA devices may fail if Paddle was not compiled with CUDA support + paddle_compatible_devices = [] + paddle_compatible_cuda_devices = [] + + for d in test_devices: + try: + t = paddle.arange(10).to(device=wp.device_to_paddle(d)) + t += 1 + paddle_compatible_devices.append(d) + if d.is_cuda: + paddle_compatible_cuda_devices.append(d) + except Exception as e: + print(f"Skipping Paddle tests on device '{d}' due to exception: {e}") + + add_function_test(TestPaddle, "test_dtype_from_paddle", test_dtype_from_paddle, devices=None) + add_function_test(TestPaddle, "test_dtype_to_paddle", test_dtype_to_paddle, devices=None) + + if paddle_compatible_devices: + add_function_test( + TestPaddle, "test_device_conversion", test_device_conversion, devices=paddle_compatible_devices + ) + add_function_test(TestPaddle, "test_from_paddle", test_from_paddle, devices=paddle_compatible_devices) + add_function_test( + TestPaddle, "test_from_paddle_slices", test_from_paddle_slices, devices=paddle_compatible_devices + ) + add_function_test( + TestPaddle, "test_array_ctype_from_paddle", test_array_ctype_from_paddle, devices=paddle_compatible_devices + ) + add_function_test( + TestPaddle, + "test_from_paddle_zero_strides", + test_from_paddle_zero_strides, + devices=paddle_compatible_devices, + ) + add_function_test(TestPaddle, "test_to_paddle", test_to_paddle, devices=paddle_compatible_devices) + add_function_test(TestPaddle, "test_paddle_zerocopy", test_paddle_zerocopy, devices=paddle_compatible_devices) + add_function_test(TestPaddle, "test_paddle_autograd", test_paddle_autograd, devices=paddle_compatible_devices) + add_function_test(TestPaddle, "test_direct", test_direct, devices=paddle_compatible_devices) + + # NOTE: Graph not supported now + # if paddle_compatible_cuda_devices: + # add_function_test( + # TestPaddle, + # "test_warp_graph_warp_stream", + # test_warp_graph_warp_stream, + # devices=paddle_compatible_cuda_devices, + # ) + # add_function_test( + # TestPaddle, + # "test_warp_graph_paddle_stream", + # test_warp_graph_paddle_stream, + # devices=paddle_compatible_cuda_devices, + # ) + + # multi-GPU tests + if len(paddle_compatible_cuda_devices) > 1: + add_function_test(TestPaddle, "test_paddle_mgpu_from_paddle", test_paddle_mgpu_from_paddle) + add_function_test(TestPaddle, "test_paddle_mgpu_to_paddle", test_paddle_mgpu_to_paddle) + add_function_test(TestPaddle, "test_paddle_mgpu_interop", test_paddle_mgpu_interop) + +except Exception as e: + print(f"Skipping Paddle tests due to exception: {e}") + + +if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) diff --git a/warp/tests/test_static.py 
b/warp/tests/test_static.py new file mode 100644 index 00000000..d816af4f --- /dev/null +++ b/warp/tests/test_static.py @@ -0,0 +1,412 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import unittest +from typing import Dict, List + +import numpy as np + +import warp +import warp as wp +from warp.tests.unittest_utils import * + +global_variable = 3 + + +@wp.func +def static_global_variable_func(): + static_var = warp.static(global_variable + 2) + return static_var + + +@wp.kernel +def static_global_variable_kernel(results: wp.array(dtype=int)): + # evaluate a constant expression at codegen time + static_var = static_global_variable_func() + const_var = 3 + # call a function at codegen time + static_func_result = wp.static(static_global_variable_func() + const_var) + results[0] = static_var + results[1] = static_func_result + + +@wp.struct +class StaticallyConstructableStruct: + mat: wp.mat33 + vec: wp.vec3 + i: int + + +@wp.struct +class StaticallyConstructableNestedStruct: + s: StaticallyConstructableStruct + tf: wp.transform + quat: wp.quat + + +@wp.func +def construct_struct(mat: wp.mat33, vec: wp.vec3, i: int): + s = StaticallyConstructableStruct() + s.mat = mat + s.vec = vec + s.i = i + return s + + +@wp.func +def construct_nested_struct(mat: wp.mat33, vec: wp.vec3, i: int, tf: wp.transform, quat: wp.quat): + n = StaticallyConstructableNestedStruct() + n.s = construct_struct(mat, vec, i) + n.tf = tf + n.quat = quat + return n + + +@wp.kernel +def construct_static_struct_kernel(results: wp.array(dtype=StaticallyConstructableStruct)): + static_struct = wp.static( + construct_struct( + wp.mat33(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0), + wp.vec3(1.0, 2.0, 3.0), + 1, + ) + ) + results[0] = static_struct + + +@wp.kernel +def construct_static_nested_struct_kernel(results: wp.array(dtype=StaticallyConstructableNestedStruct)): + static_struct = wp.static( + construct_nested_struct( + wp.mat33(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0), + wp.vec3(1.0, 2.0, 3.0), + 1, + wp.transform(wp.vec3(1.0, 2.0, 3.0), wp.quat_from_axis_angle(wp.vec3(0.0, 1.0, 0.0), wp.pi / 2.0)), + wp.quat_from_axis_angle(wp.normalize(wp.vec3(1.0, 2.0, 3.0)), wp.pi / 2.0), + ) + ) + results[0] = static_struct + + +def test_static_global_variable(test, device): + results = wp.zeros(2, dtype=int, device=device) + wp.launch(static_global_variable_kernel, 1, [results], device=device) + assert_np_equal(results.numpy(), np.array([5, 8], dtype=int)) + + +def test_construct_static_struct(test, device): + results = wp.zeros(1, dtype=StaticallyConstructableStruct, device=device) + wp.launch(construct_static_struct_kernel, 1, [results], device=device) + results = results.numpy() + assert_np_equal(results[0][0], np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])) + assert_np_equal(results[0][1], np.array([1.0, 2.0, 3.0])) + assert_np_equal(results[0][2], 1) + + +def test_construct_static_nested_struct(test, device): + results = wp.zeros(1, dtype=StaticallyConstructableNestedStruct, device=device) + wp.launch(construct_static_nested_struct_kernel, 1, [results], device=device) + results = results.numpy() + + tf = wp.transform(wp.vec3(1.0, 2.0, 
3.0), wp.quat_from_axis_angle(wp.vec3(0.0, 1.0, 0.0), wp.pi / 2.0)) + quat = wp.quat_from_axis_angle(wp.normalize(wp.vec3(1.0, 2.0, 3.0)), wp.pi / 2.0) + + assert_np_equal(results[0][0][0], np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])) + assert_np_equal(results[0][0][1], np.array([1.0, 2.0, 3.0])) + assert_np_equal(results[0][0][2], 1) + assert_np_equal(results[0][1], np.array(tf)) + assert_np_equal(results[0][2], np.array(quat)) + + +def test_invalid_static_expression(test, device): + @wp.kernel + def invalid_kernel(): + wp.static(1.0 / 0.0) + + with test.assertRaisesRegex( + warp.codegen.WarpCodegenError, r"Error evaluating static expression\: float division by zero" + ): + wp.launch(invalid_kernel, 1, device=device) + + @wp.kernel + def invalid_kernel(i: int): + wp.static(i * 2) + + with test.assertRaisesRegex( + wp.codegen.WarpCodegenError, + r"Error evaluating static expression\: name 'i' is not defined\. Make sure all variables used in the static expression are constant\.", + ): + wp.launch(invalid_kernel, 1, device=device, inputs=[3]) + + +def test_static_expression_return_types(test, device): + @wp.kernel + def invalid_kernel(): + wp.static(wp.zeros(3, device=device)) + + with test.assertRaisesRegex( + warp.codegen.WarpCodegenError, + r"Static expression returns an unsupported value\: a Warp array cannot be created inside Warp kernels", + ): + wp.launch(invalid_kernel, 1, device=device) + + @wp.struct + class Baz: + data: wp.array(dtype=int) + z: wp.vec3 + + @wp.struct + class Bar: + baz: Baz + y: float + + @wp.struct + class Foo: + bar: Bar + x: int + + def create_struct(): + foo = Foo() + foo.bar = Bar() + foo.bar.baz = Baz() + foo.bar.baz.data = wp.zeros(3, dtype=int, device=device) + foo.bar.baz.z = wp.vec3(1, 2, 3) + foo.bar.y = 1.23 + foo.x = 123 + return foo + + @wp.kernel + def invalid_kernel(): + wp.static(create_struct()) + + with test.assertRaisesRegex( + warp.codegen.WarpCodegenError, + r"Static expression returns an unsupported value: the returned Warp struct contains a data type that cannot be constructed inside Warp kernels\: a Warp array cannot be created inside Warp kernels at .*?Foo\.bar\.baz", + ): + wp.launch(invalid_kernel, 1, device=device) + + def function_with_no_return_value(): + pass + + @wp.kernel + def invalid_kernel(): + wp.static(function_with_no_return_value()) + + with test.assertRaisesRegex( + warp.codegen.WarpCodegenError, + r"Static expression returns an unsupported value\: None is returned", + ): + wp.launch(invalid_kernel, 1, device=device) + + class MyClass: + pass + + @wp.kernel + def invalid_kernel(): + wp.static(MyClass()) + + with test.assertRaisesRegex( + warp.codegen.WarpCodegenError, + r"Static expression returns an unsupported value\: value of type .*?MyClass", + ): + wp.launch(invalid_kernel, 1, device=device) + + +def test_function_variable(test, device): + # create a function and pass it in as a static variable to the kernel + @wp.func + def func1(a: int, b: int): + return a + b + + @wp.func + def func2(a: int, b: int): + return a - b + + for func in [func1, func2]: + # note that this example also works without using wp.static() + + @wp.kernel + def function_variable_kernel(results: wp.array(dtype=int)): + results[0] = wp.static(func)(3, 2) # noqa: B023 + + results = wp.zeros(1, dtype=int, device=device) + # note that the kernel has to be recompiled everytime the value of func changes + wp.launch(function_variable_kernel, 1, [results], device=device) + assert_np_equal(results.numpy(), np.array([func(3, 2)], 
dtype=int)) + + +def test_function_lookup(test, device): + @wp.func + def do_add(a: float, b: float): + return a + b + + @wp.func + def do_sub(a: float, b: float): + return a - b + + @wp.func + def do_mul(a: float, b: float): + return a * b + + op_handlers = { + "add": do_add, + "sub": do_sub, + "mul": do_mul, + } + + inputs = wp.array([[1, 2], [3, 0]], dtype=wp.float32) + + outputs = wp.empty(2, dtype=wp.float32) + + for op in op_handlers.keys(): + + @wp.kernel + def operate(input: wp.array(dtype=inputs.dtype, ndim=2), output: wp.array(dtype=wp.float32)): + tid = wp.tid() + a, b = input[tid, 0], input[tid, 1] + # retrieve the right function to use for the captured dtype variable + output[tid] = wp.static(op_handlers[op])(a, b) # noqa: B023 + + wp.launch(operate, dim=2, inputs=[inputs], outputs=[outputs]) + outputs_np = outputs.numpy() + inputs_np = inputs.numpy() + for i in range(len(outputs_np)): + test.assertEqual(outputs_np[i], op_handlers[op](float(inputs_np[i][0]), float(inputs_np[i][1]))) + + +def count_ssa_occurrences(kernel: wp.Kernel, ssas: List[str]) -> Dict[str, int]: + # analyze the generated code + counts = {ssa: 0 for ssa in ssas} + for line in kernel.adj.blocks[0].body_forward: + for ssa in ssas: + if ssa in line: + counts[ssa] += 1 + return counts + + +def test_static_for_loop(test, device): + @wp.kernel + def static_loop_variable(results: wp.array(dtype=int)): + s = 0 + for i in range(wp.static(static_global_variable_func())): + s += wp.static(i) + results[0] = s + + wp.set_module_options( + options={"max_unroll": static_global_variable_func()}, + ) + + results = wp.zeros(1, dtype=int, device=device) + wp.launch(static_loop_variable, 1, [results], device=device) + results = results.numpy() + + s = 0 + for i in range(wp.static(static_global_variable_func())): + s += wp.static(i) + + test.assertEqual(results[0], s, "Static for loop has to compute the correct solution") + + # analyze the generated code + if hasattr(static_loop_variable.adj, "blocks"): + counts = count_ssa_occurrences(static_loop_variable, ["add", "for"]) + + test.assertEqual(counts["add"], static_global_variable_func(), "Static for loop must be unrolled") + # there is just one occurrence of "for" in the comment referring to the original Python code + test.assertEqual(counts["for"], 1, "Static for loop must be unrolled") + + +def test_static_if_else_elif(test, device): + @wp.kernel + def static_condition1(results: wp.array(dtype=int)): + if wp.static(static_global_variable_func() in {2, 3, 5}): + results[0] = 1 + elif wp.static(static_global_variable_func() in {0, 1}): + results[0] = 2 + else: + results[0] = 3 + + results = wp.zeros(1, dtype=int, device=device) + wp.launch(static_condition1, 1, [results], device=device) + results = results.numpy() + assert_np_equal(results[0], 1) + # TODO this needs fixing to ensure we can run these tests multiple times + if hasattr(static_condition1.adj, "blocks"): + counts = count_ssa_occurrences(static_condition1, ["if", "else"]) + + # if, else, elif can appear as comments but the generated code must not contain + # such keywords since the conditions are resolved at the time of code generation + assert_np_equal(counts["if"], 1) + assert_np_equal(counts["else"], 0) + + captured_var = "hello" + + @wp.kernel + def static_condition2(results: wp.array(dtype=int)): + if wp.static(captured_var == "world"): + results[0] = 1 + else: + results[0] = 2 + + results = wp.zeros(1, dtype=int, device=device) + wp.launch(static_condition2, 1, [results], device=device) + results = 
results.numpy() + assert_np_equal(results[0], 2) + if hasattr(static_condition2.adj, "blocks"): + counts = count_ssa_occurrences(static_condition2, ["if", "else"]) + assert_np_equal(counts["if"], 1) + assert_np_equal(counts["else"], 0) + + my_list = [1, 2, 3] + + @wp.kernel + def static_condition3(results: wp.array(dtype=int)): + if wp.static(len(my_list) == 0): + results[0] = 0 + elif wp.static(len(my_list) == 1): + results[0] = 1 + elif wp.static(len(my_list) == 2): + results[0] = 2 + elif wp.static(len(my_list) == 3): + results[0] = 3 + + results = wp.zeros(1, dtype=int, device=device) + wp.launch(static_condition3, 1, [results], device=device) + results = results.numpy() + assert_np_equal(results[0], 3) + if hasattr(static_condition3.adj, "blocks"): + counts = count_ssa_occurrences(static_condition3, ["if", "else"]) + assert_np_equal(counts["if"], 4) + assert_np_equal(counts["else"], 0) + + +devices = get_test_devices() + + +class TestStatic(unittest.TestCase): + def test_static_python_call(self): + # ensure wp.static() works from a Python context + self.assertEqual(static_global_variable_func(), 5) + + +add_function_test(TestStatic, "test_static_global_variable", test_static_global_variable, devices=devices) +add_function_test(TestStatic, "test_construct_static_struct", test_construct_static_struct, devices=devices) +add_function_test( + TestStatic, "test_construct_static_nested_struct", test_construct_static_nested_struct, devices=devices +) +add_function_test(TestStatic, "test_function_variable", test_function_variable, devices=devices) +add_function_test(TestStatic, "test_function_lookup", test_function_lookup, devices=devices) +add_function_test(TestStatic, "test_invalid_static_expression", test_invalid_static_expression, devices=devices) +add_function_test( + TestStatic, "test_static_expression_return_types", test_static_expression_return_types, devices=devices +) +add_function_test(TestStatic, "test_static_for_loop", test_static_for_loop, devices=devices) +add_function_test(TestStatic, "test_static_if_else_elif", test_static_if_else_elif, devices=devices) + + +if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) diff --git a/warp/tests/test_tile.py b/warp/tests/test_tile.py index 8b1d3157..51cf7307 100644 --- a/warp/tests/test_tile.py +++ b/warp/tests/test_tile.py @@ -21,6 +21,7 @@ # num threads per-tile TILE_DIM = 64 + @wp.kernel def tile_copy_1d_kernel(A: wp.array(dtype=float), B: wp.array(dtype=float)): # tile index @@ -59,6 +60,7 @@ def test_tile_copy_1d(test, device): assert_array_equal(B_wp.grad, A_wp.grad) + @wp.kernel def tile_copy_2d_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)): # tile index @@ -450,18 +452,18 @@ def test_tile_transpose(test, device): assert_np_equal(output.numpy(), input.numpy().T) -@wp.kernel -def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): - x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) - y = wp.tile_transpose(x) - - z = wp.tile_zeros(dtype=float, m=TILE_N, n=TILE_N) - wp.tile_matmul(y, x, z) +@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") +def test_tile_transpose_matmul(test, device): + @wp.kernel + def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)): + x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N) + y = wp.tile_transpose(x) - wp.tile_store(output, 0, 0, z) + z = wp.tile_zeros(dtype=float, m=TILE_N, n=TILE_N) + wp.tile_matmul(y, 
x, z) + wp.tile_store(output, 0, 0, z) -def test_tile_transpose_matmul(test, device): rng = np.random.default_rng(42) input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device) output = wp.zeros((TILE_N, TILE_N), dtype=float, device=device) @@ -473,57 +475,53 @@ def test_tile_transpose_matmul(test, device): @wp.kernel def test_tile_broadcast_add_kernel( - input_a: wp.array2d(dtype=float), - input_b: wp.array(dtype=float), - output: wp.array2d(dtype=float)): - + input_a: wp.array2d(dtype=float), input_b: wp.array(dtype=float), output: wp.array2d(dtype=float) +): a = wp.tile_load(input_a, 0, 0, m=10, n=10) b = wp.tile_load(input_b, 0, n=10) c = wp.tile_broadcast(b, 10, 10) d = a + c - wp.tile_store(output, 0, 0, d) + wp.tile_store(output, 0, 0, d) -def test_tile_broadcast_add(test, device): +def test_tile_broadcast_add(test, device): M = 10 N = 10 - - a = wp.array(np.ones((M,N), dtype=np.float32), device=device) + + a = wp.array(np.ones((M, N), dtype=np.float32), device=device) b = wp.array(np.arange(0, N, dtype=np.float32), device=device) - out = wp.zeros((M,N), dtype=float, device=device) + out = wp.zeros((M, N), dtype=float, device=device) + + wp.launch_tiled(test_tile_broadcast_add_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device) - wp.launch_tiled(test_tile_broadcast_add_kernel, dim=[1], inputs=[a, b, out], block_dim=32) - assert_np_equal(out.numpy(), a.numpy() + b.numpy()) @wp.kernel -def test_tile_broadcast_grad_kernel( - a: wp.array(dtype=float), - b: wp.array2d(dtype=float)): - +def test_tile_broadcast_grad_kernel(a: wp.array(dtype=float), b: wp.array2d(dtype=float)): x = wp.tile_load(a, i=0, n=5) y = wp.tile_broadcast(x, m=5, n=5) w = wp.tile_ones(dtype=float, m=5, n=5) z = w + y - + wp.tile_store(b, 0, 0, z) + def test_tile_broadcast_grad(test, device): - - a = wp.array(np.arange(0, 5, dtype=np.float32), requires_grad=True) - b = wp.array(np.ones((5, 5), dtype=np.float32), requires_grad=True) + a = wp.array(np.arange(0, 5, dtype=np.float32), requires_grad=True, device=device) + b = wp.array(np.ones((5, 5), dtype=np.float32), requires_grad=True, device=device) - with wp.Tape() as tape: - wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=32) + with wp.Tape() as tape: + wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device) - b.grad = wp.ones_like(b) + b.grad = wp.ones_like(b, device=device) tape.backward() - assert_np_equal(a.grad.numpy(), np.ones(5)*5.0) + assert_np_equal(a.grad.numpy(), np.ones(5) * 5.0) + # #----------------------------------------- # # center of mass computation @@ -615,9 +613,9 @@ class TestTile(unittest.TestCase): add_function_test(TestTile, "test_tile_copy_2d", test_tile_copy_2d, devices=devices) add_function_test(TestTile, "test_tile_unary_map", test_tile_unary_map, devices=devices) add_function_test(TestTile, "test_tile_binary_map", test_tile_binary_map, devices=devices) -add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices) +add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices) add_function_test(TestTile, "test_tile_gemm", test_tile_gemm, devices=devices) -add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices) +add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices) add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices) add_function_test(TestTile, 
"test_tile_operators", test_tile_operators, devices=devices) add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index 5e48b62f..fdf59259 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -280,23 +280,19 @@ def test_tile_untile_scalar(test, device): assert_np_equal(output.numpy(), np.arange(N) * 2) - @wp.kernel -def test_untile_vector_kernel( - input: wp.array(dtype=wp.vec3), - output: wp.array(dtype=wp.vec3)): - +def test_untile_vector_kernel(input: wp.array(dtype=wp.vec3), output: wp.array(dtype=wp.vec3)): i = wp.tid() - v = input[i]*0.5 + v = input[i] * 0.5 t = wp.tile(v) u = wp.untile(t) - output[i] = u*2.0 + output[i] = u * 2.0 -def test_tile_untile_vector(test, device): +def test_tile_untile_vector(test, device): input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True) output = wp.zeros_like(input) @@ -321,7 +317,6 @@ def tile_ones_kernel(out: wp.array(dtype=float)): def test_tile_ones(test, device): - output = wp.zeros(1, dtype=float, device=device) with wp.Tape() as tape: diff --git a/warp/tests/test_torch.py b/warp/tests/test_torch.py index 1a4bd0b1..40c77899 100644 --- a/warp/tests/test_torch.py +++ b/warp/tests/test_torch.py @@ -382,6 +382,27 @@ def wrap_vec_tensor_with_warp_grad(vec_dtype): wrap_vec_tensor_with_warp_grad(wp.transform) +def test_cuda_array_interface(test, device): + # We should be able to construct Torch tensors from Warp arrays via __cuda_array_interface__ on GPU. + # Note that Torch does not support __array_interface__ on CPU. + + torch_device = wp.device_to_torch(device) + n = 10 + + # test the types supported by both Warp and Torch + scalar_types = [wp.float16, wp.float32, wp.float64, wp.int8, wp.int16, wp.int32, wp.int64, wp.uint8] + + for dtype in scalar_types: + # test round trip + a1 = wp.zeros(n, dtype=dtype, device=device) + t = torch.tensor(a1, device=torch_device) + a2 = wp.array(t, device=device) + + assert a1.dtype == a2.dtype + assert a1.shape == a2.shape + assert a1.strides == a2.strides + + def test_to_torch(test, device): import torch @@ -918,6 +939,9 @@ class TestTorch(unittest.TestCase): test_warp_graph_torch_stream, devices=torch_compatible_cuda_devices, ) + add_function_test( + TestTorch, "test_cuda_array_interface", test_cuda_array_interface, devices=torch_compatible_cuda_devices + ) # multi-GPU tests if len(torch_compatible_cuda_devices) > 1: diff --git a/warp/tests/test_types.py b/warp/tests/test_types.py index 51f5f99b..bf859d1b 100644 --- a/warp/tests/test_types.py +++ b/warp/tests/test_types.py @@ -215,7 +215,7 @@ def test_constant(self): self.assertEqual(const, wp.vec3i(1, 2, 3)) def test_constant_error_invalid_type(self): - with self.assertRaisesRegex(RuntimeError, r"Invalid constant type: $"): + with self.assertRaisesRegex(TypeError, r"Invalid constant type: $"): wp.constant((1, 2, 3)) def test_vector_assign(self): diff --git a/warp/thirdparty/dlpack.py b/warp/thirdparty/dlpack.py index 0634474b..399e0002 100644 --- a/warp/thirdparty/dlpack.py +++ b/warp/thirdparty/dlpack.py @@ -58,6 +58,7 @@ class DLDataTypeCode(ctypes.c_uint8): kDLOpaquePointer = 3 kDLBfloat = 4 kDLComplex = 5 + kDLBool = 6 def __str__(self): return { @@ -66,6 +67,7 @@ def __str__(self): self.kDLFloat: "float", self.kDLBfloat: "bfloat", self.kDLComplex: "complex", + self.kDLBool: "bool", self.kDLOpaquePointer: "void_p", }[self.value] @@ -85,7 +87,7 @@ class DLDataType(ctypes.Structure): ("lanes", ctypes.c_uint16), ] 
TYPE_MAP = { - "bool": (DLDataTypeCode.kDLUInt, 1, 1), + "bool": (DLDataTypeCode.kDLBool, 8, 1), "int8": (DLDataTypeCode.kDLInt, 8, 1), "int16": (DLDataTypeCode.kDLInt, 16, 1), "int32": (DLDataTypeCode.kDLInt, 32, 1), diff --git a/warp/types.py b/warp/types.py index 9a0aa8a0..94fee051 100644 --- a/warp/types.py +++ b/warp/types.py @@ -66,8 +66,8 @@ def constant(x): x: Compile-time constant value, can be any of the built-in math types. """ - if not isinstance(x, (builtins.bool, int, float, tuple(scalar_and_bool_types), ctypes.Array)): - raise RuntimeError(f"Invalid constant type: {type(x)}") + if not is_value(x): + raise TypeError(f"Invalid constant type: {type(x)}") return x @@ -1302,7 +1302,7 @@ def type_to_warp(dtype): def type_typestr(dtype): if dtype == bool: - return "?" + return "|b1" elif dtype == float16: return "()" else: - if self.owner == False: + if not self.owner: # will be initialized by subsequent call, e.g.: t = tile_broadcast(a) return "NULL" else: From 6d8d54289abe142db87aa6a95e259c5ca3285b3c Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 1 Oct 2024 14:27:16 -0700 Subject: [PATCH 055/102] Use intended device in test_tile_untile_vector --- warp/tests/test_tile_reduce.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/warp/tests/test_tile_reduce.py b/warp/tests/test_tile_reduce.py index fdf59259..bf8650e8 100644 --- a/warp/tests/test_tile_reduce.py +++ b/warp/tests/test_tile_reduce.py @@ -293,13 +293,13 @@ def test_untile_vector_kernel(input: wp.array(dtype=wp.vec3), output: wp.array(d def test_tile_untile_vector(test, device): - input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True) - output = wp.zeros_like(input) + input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True, device=device) + output = wp.zeros_like(input, device=device) with wp.Tape() as tape: - wp.launch(test_untile_vector_kernel, dim=16, inputs=[input, output], block_dim=16) + wp.launch(test_untile_vector_kernel, dim=16, inputs=[input, output], block_dim=16, device=device) - output.grad = wp.ones_like(output) + output.grad = wp.ones_like(output, device=device) tape.backward() assert_np_equal(output.numpy(), input.numpy()) From bddf3d25bee75d3c803fb3aca42d43f769dd107c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 1 Oct 2024 21:33:36 +0000 Subject: [PATCH 056/102] Tile documentation fixes --- docs/modules/tiles.rst | 127 +++++++++++++++++++++++++++++------------ 1 file changed, 90 insertions(+), 37 deletions(-) diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst index 48d2c788..27706423 100644 --- a/docs/modules/tiles.rst +++ b/docs/modules/tiles.rst @@ -10,33 +10,10 @@ Warp 1.4.0 introduces tile extensions that expose a block-based programming to W Execution Model --------------- -Warp's execution model allows users to specify an up to 4-dimensional grid of logical threads for kernel execution at launch time. With the introduction of tile primitives, users can additionally specify a block size, which partitions the thread grid into smaller sets of threads that are executed on a single compute unit. +Warp's execution model allows users to specify an up to 4-dimensional grid of logical threads for kernel execution at launch time. With the introduction of tile primitives, users can now specify the block size for kernel launches, which partitions the thread grid into smaller sets of threads that are executed on a single compute unit. 
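(Editorial aside, not part of the patch: the block size referred to above is the ``block_dim`` argument accepted by the launch functions used throughout this series. The sketch below illustrates the partitioning under the assumption that threads are grouped into blocks in linear ``wp.tid()`` order, consistent with the row-per-block examples that follow; the kernel name, array name, and block-index computation are placeholders for illustration only.)

.. code:: python

    import warp as wp

    TILE_THREADS = 64

    @wp.kernel
    def write_block_index(out: wp.array(dtype=int)):
        i = wp.tid()
        # with block_dim=64, threads 0-63 share the first cooperative block,
        # threads 64-127 the next, and so on; tile operations inside a kernel
        # are executed jointly by the threads of one such block
        out[i] = i / TILE_THREADS

    out = wp.zeros(256, dtype=int)

    # block_dim partitions the 256 logical threads into 4 blocks of 64 threads
    wp.launch(write_block_index, dim=[out.shape[0]], inputs=[out], block_dim=TILE_THREADS)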
Inside kernels, tile operations are executed cooperatively across each block of threads, allowing them to take advantage of efficient memory access, local memory, and dedicated hardware units like TensorCores. -As an example, consider the following kernel: - -.. code:: python - - TILE_SIZE = wp.constant(256) - TILE_THREADS = 64 - - @wp.kernel - def compute(a: array(dtype=float)) - i = wp.tid()/TILE_SIZE - - t = wp.tile_load(array, i, TILE_SIZE) - ... - - wp.launch(compute, dim=[len(a)], inputs=[a], block_dim=TILE_THREADS) - -Here, each block loads a 1D tile of 256 values from a global memory array ``a``, where the load operation is performed cooperatively by all 64 threads in the block, as specified by the ``block_dim`` argument to :func:`warp.launch`. In this case, each thread is responsible for loading 4 values from global memory, which may then be stored in registers, or shared memory across the block. - -Tile Properties ---------------- - -In Warp, tile objects are 2D arrays of data where the tile elements may be scalars, vectors, matrices, or user defined structures. - In the following example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum: .. code:: python @@ -46,18 +23,24 @@ In the following example, we launch a grid of threads where each block is respon @wp.kernel def compute(a: array2d(dtype=float)) - i, _ = wp.tid() + + # obtain our block index + i = wp.tid() # load a row from global memory - t = wp.tile_load(array, i, 0, 1, TILE_SIZE) + t = wp.tile_load(array[i], i, TILE_SIZE) s = wp.sum(t) ... - wp.launch(compute, dim=[a.shape[0], TILE_THREADS], inputs=[a], block_dim=TILE_THREADS) + wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a], block_dim=TILE_THREADS) -Here, we launch a 2D grid of threads where the trailing dimension is equal to the block size. This ensures we have an entire block of threads dedicated to each row. Each block then loads an entire row of 256 values from the global memory array and computes its sum. +Here, we have used the new :func:`warp.launch_tiled` function which assigns ``TILE_THREADS`` to each of the elements in the launch grid. Each block then loads an entire row of 256 values from the global memory array, computes its sum (cooperatively), and then stores the result back to global memory. + + +Tile Properties +--------------- -To streamline this common pattern Warp provides a helper ``wp.tiled_launch()`` which takes care of adding the trailing tile dimension to the thread grid, for example, to assign a block of 64 threads to load and sum a 2D array of values we can do the following: +In Warp, tile objects are 2D arrays of data where the tile elements may be scalars, vectors, matrices, or user defined structures. We can load 2D tiles directly from 2D global memory arrays as follows: .. code:: python @@ -67,16 +50,18 @@ To streamline this common pattern Warp provides a helper ``wp.tiled_launch()`` w @wp.kernel def compute(a: array2d(dtype=float)) + + # obtain our 2d block index i, j = wp.tid() - # load a row from global memory + # load a 2d tile from global memory t = wp.tile_load(array, i, j, TILE_M, TILE_N) s = wp.sum(t) ... wp.launch_tiled(compute, dim=[a.shape[0]/TILE_M, a.shape[1]/TILE_N], inputs=[a], block_dim=TILE_THREADS) -In this example, we use :func:`warp.launch_tiled` to automatically insert the trailing dimension, and assign ``TILE_THREADS`` to each 2D tile of the array. 
Each tile consists of ``16*16=256`` values, which are loaded cooperatively by the 64 threads in each block. +Here we divide the array ``a`` into 2d tiles of shape 16x16, each block cooperatively loads tile from the input array and computes its sum before returning the result. Tile Storage ------------ @@ -86,16 +71,86 @@ When tiles are created they are placed in either `register` or `shared` memory. Register Tiles ++++++++++++++ -Values in register tiles are stored across the entire block, for example, if the block dimension at launch is set to 64, a register tile with ``shape=(1, 256)`` will result in each thread storing 4 elements. Reigster based storage is the fastest storage on most hardware, however, because the tile storage is spread across the threads in the block, an individual thread cannot randomly access data that is assigned to another thread efficiently. For this reason operations on tiles tend to expressed as higher level maps, reductions, and reshaping operations that may transfer values through shared memory. +Values in register tiles are stored across the entire block, for example, if the block dimension at launch is set to 64, a register tile with ``shape=(1, 256)`` will result in each thread storing 4 elements. Register based storage is the fastest storage on most hardware, however, because the tile storage is spread across the threads in the block, an individual thread cannot randomly access data that is assigned to another thread efficiently. For this reason operations on tiles tend to expressed as higher level maps, reductions, and reshaping operations that may transfer values through shared memory. Shared Memory Tiles +++++++++++++++++++ -Some operations like matrix multiplication, require access to an entire tile of values. In this case the tile data may stored in shared memory, which allows efficient random access. Warp will automatically migrate tiles to shared memory as necessary for specific operations. Shared memory is a limited resource, and so tile size must be set appropriately to avoid exceeding the hardware limitations, otherwise kernel compilation may fail. +Some operations like matrix multiplication, require access to an entire tile of values. In this case the tile data may be stored in shared memory, which allows efficient random access. Warp will automatically migrate tiles to shared memory as necessary for specific operations. Shared memory is a limited resource, and so tile size must be set appropriately to avoid exceeding the hardware limitations, otherwise kernel compilation may fail. + +Example: GEMM +------------- + +.. 
code:: python + + import numpy as np + import warp as wp + + # tile size + TILE_M = wp.constant(8) + TILE_N = wp.constant(4) + TILE_K = wp.constant(8) + + # num threads per-tile + TILE_THREADS = 64 + + @wp.kernel + def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): + + # output tile index + i, j = wp.tid() + + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + + M = A.shape[0] + N = B.shape[1] + K = A.shape[1] + + count = int(K / TILE_K) + + for k in range(0, count): + a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K) + b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N) + + # sum += a*b + wp.tile_matmul(a, b, sum) + + wp.tile_store(C, i, j, sum) + + + + if __name__ == "__main__": + + # generate some tile aligned matrix dimensions + M = TILE_M * 7 + K = TILE_K * 6 + N = TILE_N * 5 + + rng = np.random.default_rng(42) + A = rng.random((M, K), dtype=np.float32) + B = rng.random((K, N), dtype=np.float32) + C = np.zeros((M, N), dtype=np.float32) + + A_wp = wp.array(A) + B_wp = wp.array(B) + C_wp = wp.array(C) + + with wp.Tape() as tape: + wp.launch_tiled( + tile_gemm, + dim=(int(M / TILE_M), int(N / TILE_N)), + inputs=[A_wp, B_wp, C_wp], + block_dim=TILE_THREADS) + + assert(np.allclose(C_wp.numpy(), A@B)) + + print("Example matrix multiplication passed") + Tile Operations --------------- + Construction ++++++++++++ @@ -132,9 +187,7 @@ Linear Algebra Tiles and SIMT Code ------------------- -Warp kernels are primarily written in the SIMT programming model in mind, where each thread's execution happens completely independently. Tiles on the other hand allow threads to work cooperatively to perform operations. - -Warp aims to give users a way to seamlessly integrate tile operations with existing SIMT code. To this end, we expose two operations, :func:`warp.tile`, and :func:`warp.untile` which can be used as follows: +Traditionally Warp kernels are primarily written in the SIMT programming model, where each thread's execution happens independently. Tiles on the other hand allow threads to work cooperatively to perform operations. Warp exposes :func:`warp.tile`, and :func:`warp.untile` methods to convert data between per-thread value types and the equivalent tile representation. For example: .. code:: python @@ -155,7 +208,7 @@ Warp aims to give users a way to seamlessly integrate tile operations with exist # launch as regular SIMT kernel wp.launch(compute, dim=[N], inputs=[], block_dim=TILE_THREADS) -In this example we perform some per-thread computations, and then convert the scalar ``x`` value into a tile object using the :func:`warp.tile` function. This function takes a single value as input, and returns a tile with the same dimensions as the number of threads in the block. From here, the tile can used in other regular cooperative operations such as reductions, GEMMs, etc. +In this example we have launched a regular SIMT grid using ``wp.launch()``, with ``N`` logical threads. The kernel performs some per-thread computations, and then converts the scalar ``x`` value into a tile object using the :func:`warp.tile` function. This function takes a single value as input, and returns a tile with the same dimensions as the number of threads in the block. From here, the tile can used in other regular cooperative operations such as reductions, GEMMs, etc. Similarly, we can `untile` tile objects back to their per-thread scalar equivalent values. 
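For readers following the ``wp.tile()`` / ``wp.untile()`` discussion above, the sketch below is a minimal end-to-end version adapted from the ``test_untile_vector_kernel`` test earlier in this series. It is editorial rather than part of the patch; the kernel name, array size, and gradient seeding are illustrative choices.

.. code:: python

    import numpy as np
    import warp as wp

    TILE_THREADS = 16

    @wp.kernel
    def scale_kernel(input: wp.array(dtype=wp.vec3), output: wp.array(dtype=wp.vec3)):
        i = wp.tid()

        # ordinary per-thread (SIMT) computation
        v = input[i] * 0.5

        # gather the per-thread values into a block-wide tile, then scatter them back
        t = wp.tile(v)
        u = wp.untile(t)

        output[i] = u * 2.0

    input = wp.full(TILE_THREADS, wp.vec3(1.0, 2.0, 3.0), requires_grad=True)
    output = wp.zeros_like(input)

    with wp.Tape() as tape:
        wp.launch(scale_kernel, dim=TILE_THREADS, inputs=[input, output], block_dim=TILE_THREADS)

    output.grad = wp.ones_like(output)
    tape.backward()

    # the 0.5 * 2.0 round trip leaves the values unchanged, so the output matches the input
    assert np.allclose(output.numpy(), input.numpy())

Because the kernel only rescales each element, the backward pass should propagate a gradient of one per component back to ``input.grad``.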
From 68c0b68ad4e009a39c526ebe1721663e09a40708 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Thu, 3 Oct 2024 15:37:10 -0700 Subject: [PATCH 057/102] Minor adjustments to docs and docstrings --- docs/modules/functions.rst | 32 +++++++++++--------- docs/modules/runtime.rst | 2 +- docs/modules/tiles.rst | 62 +++++++++++++++++++++----------------- warp/builtins.py | 32 +++++++++++--------- warp/stubs.py | 32 +++++++++++--------- warp/types.py | 2 +- 6 files changed, 88 insertions(+), 74 deletions(-) diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 30a9fd80..8fcc6f83 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -804,29 +804,29 @@ Tile Primitives --------------- .. py:function:: tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile - Allocates a tile of zero initialized items. + Allocates a tile of zero-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements - :returns: A zero initialized tile with ``shape=(m,n)`` and the specified datatype + :returns: A zero-initialized tile with ``shape=(m,n)`` and the specified datatype .. py:function:: tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile - Allocates a tile of one initialized items. + Allocates a tile of one-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements - :returns: A one initialized tile with ``shape=(m,n)`` and the specified dtype + :returns: A one-initialized tile with ``shape=(m,n)`` and the specified dtype .. py:function:: tile_arange(*args: Scalar, dtype: Scalar) -> Tile Generates a tile of linearly spaced elements. - :param args: Variable length positional arguments, interpreted as: + :param args: Variable-length positional arguments, interpreted as: - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` - ``(start, stop)``: Generates values from ``start`` to ``stop - 1`` @@ -902,12 +902,12 @@ Tile Primitives .. py:function:: tile(x: Any) -> Tile - Constructs a new Tile from a per-thread kernel values. + Constructs a new Tile from per-thread kernel values. This function converts values computed using scalar kernel code to a tile representation for input into collective operations. - * If the input value is a scalar then the resulting tile has ``shape=(1, block_dim)`` - * If the input value is a vector then the resulting tile has ``shape=(length(vector), block_dim)`` + * If the input value is a scalar, then the resulting tile has ``shape=(1, block_dim)`` + * If the input value is a vector, then the resulting tile has ``shape=(length(vector), block_dim)`` :param x: A per-thread local value, e.g.: scalar, vector, or matrix. :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim`` @@ -940,7 +940,7 @@ Tile Primitives This function converts a block-wide tile back to per-thread values. 
* If the input tile is 1-dimensional then the resulting value will be a per-thread scalar - * If the input tile is 2-dimensional then the the resulting value will be a per-thread vector of length M + * If the input tile is 2-dimensional then the resulting value will be a per-thread vector of length M :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile @@ -980,7 +980,9 @@ Tile Primitives Extracts a single element from the tile and returns it as a scalar type. - This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. + This function will extract an element from the tile and broadcast its value to all threads in the block. + + Note that this may incur additional synchronization if the source tile is a register tile. :param a: Tile to extract the element from :param i: Coordinate of element on first dimension @@ -1010,10 +1012,10 @@ Tile Primitives .. py:function:: tile_sum(a: Tile) -> Tile - Cooperatively compute the sum the tile elements using all threads in the block. + Cooperatively compute the sum of the tile elements using all threads in the block. :param a: The tile to compute the sum of - :returns: A single element tile with dimensions of (1,1) holding the sum + :returns: A single-element tile with dimensions of (1,1) holding the sum Example: @@ -1043,7 +1045,7 @@ Tile Primitives Cooperatively compute the minimum of the tile elements using all threads in the block. :param a: The tile to compute the minimum of - :returns: A single element tile with dimensions of (1,1) holding the minimum value + :returns: A single-element tile with dimensions of (1,1) holding the minimum value Example: @@ -1073,7 +1075,7 @@ Tile Primitives Cooperatively compute the maximum of the tile elements using all threads in the block. :param a: The tile to compute the maximum from - :returns: A single element tile with dimensions of (1,1) holding the maximum value + :returns: A single-element tile with dimensions of (1,1) holding the maximum value Example: @@ -1106,7 +1108,7 @@ Tile Primitives :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype - :returns: A single element tile with ``shape=(1,1)`` with the same datatype as the input tile. + :returns: A single-element tile with ``shape=(1,1)`` with the same datatype as the input tile. Example: diff --git a/docs/modules/runtime.rst b/docs/modules/runtime.rst index aa628608..3c96be47 100644 --- a/docs/modules/runtime.rst +++ b/docs/modules/runtime.rst @@ -819,7 +819,7 @@ To record a series of kernel launches use the :func:`wp.capture_begin() ` gets called, even if an exception occurs during capture, which would otherwise trap the stream in a capturing state. diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst index 27706423..044bfa32 100644 --- a/docs/modules/tiles.rst +++ b/docs/modules/tiles.rst @@ -3,16 +3,16 @@ Tiles .. warning:: Tile-based operations in Warp are under preview, APIs are subject to change. -Block-based programming models such as those in OpenAI Triton have proved to be effective ways of expressing high performance kernels that can leverage cooperative operations on modern GPUs. 
+Block-based programming models such as those in OpenAI Triton have proved to be effective ways of expressing high-performance kernels that can leverage cooperative operations on modern GPUs. Warp 1.4.0 introduces tile extensions that expose a block-based programming to Warp kernels. Execution Model --------------- -Warp's execution model allows users to specify an up to 4-dimensional grid of logical threads for kernel execution at launch time. With the introduction of tile primitives, users can now specify the block size for kernel launches, which partitions the thread grid into smaller sets of threads that are executed on a single compute unit. +Warp's execution model allows users to specify a grid of logical threads with up to 4 dimensions for kernel execution at launch time. With the introduction of tile primitives, users can now specify the *block size* for kernel launches, which partitions the thread grid into smaller sets of threads that are executed on a single compute unit. -Inside kernels, tile operations are executed cooperatively across each block of threads, allowing them to take advantage of efficient memory access, local memory, and dedicated hardware units like TensorCores. +Inside kernels, tile operations are executed cooperatively across each block of threads, allowing them to take advantage of efficient memory access, local memory, and dedicated hardware units like `Tensor Cores `__. In the following example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum: @@ -34,13 +34,13 @@ In the following example, we launch a grid of threads where each block is respon wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a], block_dim=TILE_THREADS) -Here, we have used the new :func:`warp.launch_tiled` function which assigns ``TILE_THREADS`` to each of the elements in the launch grid. Each block then loads an entire row of 256 values from the global memory array, computes its sum (cooperatively), and then stores the result back to global memory. +Here, we have used the new :func:`warp.launch_tiled` function which assigns ``TILE_THREADS`` threads to each of the elements in the launch grid. Each block of ``TILE_THREADS`` threads then loads an entire row of 256 values from the global memory array and computes its sum (cooperatively). Tile Properties --------------- -In Warp, tile objects are 2D arrays of data where the tile elements may be scalars, vectors, matrices, or user defined structures. We can load 2D tiles directly from 2D global memory arrays as follows: +In Warp, tile objects are 2D arrays of data where the tile elements may be scalars, vectors, matrices, or user-defined structures. We can load 2D tiles directly from 2D global memory arrays as follows: .. code:: python @@ -55,31 +55,42 @@ In Warp, tile objects are 2D arrays of data where the tile elements may be scala i, j = wp.tid() # load a 2d tile from global memory - t = wp.tile_load(array, i, j, TILE_M, TILE_N) + t = wp.tile_load(array, i, j, m=TILE_M, n=TILE_N) s = wp.sum(t) ... wp.launch_tiled(compute, dim=[a.shape[0]/TILE_M, a.shape[1]/TILE_N], inputs=[a], block_dim=TILE_THREADS) -Here we divide the array ``a`` into 2d tiles of shape 16x16, each block cooperatively loads tile from the input array and computes its sum before returning the result. +Here, we divide the array ``a`` into 2D tiles of shape 16 x 16. +Each block cooperatively loads a tile from the input array and computes its sum. 
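To round out the snippet above, one way the per-tile sums might be written back out is sketched below. This is editorial rather than part of the patch; it assumes the array extents are exact multiples of the tile shape, and that storing the ``(1,1)`` tile returned by ``wp.tile_sum()`` at tile index ``(i, j)`` lands in ``sums[i, j]``, following the offset convention described for :func:`warp.tile_store`. The kernel name and the ``sums`` output array are illustrative.

.. code:: python

    import numpy as np
    import warp as wp

    TILE_M = wp.constant(16)
    TILE_N = wp.constant(16)
    TILE_THREADS = 64

    @wp.kernel
    def tile_sums_kernel(a: wp.array2d(dtype=float), sums: wp.array2d(dtype=float)):
        # 2D block index
        i, j = wp.tid()

        # cooperatively load one 16x16 tile and reduce it
        t = wp.tile_load(a, i, j, m=TILE_M, n=TILE_N)
        s = wp.tile_sum(t)  # a (1,1) tile holding the sum

        # write the per-tile sum to sums[i, j]
        wp.tile_store(sums, i, j, s)

    a = wp.array(np.ones((8 * TILE_M, 4 * TILE_N), dtype=np.float32))
    sums = wp.zeros((8, 4), dtype=float)

    wp.launch_tiled(
        tile_sums_kernel,
        dim=[a.shape[0] // TILE_M, a.shape[1] // TILE_N],
        inputs=[a, sums],
        block_dim=TILE_THREADS,
    )

    # each 16x16 tile of ones sums to 256
    assert np.allclose(sums.numpy(), 256.0)

Alternatively, :func:`warp.tile_extract` could be used to pull the scalar out of the reduced tile and assign it with a normal array write, at the cost of the extra synchronization its documentation mentions.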
Tile Storage ------------ -When tiles are created they are placed in either `register` or `shared` memory. In general Warp tries to determine the best storage for each, by default tiles are allocated in register storage, however some operations such as matrix multiplies may migrate data from register to shared as necessary. +When tiles are created, they are placed in either *register* or *shared* memory. +In general, Warp tries to determine the best storage location for tiles. +By default, tiles are allocated in register storage, but some operations such as matrix multiplication may migrate data from register to shared as necessary. Register Tiles -++++++++++++++ +^^^^^^^^^^^^^^ -Values in register tiles are stored across the entire block, for example, if the block dimension at launch is set to 64, a register tile with ``shape=(1, 256)`` will result in each thread storing 4 elements. Register based storage is the fastest storage on most hardware, however, because the tile storage is spread across the threads in the block, an individual thread cannot randomly access data that is assigned to another thread efficiently. For this reason operations on tiles tend to expressed as higher level maps, reductions, and reshaping operations that may transfer values through shared memory. +Values in register tiles are stored across the entire block. +For example, if the block dimension at launch is set to 64, a register tile with ``shape=(1, 256)`` will result in each thread storing 4 elements. +Register-based storage is the fastest storage on most hardware, but an individual thread cannot randomly access data that is assigned to another thread efficiently +because the tile storage is spread across the threads in the block. +For this reason, operations on tiles tend to be expressed as higher-level maps, reductions, and reshaping operations that may transfer values through shared memory. Shared Memory Tiles -+++++++++++++++++++ +^^^^^^^^^^^^^^^^^^^ -Some operations like matrix multiplication, require access to an entire tile of values. In this case the tile data may be stored in shared memory, which allows efficient random access. Warp will automatically migrate tiles to shared memory as necessary for specific operations. Shared memory is a limited resource, and so tile size must be set appropriately to avoid exceeding the hardware limitations, otherwise kernel compilation may fail. +Some operations like matrix multiplication require access to an entire tile of values. +In this case, the tile data may be stored in shared memory, which allows efficient random access. +Warp will automatically migrate tiles to shared memory as necessary for specific operations. +Shared memory is a limited resource, and so the tile size must be set appropriately to avoid exceeding the hardware limitations. +Otherwise, kernel compilation may fail. -Example: GEMM -------------- +Example: General Matrix Multiply (GEMM) +--------------------------------------- .. 
code:: python @@ -152,7 +163,7 @@ Tile Operations Construction -++++++++++++ +^^^^^^^^^^^^ * :func:`warp.tile_zeros` * :func:`warp.tile_ones` @@ -161,14 +172,14 @@ Construction * :func:`warp.untile` Load/Store -++++++++++ +^^^^^^^^^^ * :func:`warp.tile_load` * :func:`warp.tile_store` * :func:`warp.tile_atomic_add` Maps/Reductions -+++++++++++++++ +^^^^^^^^^^^^^^^ * :func:`warp.tile_map` * :func:`warp.tile_reduce` @@ -177,7 +188,7 @@ Maps/Reductions * :func:`warp.tile_max` Linear Algebra -++++++++++++++ +^^^^^^^^^^^^^^ * :func:`warp.tile_matmul` * :func:`warp.tile_transpose` @@ -187,7 +198,7 @@ Linear Algebra Tiles and SIMT Code ------------------- -Traditionally Warp kernels are primarily written in the SIMT programming model, where each thread's execution happens independently. Tiles on the other hand allow threads to work cooperatively to perform operations. Warp exposes :func:`warp.tile`, and :func:`warp.untile` methods to convert data between per-thread value types and the equivalent tile representation. For example: +Traditionally, Warp kernels are primarily written in the SIMT programming model, where each thread's execution happens independently. Tiles, on the other hand, allow threads to work **cooperatively** to perform operations. Warp exposes the :func:`warp.tile`, and :func:`warp.untile` methods to convert data between per-thread value types and the equivalent tile representation. For example: .. code:: python @@ -208,18 +219,15 @@ Traditionally Warp kernels are primarily written in the SIMT programming model, # launch as regular SIMT kernel wp.launch(compute, dim=[N], inputs=[], block_dim=TILE_THREADS) -In this example we have launched a regular SIMT grid using ``wp.launch()``, with ``N`` logical threads. The kernel performs some per-thread computations, and then converts the scalar ``x`` value into a tile object using the :func:`warp.tile` function. This function takes a single value as input, and returns a tile with the same dimensions as the number of threads in the block. From here, the tile can used in other regular cooperative operations such as reductions, GEMMs, etc. +In this example, we have launched a regular SIMT grid with ``N`` logical threads using ``wp.launch()``. The kernel performs some per-thread computations and then converts the scalar ``x`` value into a tile object using :func:`warp.tile`. This function takes a single value as input and returns a tile with the same dimensions as the number of threads in the block. From here, the tile can be used in other regular cooperative operations such as reductions, GEMMs, etc. Similarly, we can `untile` tile objects back to their per-thread scalar equivalent values. -.. Note:: All threads in a block must execute tile operations, however code surrounding tile operations may contain arbitrary conditional logic. +.. Note:: All threads in a block must execute tile operations, but code surrounding tile operations may contain arbitrary conditional logic. Automatic Differentiation ------------------------- -Warp can automatically generate the backward version of tile-based programs, in general tile programs must obey the same rules for auto-diff as regular Warp programs, e.g.: avoiding in-place operations, etc. Please see the :ref:`differentiability` section for more details. - - - - - +Warp can automatically generate the backward version of tile-based programs. +In general, tile programs must obey the same rules for auto-diff as regular Warp programs, e.g. avoiding in-place operations, etc. 
+Please see the :ref:`differentiability` section for more details. diff --git a/warp/builtins.py b/warp/builtins.py index 1a940161..fa7e8a5b 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1745,12 +1745,12 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar value_func=tile_zeros_value_func, dispatch_func=tile_zeros_dispatch_func, variadic=True, - doc="""Allocates a tile of zero initialized items. + doc="""Allocates a tile of zero-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements - :returns: A zero initialized tile with ``shape=(m,n)`` and the specified datatype""", + :returns: A zero-initialized tile with ``shape=(m,n)`` and the specified datatype""", group="Tile Primitives", export=False, ) @@ -1793,12 +1793,12 @@ def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg value_func=tile_ones_value_func, dispatch_func=tile_ones_dispatch_func, variadic=True, - doc="""Allocates a tile of one initialized items. + doc="""Allocates a tile of one-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements - :returns: A one initialized tile with ``shape=(m,n)`` and the specified dtype""", + :returns: A one-initialized tile with ``shape=(m,n)`` and the specified dtype""", group="Tile Primitives", export=False, ) @@ -1871,7 +1871,7 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a variadic=True, doc="""Generates a tile of linearly spaced elements. - :param args: Variable length positional arguments, interpreted as: + :param args: Variable-length positional arguments, interpreted as: - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` - ``(start, stop)``: Generates values from ``start`` to ``stop - 1`` @@ -2173,12 +2173,12 @@ def tile_value_func(arg_types, arg_values): input_types={"x": Any}, value_func=tile_value_func, variadic=True, - doc="""Constructs a new Tile from a per-thread kernel values. + doc="""Constructs a new Tile from per-thread kernel values. This function converts values computed using scalar kernel code to a tile representation for input into collective operations. - * If the input value is a scalar then the resulting tile has ``shape=(1, block_dim)`` - * If the input value is a vector then the resulting tile has ``shape=(length(vector), block_dim)`` + * If the input value is a scalar, then the resulting tile has ``shape=(1, block_dim)`` + * If the input value is a vector, then the resulting tile has ``shape=(length(vector), block_dim)`` :param x: A per-thread local value, e.g.: scalar, vector, or matrix. :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim`` @@ -2241,7 +2241,7 @@ def untile_value_func(arg_types, arg_values): This function converts a block-wide tile back to per-thread values. 
* If the input tile is 1-dimensional then the resulting value will be a per-thread scalar - * If the input tile is 2-dimensional then the the resulting value will be a per-thread vector of length M + * If the input tile is 2-dimensional then the resulting value will be a per-thread vector of length M :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile @@ -2301,7 +2301,9 @@ def tile_extract_value_func(arg_types, arg_values): variadic=True, doc="""Extracts a single element from the tile and returns it as a scalar type. - This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. + This function will extract an element from the tile and broadcast its value to all threads in the block. + + Note that this may incur additional synchronization if the source tile is a register tile. :param a: Tile to extract the element from :param i: Coordinate of element on first dimension @@ -2496,10 +2498,10 @@ def tile_sum_value_func(arg_types, arg_values): input_types={"a": Tile}, value_func=tile_sum_value_func, variadic=True, - doc="""Cooperatively compute the sum the tile elements using all threads in the block. + doc="""Cooperatively compute the sum of the tile elements using all threads in the block. :param a: The tile to compute the sum of - :returns: A single element tile with dimensions of (1,1) holding the sum + :returns: A single-element tile with dimensions of (1,1) holding the sum Example: @@ -2551,7 +2553,7 @@ def tile_min_value_func(arg_types, arg_values): doc="""Cooperatively compute the minimum of the tile elements using all threads in the block. :param a: The tile to compute the minimum of - :returns: A single element tile with dimensions of (1,1) holding the minimum value + :returns: A single-element tile with dimensions of (1,1) holding the minimum value Example: @@ -2603,7 +2605,7 @@ def tile_max_value_func(arg_types, arg_values): doc="""Cooperatively compute the maximum of the tile elements using all threads in the block. :param a: The tile to compute the maximum from - :returns: A single element tile with dimensions of (1,1) holding the maximum value + :returns: A single-element tile with dimensions of (1,1) holding the maximum value Example: @@ -2662,7 +2664,7 @@ def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any, :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype - :returns: A single element tile with ``shape=(1,1)`` with the same datatype as the input tile. + :returns: A single-element tile with ``shape=(1,1)`` with the same datatype as the input tile. Example: diff --git a/warp/stubs.py b/warp/stubs.py index 01c8234d..3b7f8823 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -895,24 +895,24 @@ def spatial_mass( @over def tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile: - """Allocates a tile of zero initialized items. + """Allocates a tile of zero-initialized items. 
:param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements - :returns: A zero initialized tile with ``shape=(m,n)`` and the specified datatype + :returns: A zero-initialized tile with ``shape=(m,n)`` and the specified datatype """ ... @over def tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile: - """Allocates a tile of one initialized items. + """Allocates a tile of one-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements - :returns: A one initialized tile with ``shape=(m,n)`` and the specified dtype + :returns: A one-initialized tile with ``shape=(m,n)`` and the specified dtype """ ... @@ -921,7 +921,7 @@ def tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile: def tile_arange(*args: Scalar, dtype: Scalar) -> Tile: """Generates a tile of linearly spaced elements. - :param args: Variable length positional arguments, interpreted as: + :param args: Variable-length positional arguments, interpreted as: - ``(stop,)``: Generates values from ``0`` to ``stop - 1`` - ``(start, stop)``: Generates values from ``start`` to ``stop - 1`` @@ -1005,12 +1005,12 @@ def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile: @over def tile(x: Any) -> Tile: - """Constructs a new Tile from a per-thread kernel values. + """Constructs a new Tile from per-thread kernel values. This function converts values computed using scalar kernel code to a tile representation for input into collective operations. - * If the input value is a scalar then the resulting tile has ``shape=(1, block_dim)`` - * If the input value is a vector then the resulting tile has ``shape=(length(vector), block_dim)`` + * If the input value is a scalar, then the resulting tile has ``shape=(1, block_dim)`` + * If the input value is a vector, then the resulting tile has ``shape=(length(vector), block_dim)`` :param x: A per-thread local value, e.g.: scalar, vector, or matrix. :returns: A tile with first dimension according to the value type length and a second dimension equal to ``block_dim`` @@ -1046,7 +1046,7 @@ def untile(a: Any) -> Scalar: This function converts a block-wide tile back to per-thread values. * If the input tile is 1-dimensional then the resulting value will be a per-thread scalar - * If the input tile is 2-dimensional then the the resulting value will be a per-thread vector of length M + * If the input tile is 2-dimensional then the resulting value will be a per-thread vector of length M :param a: A tile with dimensions ``shape=(M, block_dim)`` :returns: A single value per-thread with the same dtype as the tile @@ -1089,7 +1089,9 @@ def compute(): def tile_extract(a: Tile, i: int32, j: int32) -> Scalar: """Extracts a single element from the tile and returns it as a scalar type. - This function will extract an element from the tile and broadcast its value to all threads in the block, note that this may incur additional synchronization if the source tile is a register tile. + This function will extract an element from the tile and broadcast its value to all threads in the block. + + Note that this may incur additional synchronization if the source tile is a register tile. 
:param a: Tile to extract the element from :param i: Coordinate of element on first dimension @@ -1125,10 +1127,10 @@ def tile_broadcast(a: Tile, m: int32, n: int32) -> Tile: @over def tile_sum(a: Tile) -> Tile: - """Cooperatively compute the sum the tile elements using all threads in the block. + """Cooperatively compute the sum of the tile elements using all threads in the block. :param a: The tile to compute the sum of - :returns: A single element tile with dimensions of (1,1) holding the sum + :returns: A single-element tile with dimensions of (1,1) holding the sum Example: @@ -1160,7 +1162,7 @@ def tile_min(a: Tile) -> Tile: """Cooperatively compute the minimum of the tile elements using all threads in the block. :param a: The tile to compute the minimum of - :returns: A single element tile with dimensions of (1,1) holding the minimum value + :returns: A single-element tile with dimensions of (1,1) holding the minimum value Example: @@ -1192,7 +1194,7 @@ def tile_max(a: Tile) -> Tile: """Cooperatively compute the maximum of the tile elements using all threads in the block. :param a: The tile to compute the maximum from - :returns: A single element tile with dimensions of (1,1) holding the maximum value + :returns: A single-element tile with dimensions of (1,1) holding the maximum value Example: @@ -1227,7 +1229,7 @@ def tile_reduce(op: Callable, a: Any) -> Tile: :param op: A callable function that accepts two arguments and returns one argument, may be a user function or builtin :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's dtype - :returns: A single element tile with ``shape=(1,1)`` with the same datatype as the input tile. + :returns: A single-element tile with ``shape=(1,1)`` with the same datatype as the input tile. 
Example: diff --git a/warp/types.py b/warp/types.py index 94fee051..454f7cc0 100644 --- a/warp/types.py +++ b/warp/types.py @@ -3583,7 +3583,7 @@ def get_feature_array_info(self, feature_index: int) -> Volume.FeatureArrayInfo: ) def feature_array(self, feature_index: int, dtype=None) -> array: - """Returns one the the grid's feature data arrays as a Warp array + """Returns one of the grid's feature data arrays as a Warp array Args: feature_index: Index of the supplemental data array in the grid
From e5129c353614f66a7e1a505945ea5d572debdd5c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 8 Oct 2024 04:33:17 +0000 Subject: [PATCH 058/102] Working MLP + grad test --- warp/examples/assets/pixel.jpg | Bin 0 -> 33802 bytes warp/tests/test_tile_mlp.py | 327 +++++++++++++++++++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 warp/examples/assets/pixel.jpg create mode 100644 warp/tests/test_tile_mlp.py
diff --git a/warp/examples/assets/pixel.jpg b/warp/examples/assets/pixel.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b784f952919e2ebe4c8aa1eb41979fa2f0cbb51d GIT binary patch literal 33802 [33802-byte base85-encoded JPEG literal omitted]
Date: Tue, 8 Oct 2024 04:39:11 +0000 Subject: [PATCH 059/102] Use scalar loss function for comparison against Torch --- warp/tests/test_tile_mlp.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 9d4e67ef..b8e34452 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -140,7 +140,7 @@ def compute(input: wp.array2d(dtype=float), output[1] - reference[1,linear], output[2] - reference[2,linear]) - wp.atomic_add(loss, 0, wp.length_sq(error)/float(IMG_WIDTH*IMG_HEIGHT)) + wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*IMG_WIDTH*IMG_HEIGHT)) for i in range(DIM_OUT): @@ -190,10 +190,10 @@ def compute(input: wp.array2d(dtype=float), print(loss.numpy()) - output.grad = wp.ones_like(output) - tape.backward() + # output.grad = wp.ones_like(output) + # tape.backward() - #tape.backward(loss) + tape.backward(loss) #
optimizer.step(optimizer_grads) @@ -240,10 +240,12 @@ def compute(input: wp.array2d(dtype=float), z_tc = tc.clamp(weights_2_tc@z_tc + bias_2_tc, min=0.0) ref_tc = tc.from_numpy(reference.numpy()).requires_grad_(True) - #l_tc = tc.mean((z_tc - ref_tc)**2) - #l_tc.backward() + + + l_tc = tc.mean((z_tc - ref_tc)**2) + l_tc.backward() - z_tc.backward(tc.ones_like(z_tc)) + #z_tc.backward(tc.ones_like(z_tc)) # test torch print("Torch output close: ", assert_equal(z_tc.cpu().detach().numpy(), output.numpy())) From c93b16ed277c9c7891f568039211fbf2bd35f4d7 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Mon, 7 Oct 2024 16:19:57 -0700 Subject: [PATCH 060/102] Allow tile storage to be specified in API --- docs/modules/functions.rst | 20 +++++-- warp/builtins.py | 72 ++++++++++++++++++----- warp/codegen.py | 3 + warp/examples/tile/example_tile_matmul.py | 19 +++--- warp/stubs.py | 20 +++++-- warp/types.py | 20 +++---- 6 files changed, 107 insertions(+), 47 deletions(-) diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 8fcc6f83..ca1bea38 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -802,27 +802,31 @@ Spatial Math Tile Primitives --------------- -.. py:function:: tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile +.. py:function:: tile_zeros(m: int32, n: int32, dtype: Scalar, storage: str) -> Tile Allocates a tile of zero-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A zero-initialized tile with ``shape=(m,n)`` and the specified datatype -.. py:function:: tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile +.. py:function:: tile_ones(m: int32, n: int32, dtype: Scalar, storage: str) -> Tile Allocates a tile of one-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A one-initialized tile with ``shape=(m,n)`` and the specified dtype -.. py:function:: tile_arange(*args: Scalar, dtype: Scalar) -> Tile +.. py:function:: tile_arange(*args: Scalar, dtype: Scalar, storage: str) -> Tile Generates a tile of linearly spaced elements. @@ -833,10 +837,12 @@ Tile Primitives - ``(start, stop, step)``: Generates values from ``start`` to ``stop - 1`` with a step size :param dtype: Datatype of output tile's elements (optional, default: int) + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype -.. py:function:: tile_load(a: Array[Any], i: int32, n: int32) -> Tile +.. py:function:: tile_load(a: Array[Any], i: int32, n: int32, storage: str) -> Tile Loads a 1D tile from a global memory array. @@ -845,10 +851,12 @@ Tile Primitives :param a: The source array in global memory :param i: Offset in the source array measured in multiples of ``n``, i.e.: ``offset=i*n`` :param n: The number of elements in the tile + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. 
:returns: A tile with ``shape=(1,n)`` and dtype the same as the source array -.. py:function:: tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32) -> Tile +.. py:function:: tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32, storage: str) -> Tile :noindex: :nocontentsentry: @@ -861,6 +869,8 @@ Tile Primitives :param j: Offset in the source array measured in multiples of ``n``, i.e.; ``col=j*n`` :param m: The size of the tile's first dimension :param n: The size of the tile's second dimension + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array diff --git a/warp/builtins.py b/warp/builtins.py index fa7e8a5b..0bf4c2a5 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1722,10 +1722,18 @@ def tile_zeros_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str if "dtype" not in arg_values: raise RuntimeError("'dtype' keyword argument must be specified when calling tile_zeros() function") + if "storage" not in arg_values: + raise ValueError("'storage' keyword not provided for tile_zeros") + + if arg_values["storage"] not in {"shared", "register"}: + raise ValueError( + f"'storage' keyword argument must be either 'shared' or 'register', got {arg_values['storage']}" + ) + m, n = arg_values["m"], arg_values["n"] dtype = arg_values["dtype"] - return TileZeros(dtype=dtype, M=m, N=n) + return TileZeros(dtype=dtype, M=m, N=n, storage=arg_values["storage"]) def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): @@ -1741,7 +1749,8 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar add_builtin( "tile_zeros", - input_types={"m": int, "n": int, "dtype": Scalar}, + input_types={"m": int, "n": int, "dtype": Scalar, "storage": str}, + defaults={"storage": "register"}, value_func=tile_zeros_value_func, dispatch_func=tile_zeros_dispatch_func, variadic=True, @@ -1750,6 +1759,8 @@ def tile_zeros_dispatch_func(arg_types: Mapping[str, type], return_type: Any, ar :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. 
:returns: A zero-initialized tile with ``shape=(m,n)`` and the specified datatype""", group="Tile Primitives", export=False, @@ -1770,10 +1781,15 @@ def tile_ones_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, if "dtype" not in arg_values: raise RuntimeError("'dtype' keyword argument must be specified when calling tile_zeros() function") + if arg_values["storage"] not in {"shared", "register"}: + raise ValueError( + f"'storage' keyword argument must be either 'shared' or 'register', got {arg_values['storage']}" + ) + m, n = arg_values["m"], arg_values["n"] dtype = arg_values["dtype"] - return TileZeros(dtype=dtype, M=m, N=n) + return TileZeros(dtype=dtype, M=m, N=n, storage=arg_values["storage"]) def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): @@ -1789,7 +1805,8 @@ def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg add_builtin( "tile_ones", - input_types={"m": int, "n": int, "dtype": Scalar}, + input_types={"m": int, "n": int, "dtype": Scalar, "storage": str}, + defaults={"storage": "register"}, value_func=tile_ones_value_func, dispatch_func=tile_ones_dispatch_func, variadic=True, @@ -1798,6 +1815,8 @@ def tile_ones_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A one-initialized tile with ``shape=(m,n)`` and the specified dtype""", group="Tile Primitives", export=False, @@ -1837,7 +1856,12 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st else: dtype = float - return TileRange(dtype=dtype, start=start, stop=stop, step=step) + if arg_values["storage"] not in {"shared", "register"}: + raise ValueError( + f"'storage' keyword argument must be either 'shared' or 'register', got {arg_values['storage']}" + ) + + return TileRange(dtype=dtype, start=start, stop=stop, step=step, storage=arg_values["storage"]) def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): @@ -1864,8 +1888,8 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a add_builtin( "tile_arange", - input_types={"*args": Scalar, "dtype": Scalar}, - defaults={"dtype": None}, + input_types={"*args": Scalar, "dtype": Scalar, "storage": str}, + defaults={"dtype": None, "storage": "register"}, value_func=tile_arange_value_func, dispatch_func=tile_arange_dispatch_func, variadic=True, @@ -1878,6 +1902,8 @@ def tile_arange_dispatch_func(arg_types: Mapping[str, type], return_type: Any, a - ``(start, stop, step)``: Generates values from ``start`` to ``stop - 1`` with a step size :param dtype: Datatype of output tile's elements (optional, default: int) + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. 
:returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype""", group="Tile Primitives", export=False, @@ -1903,10 +1929,15 @@ def tile_load_1d_value_func(arg_types, arg_values): if "n" not in arg_values: raise RuntimeError("'n' keyword argument must be specified when calling tile_load() function") + if arg_values["storage"] not in {"shared", "register"}: + raise ValueError( + f"'storage' keyword argument must be either 'shared' or 'register', got {arg_values['storage']}" + ) + a = arg_types["a"] _m, n = 1, arg_values["n"] - return TileLoad(a, 1, n) + return TileLoad(a, 1, n, arg_values["storage"]) def tile_load_1d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): @@ -1924,7 +1955,8 @@ def tile_load_1d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, add_builtin( "tile_load", - input_types={"a": array(dtype=Any), "i": int, "n": int}, + input_types={"a": array(dtype=Any), "i": int, "n": int, "storage": str}, + defaults={"storage": "register"}, value_func=tile_load_1d_value_func, dispatch_func=tile_load_1d_dispatch_func, variadic=False, @@ -1935,6 +1967,8 @@ def tile_load_1d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, :param a: The source array in global memory :param i: Offset in the source array measured in multiples of ``n``, i.e.: ``offset=i*n`` :param n: The number of elements in the tile + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A tile with ``shape=(1,n)`` and dtype the same as the source array""", group="Tile Primitives", export=False, @@ -1966,10 +2000,15 @@ def tile_load_2d_value_func(arg_types, arg_values): if "n" not in arg_values: raise RuntimeError("'n' keyword argument must be specified when calling tile_load() function") + if arg_values["storage"] not in {"shared", "register"}: + raise ValueError( + f"'storage' keyword argument must be either 'shared' or 'register', got {arg_values['storage']}" + ) + a = arg_types["a"] m, n = arg_values["m"], arg_values["n"] - return TileLoad(a, m, n) + return TileLoad(a, m, n, arg_values["storage"]) def tile_load_2d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, arg_values: Mapping[str, Var]): @@ -1988,7 +2027,8 @@ def tile_load_2d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, add_builtin( "tile_load", - input_types={"a": array(dtype=Any), "i": int, "j": int, "m": int, "n": int}, + input_types={"a": array(dtype=Any), "i": int, "j": int, "m": int, "n": int, "storage": str}, + defaults={"storage": "register"}, value_func=tile_load_2d_value_func, dispatch_func=tile_load_2d_dispatch_func, variadic=False, @@ -2001,6 +2041,8 @@ def tile_load_2d_dispatch_func(arg_types: Mapping[str, type], return_type: Any, :param j: Offset in the source array measured in multiples of ``n``, i.e.; ``col=j*n`` :param m: The size of the tile's first dimension :param n: The size of the tile's second dimension + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. 
:returns: A tile with ``shape=(m,n)`` and dtype the same as the source array""", group="Tile Primitives", export=False, @@ -2707,10 +2749,10 @@ def tile_unary_map_value_func(arg_types, arg_values): return TileUnaryMap(a) -def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]): - func_args = (args["op"], *args["args"]) - template_args = () - return (func_args, template_args) +# def tile_map_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: Mapping[str, Var]): +# func_args = (args["op"], *args["args"]) +# template_args = () +# return (func_args, template_args) add_builtin( diff --git a/warp/codegen.py b/warp/codegen.py index 50288e05..53519521 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -783,6 +783,9 @@ def func_match_args(func, arg_types, kwarg_types): def get_arg_type(arg: Union[Var, Any]): + if isinstance(arg, str): + return str + if isinstance(arg, Sequence): return tuple(get_arg_type(x) for x in arg) diff --git a/warp/examples/tile/example_tile_matmul.py b/warp/examples/tile/example_tile_matmul.py index 881396f9..b8ee510c 100644 --- a/warp/examples/tile/example_tile_matmul.py +++ b/warp/examples/tile/example_tile_matmul.py @@ -13,6 +13,7 @@ ########################################################################### import numpy as np + import warp as wp # tile size @@ -23,16 +24,16 @@ # num threads per-tile TILE_THREADS = 64 + @wp.kernel def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): - # output tile index i, j = wp.tid() sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) - M = A.shape[0] - N = B.shape[1] + _M = A.shape[0] + _N = B.shape[1] K = A.shape[1] count = int(K / TILE_K) @@ -47,9 +48,7 @@ def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.arra wp.tile_store(C, i, j, sum) - if __name__ == "__main__": - wp.set_device("cuda:0") # generate some tile aligned matrix dimensions @@ -68,13 +67,9 @@ def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.arra with wp.Tape() as tape: wp.launch_tiled( - tile_gemm, - dim=(int(M / TILE_M), int(N / TILE_N)), - inputs=[A_wp, B_wp, C_wp], - block_dim=TILE_THREADS) + tile_gemm, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], block_dim=TILE_THREADS + ) - assert(np.allclose(C_wp.numpy(), A@B)) + assert np.allclose(C_wp.numpy(), A @ B) print("Example matrix multiplication passed") - - diff --git a/warp/stubs.py b/warp/stubs.py index 3b7f8823..77e1c548 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -894,31 +894,35 @@ def spatial_mass( @over -def tile_zeros(m: int32, n: int32, dtype: Scalar) -> Tile: +def tile_zeros(m: int32, n: int32, dtype: Scalar, storage: str) -> Tile: """Allocates a tile of zero-initialized items. :param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A zero-initialized tile with ``shape=(m,n)`` and the specified datatype """ ... @over -def tile_ones(m: int32, n: int32, dtype: Scalar) -> Tile: +def tile_ones(m: int32, n: int32, dtype: Scalar, storage: str) -> Tile: """Allocates a tile of one-initialized items. 
:param m: Size of the first dimension of the output tile :param n: Size of the second dimension of the output tile :param dtype: Datatype of output tile's elements + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A one-initialized tile with ``shape=(m,n)`` and the specified dtype """ ... @over -def tile_arange(*args: Scalar, dtype: Scalar) -> Tile: +def tile_arange(*args: Scalar, dtype: Scalar, storage: str) -> Tile: """Generates a tile of linearly spaced elements. :param args: Variable-length positional arguments, interpreted as: @@ -928,13 +932,15 @@ def tile_arange(*args: Scalar, dtype: Scalar) -> Tile: - ``(start, stop, step)``: Generates values from ``start`` to ``stop - 1`` with a step size :param dtype: Datatype of output tile's elements (optional, default: int) + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A tile with ``shape=(1,n)`` with linearly spaced elements of specified dtype """ ... @over -def tile_load(a: Array[Any], i: int32, n: int32) -> Tile: +def tile_load(a: Array[Any], i: int32, n: int32, storage: str) -> Tile: """Loads a 1D tile from a global memory array. This method will cooperatively load a tile from global memory using all threads in the block. @@ -942,13 +948,15 @@ def tile_load(a: Array[Any], i: int32, n: int32) -> Tile: :param a: The source array in global memory :param i: Offset in the source array measured in multiples of ``n``, i.e.: ``offset=i*n`` :param n: The number of elements in the tile + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A tile with ``shape=(1,n)`` and dtype the same as the source array """ ... @over -def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32) -> Tile: +def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32, storage: str) -> Tile: """Loads a 2D tile from a global memory array. This method will cooperatively load a tile from global memory using all threads in the block. @@ -958,6 +966,8 @@ def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32) -> Tile: :param j: Offset in the source array measured in multiples of ``n``, i.e.; ``col=j*n`` :param m: The size of the tile's first dimension :param n: The size of the tile's second dimension + :param storage: The storage location for the tile: ``"register"`` for registers + (default) or ``"shared"`` for shared memory. :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array """ ... 
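As an aside, here is a minimal sketch of how the new ``storage`` keyword documented above might be used from kernel code; it is not part of the patch series, the array names, tile sizes, and launch dimensions are hypothetical, and the launch pattern follows the example_tile_matmul.py usage shown earlier in the series:

    import numpy as np
    import warp as wp

    TILE_M = 32
    TILE_N = 32
    TILE_THREADS = 64

    @wp.kernel
    def tile_copy(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
        # tile index assigned to this block
        i, j = wp.tid()

        # load the (i, j) tile of A, requesting shared-memory storage
        # instead of the default register storage
        t = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N, storage="shared")

        # write the tile back out to the corresponding block of B
        wp.tile_store(B, i, j, t)

    A = wp.array(np.arange(128 * 128, dtype=np.float32).reshape(128, 128))
    B = wp.zeros_like(A)

    wp.launch_tiled(tile_copy, dim=(128 // TILE_M, 128 // TILE_N), inputs=[A, B], block_dim=TILE_THREADS)

Requesting ``storage="shared"`` places the tile in block shared memory rather than registers, which is presumably the preferred choice when a tile is revisited by cooperative operations such as ``wp.tile_matmul``; ``"register"`` remains the default, as the docstrings above state.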
diff --git a/warp/types.py b/warp/types.py index 454f7cc0..ea9604e4 100644 --- a/warp/types.py +++ b/warp/types.py @@ -3020,12 +3020,12 @@ def alloc(cls): class TileZeros(Tile): - def __init__(self, dtype, M, N): - Tile.__init__(self, dtype, M, N, op="zeros", storage="register") + def __init__(self, dtype, M, N, storage="register"): + Tile.__init__(self, dtype, M, N, op="zeros", storage=storage) class TileRange(Tile): - def __init__(self, dtype, start, stop, step): + def __init__(self, dtype, start, stop, step, storage="register"): self.start = start self.stop = stop self.step = step @@ -3033,7 +3033,7 @@ def __init__(self, dtype, start, stop, step): M = 1 N = int((stop - start) / step) - Tile.__init__(self, dtype, M, N, op="arange", storage="register") + Tile.__init__(self, dtype, M, N, op="arange", storage=storage) class TileConstant(Tile): @@ -3042,20 +3042,20 @@ def __init__(self, dtype, M, N): class TileLoad(Tile): - def __init__(self, array, M, N): - Tile.__init__(self, array.dtype, M, N, op="load", storage="register") + def __init__(self, array, M, N, storage="register"): + Tile.__init__(self, array.dtype, M, N, op="load", storage=storage) class TileUnaryMap(Tile): - def __init__(self, t): - Tile.__init__(self, t.dtype, t.M, t.N, op="unary_map", storage="register") + def __init__(self, t, storage="register"): + Tile.__init__(self, t.dtype, t.M, t.N, op="unary_map", storage=storage) self.t = t class TileBinaryMap(Tile): - def __init__(self, a, b): - Tile.__init__(self, a.dtype, a.M, a.N, op="binary_map", storage="register") + def __init__(self, a, b, storage="register"): + Tile.__init__(self, a.dtype, a.M, a.N, op="binary_map", storage=storage) self.a = a self.b = b From d32c9043016e4c5b4b4f85280eeee5666d52099b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 8 Oct 2024 21:35:09 +0000 Subject: [PATCH 061/102] Testing different dimensions --- warp/tests/test_tile_mlp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index b8e34452..04bdec85 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -9,9 +9,6 @@ from PIL import Image -TILE_M = wp.constant(4) -TILE_N = wp.constant(2) - #wp.clear_kernel_cache() #wp.config.mode = "debug" #wp.config.verify_cuda = True @@ -23,7 +20,7 @@ def assert_equal(result: np.ndarray, expect: np.ndarray, tol=1.e-2): if tol != 0.0: # TODO: Get all tests working without the .flatten() - np.testing.assert_allclose(result.flatten(), expect.flatten(), rtol=tol, atol=0, equal_nan=True) + np.testing.assert_allclose(result.flatten(), expect.flatten(), rtol=tol, atol=1.e-2, equal_nan=True) else: # TODO: Get all tests working with strict=True np.testing.assert_array_equal(result, expect) @@ -188,7 +185,7 @@ def compute(input: wp.array2d(dtype=float), output], block_dim=NUM_THREADS) - print(loss.numpy()) + print(f"Iter: {i} Loss: {loss.numpy()}") # output.grad = wp.ones_like(output) # tape.backward() From 77e83e3cc57481e264fcfc365438a2caaccbb12c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 8 Oct 2024 21:41:02 +0000 Subject: [PATCH 062/102] Clean up some comments --- warp/tests/test_tile_mlp.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 04bdec85..30b2245e 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -76,16 +76,13 @@ def compute(input: wp.array2d(dtype=float), row, col = wp.tid() linear = 
row*IMG_WIDTH + col - # linear = wp.tid() - # row = linear/IMG_WIDTH - # col = linear%IMG_WIDTH - - # # normalize input coordinates to [-1, 1] + # normalize input coordinates to [-1, 1] x = (float(row)/float(IMG_WIDTH) - 0.5)*2.0 y = (float(col)/float(IMG_HEIGHT) - 0.5)*2.0 local = wp.vector(dtype=float, length=DIM_IN) + # construct positional encoding for s in range(NUM_FREQ): scale = wp.pow(2.0, float(s))*wp.pi @@ -98,48 +95,43 @@ def compute(input: wp.array2d(dtype=float), local[s*4 + 2] = wp.sin(y * scale) local[s*4 + 3] = wp.cos(y * scale) - # write input back to array so that torch can use it input[s*4 + 0, linear] = local[s*4 + 0] input[s*4 + 1, linear] = local[s*4 + 1] input[s*4 + 2, linear] = local[s*4 + 2] input[s*4 + 3, linear] = local[s*4 + 3] - ## load from input array - # local = wp.vector(dtype=float, length=DIM_IN) - # for i in range(DIM_IN): - # local[i] = input[i, linear] - + # tile feature vectors across the block, returns [dim(f), NUM_THREADS] f = wp.tile(local) - # input layer w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN) b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1) z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, m=DIM_HID, n=NUM_THREADS)) - # output layer + # hidden layer w1 = wp.tile_load(weights_1, 0, 0, m=DIM_HID, n=DIM_HID) b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1) z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS)) - + # output layer w2 = wp.tile_load(weights_2, 0, 0, m=DIM_OUT, n=DIM_HID) b2 = wp.tile_load(bias_2, 0, 0, m=DIM_OUT, n=1) o = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_OUT, n=NUM_THREADS)) - #wp.tile_store(out, 0, i, o) - + # until back to SIMT output = wp.untile(o) + # compute error error = wp.vec3(output[0] - reference[0,linear], output[1] - reference[1,linear], output[2] - reference[2,linear]) + # write MSE loss wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*IMG_WIDTH*IMG_HEIGHT)) - + # image output for i in range(DIM_OUT): out[i, linear] = output[i] From 29b277df436a370faf414ecced7a54df40af021b Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 8 Oct 2024 15:17:36 -0700 Subject: [PATCH 063/102] Use Artifactory access key --- .gitlab/ci/mathdx-support.yml | 12 ++++++------ warp/examples/tile/example_tile_matmul.py | 19 +++++++------------ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index b6fff5b3..bfca61fe 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -33,11 +33,11 @@ linux-x86_64 build: - .runner-build-linux-x86_64 before_script: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - - apt-get update && apt-get install build-essential curl wget --no-install-recommends -y + - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > - wget --header="X-JFrog-Art-Api:$ARTIFACTORY_API_KEY" -nv --no-check-certificate + curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/30/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz - -O libmathdx.tar.gz + -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" @@ -56,11 +56,11 @@ linux-aarch64 build: - .save_warp_bin_artifact before_script: - echo -e 
"\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - - apt-get update && apt-get install build-essential curl wget --no-install-recommends -y + - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > - wget --header="X-JFrog-Art-Api:$ARTIFACTORY_API_KEY" -nv --no-check-certificate + curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/30/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz - -O libmathdx.tar.gz + -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" diff --git a/warp/examples/tile/example_tile_matmul.py b/warp/examples/tile/example_tile_matmul.py index 881396f9..b8ee510c 100644 --- a/warp/examples/tile/example_tile_matmul.py +++ b/warp/examples/tile/example_tile_matmul.py @@ -13,6 +13,7 @@ ########################################################################### import numpy as np + import warp as wp # tile size @@ -23,16 +24,16 @@ # num threads per-tile TILE_THREADS = 64 + @wp.kernel def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): - # output tile index i, j = wp.tid() sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) - M = A.shape[0] - N = B.shape[1] + _M = A.shape[0] + _N = B.shape[1] K = A.shape[1] count = int(K / TILE_K) @@ -47,9 +48,7 @@ def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.arra wp.tile_store(C, i, j, sum) - if __name__ == "__main__": - wp.set_device("cuda:0") # generate some tile aligned matrix dimensions @@ -68,13 +67,9 @@ def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.arra with wp.Tape() as tape: wp.launch_tiled( - tile_gemm, - dim=(int(M / TILE_M), int(N / TILE_N)), - inputs=[A_wp, B_wp, C_wp], - block_dim=TILE_THREADS) + tile_gemm, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], block_dim=TILE_THREADS + ) - assert(np.allclose(C_wp.numpy(), A@B)) + assert np.allclose(C_wp.numpy(), A @ B) print("Example matrix multiplication passed") - - From 132c8d64f8e95ea311408ab3adbb692cfa4799ef Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 9 Oct 2024 04:23:21 +0000 Subject: [PATCH 064/102] Add batching support --- warp/tests/test_tile_mlp.py | 87 ++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 30b2245e..47748110 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -13,8 +13,11 @@ #wp.config.mode = "debug" #wp.config.verify_cuda = True +wp.set_device("cuda:0") wp.set_module_options({"fast_math": False}) +#wp.clear_kernel_cache() + rng = np.random.default_rng(45) def assert_equal(result: np.ndarray, expect: np.ndarray, tol=1.e-2): @@ -53,10 +56,11 @@ def create_array(dim_in, dim_hid, dtype=float): DIM_OUT = 3 NUM_THREADS = 32 -NUM_BLOCKS = 36 -IMG_WIDTH = NUM_THREADS*2 -IMG_HEIGHT = NUM_THREADS*2 +IMG_WIDTH = NUM_THREADS*8 +IMG_HEIGHT = NUM_THREADS*8 + +BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) def test_multi_layer_nn(): @@ -64,8 +68,17 @@ def test_multi_layer_nn(): def relu(x: float): return wp.max(x, 0.0) + @wp.func + def sigmoid(x: float): + return 1.0 / (1.0 + wp.exp(-x)) + @wp.kernel - def compute(input: wp.array2d(dtype=float), + def zero(loss: 
wp.array(dtype=float)): + loss[0] = 0.0 + + @wp.kernel + def compute(batches: wp.array(dtype=int), + input: wp.array2d(dtype=float), weights_0: wp.array2d(dtype=float), bias_0: wp.array2d(dtype=float), weights_1: wp.array2d(dtype=float), bias_1: wp.array2d(dtype=float), weights_2: wp.array2d(dtype=float), bias_2: wp.array2d(dtype=float), @@ -73,8 +86,12 @@ def compute(input: wp.array2d(dtype=float), loss: wp.array1d(dtype=float), out: wp.array2d(dtype=float)): - row, col = wp.tid() - linear = row*IMG_WIDTH + col + # row, col = wp.tid() + # linear = row*IMG_WIDTH + col + + linear = batches[wp.tid()] + row = linear/IMG_WIDTH + col = linear%IMG_WIDTH # normalize input coordinates to [-1, 1] x = (float(row)/float(IMG_WIDTH) - 0.5)*2.0 @@ -118,7 +135,7 @@ def compute(input: wp.array2d(dtype=float), # output layer w2 = wp.tile_load(weights_2, 0, 0, m=DIM_OUT, n=DIM_HID) b2 = wp.tile_load(bias_2, 0, 0, m=DIM_OUT, n=1) - o = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_OUT, n=NUM_THREADS)) + o = wp.tile_map(sigmoid, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_OUT, n=NUM_THREADS)) # until back to SIMT output = wp.untile(o) @@ -129,7 +146,7 @@ def compute(input: wp.array2d(dtype=float), output[2] - reference[2,linear]) # write MSE loss - wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*IMG_WIDTH*IMG_HEIGHT)) + wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*BATCH_SIZE)) # image output for i in range(DIM_OUT): @@ -160,34 +177,43 @@ def compute(input: wp.array2d(dtype=float), optimizer_inputs = [p.flatten() for p in params] optimizer = warp.optim.Adam(optimizer_inputs, lr=0.001) - for i in range(1): + # create shuffled batch indices + indices = np.arange(0, IMG_WIDTH*IMG_HEIGHT) + np.random.shuffle(indices) + batches = wp.array(indices, dtype=int) - loss.zero_() + for i in range(32): - with wp.Tape() as tape: - wp.launch( - compute, - dim=[IMG_WIDTH, IMG_HEIGHT], - inputs=[input, - weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - reference, - loss, - output], - block_dim=NUM_THREADS) + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): - print(f"Iter: {i} Loss: {loss.numpy()}") + loss.zero_() - # output.grad = wp.ones_like(output) - # tape.backward() - - tape.backward(loss) + with wp.Tape() as tape: + wp.launch( + compute, + dim=[BATCH_SIZE], + inputs=[batches[b:b+BATCH_SIZE], + input, + weights_0, bias_0, + weights_1, bias_1, + weights_2, bias_2, + reference, + loss, + output], + block_dim=NUM_THREADS) + + print(f"Iter: {i} Loss: {loss.numpy()}") + + tape.backward(loss) - # optimizer.step(optimizer_grads) + optimizer.step(optimizer_grads) - # tape.zero() + tape.zero() + # uncommenting this line fixes convergence + # wp.synchronize() + + predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) predicted_image = (predicted_image * 255).astype(np.uint8) @@ -195,6 +221,9 @@ def compute(input: wp.array2d(dtype=float), predicted_image_pil = Image.fromarray(predicted_image) predicted_image_pil.save("test_tile_mlp_wp.jpg") + return + + # print(input) # print(output) From 9e8dad930e84fa9f7e71e561e7b193683676fcc8 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 9 Oct 2024 22:30:12 +0000 Subject: [PATCH 065/102] Increase layers, use cosine weighted learning rate --- warp/tests/test_tile_mlp.py | 43 ++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 47748110..5e3616a8 100644 --- a/warp/tests/test_tile_mlp.py +++ 
b/warp/tests/test_tile_mlp.py @@ -5,6 +5,7 @@ import torch as tc +import math import os from PIL import Image @@ -49,7 +50,7 @@ def create_array(dim_in, dim_hid, dtype=float): return a -NUM_FREQ = wp.constant(4) +NUM_FREQ = wp.constant(8) DIM_IN = wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy DIM_HID = 16 @@ -57,8 +58,8 @@ def create_array(dim_in, dim_hid, dtype=float): NUM_THREADS = 32 -IMG_WIDTH = NUM_THREADS*8 -IMG_HEIGHT = NUM_THREADS*8 +IMG_WIDTH = NUM_THREADS*16 +IMG_HEIGHT = NUM_THREADS*16 BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) @@ -82,6 +83,7 @@ def compute(batches: wp.array(dtype=int), weights_0: wp.array2d(dtype=float), bias_0: wp.array2d(dtype=float), weights_1: wp.array2d(dtype=float), bias_1: wp.array2d(dtype=float), weights_2: wp.array2d(dtype=float), bias_2: wp.array2d(dtype=float), + weights_3: wp.array2d(dtype=float), bias_3: wp.array2d(dtype=float), reference: wp.array2d(dtype=float), loss: wp.array1d(dtype=float), out: wp.array2d(dtype=float)): @@ -132,12 +134,16 @@ def compute(batches: wp.array(dtype=int), b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1) z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS)) + w2 = wp.tile_load(weights_2, 0, 0, m=DIM_HID, n=DIM_HID) + b2 = wp.tile_load(bias_2, 0, 0, m=DIM_HID, n=1) + z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_HID, n=NUM_THREADS)) + # output layer - w2 = wp.tile_load(weights_2, 0, 0, m=DIM_OUT, n=DIM_HID) - b2 = wp.tile_load(bias_2, 0, 0, m=DIM_OUT, n=1) - o = wp.tile_map(sigmoid, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_OUT, n=NUM_THREADS)) + w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID) + b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1) + o = wp.tile_map(sigmoid, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS)) - # until back to SIMT + # untile back to SIMT output = wp.untile(o) # compute error @@ -156,7 +162,8 @@ def compute(batches: wp.array(dtype=int), weights_0, bias_0 = create_layer(DIM_IN, DIM_HID, dtype=float) weights_1, bias_1 = create_layer(DIM_HID, DIM_HID, dtype=float) - weights_2, bias_2 = create_layer(DIM_HID, DIM_OUT, dtype=float) + weights_2, bias_2 = create_layer(DIM_HID, DIM_HID, dtype=float) + weights_3, bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=float) input = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_IN) output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) @@ -171,18 +178,20 @@ def compute(batches: wp.array(dtype=int), params = [weights_0, bias_0, weights_1, bias_1, - weights_2, bias_2] + weights_2, bias_2, + weights_3, bias_3] optimizer_grads = [p.grad.flatten() for p in params] optimizer_inputs = [p.flatten() for p in params] optimizer = warp.optim.Adam(optimizer_inputs, lr=0.001) - # create shuffled batch indices - indices = np.arange(0, IMG_WIDTH*IMG_HEIGHT) - np.random.shuffle(indices) - batches = wp.array(indices, dtype=int) + max_iters = 500 + + for i in range(max_iters): - for i in range(32): + # create randomized batch indices + batches = wp.array(rng.integers(low=0, high=IMG_WIDTH*IMG_HEIGHT, size=IMG_WIDTH*IMG_HEIGHT, dtype=np.int32)) + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): @@ -197,15 +206,19 @@ def compute(batches: wp.array(dtype=int), weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, + weights_3, bias_3, reference, loss, output], block_dim=NUM_THREADS) - print(f"Iter: {i} Loss: {loss.numpy()}") + if b == 0: + print(f"Iter: {i} Loss: {loss.numpy()}") tape.backward(loss) + # cosine weighted decay + optimizer.lr = 
0.5*0.01*(1.0 + math.cos(float(i)/float(max_iters)*math.pi)) optimizer.step(optimizer_grads) tape.zero() From 477a30c382de806cdfabcab41c406a04e855498c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 10 Oct 2024 04:31:07 +0000 Subject: [PATCH 066/102] Add fp16 support, fix for uninitialized output --- warp/builtins.py | 6 +- warp/native/tile.h | 16 ++- warp/optim/adam.py | 43 ++++++- warp/tests/test_tile_mlp.py | 240 ++++++++++++++++++------------------ 4 files changed, 177 insertions(+), 128 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index fa7e8a5b..d8a38a5e 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5559,8 +5559,10 @@ def tile_matmul_generic_lto_dispatch_func( b = arg_values["b"] if len(return_values) > 0: + accumulate = 0 # for c = tile_matmul(a,b) case we want to overwrite c value out = return_values[0] else: + accumulate = 1 # for tile_matmul(a,b,c) case we want to add to c value out = arg_values["out"] if any(not is_tile(arg.type) for arg in [a, b, out]): @@ -5581,7 +5583,7 @@ def tile_matmul_generic_lto_dispatch_func( a.type.storage = "shared" b.type.storage = "shared" out.type.storage = "shared" - template_args = [] + template_args = [accumulate] # Real if out.type.dtype == float16: @@ -5728,7 +5730,6 @@ def tile_flip_layout(layout): """, group="Tile Primitives", export=False, - namespace="", ) add_builtin( @@ -5752,7 +5753,6 @@ def tile_flip_layout(layout): """, group="Tile Primitives", export=False, - namespace="", ) diff --git a/warp/native/tile.h b/warp/native/tile.h index cd25c674..8df8e202 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -742,6 +742,16 @@ template inline CUDA_CALLABLE auto tile_alloc_empty() { WP_TILE_SHARED __align__(16) T data[M*N]; + +#if FP_CHECK + + for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) + data[i] = T(nanf("")); + + WP_TILE_SYNC(); + +#endif // FP_CHECK + return tile_shared_t(data); } @@ -1287,13 +1297,13 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ } // cuBLASDx follows the BLAS convention: matrices are col-major, so we swap A & B in the code below -template +template TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C) { using T = typename TileA::Type; WP_TILE_SYNC(); - fun_forward(T(1.0), B.data, A.data, T(1.0), C.data); + fun_forward(T(1.0), B.data, A.data, T(Add), C.data); WP_TILE_SYNC(); return C; @@ -1314,7 +1324,7 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, // backward for the out = wp.tile_matmul(a, b) syntax template -void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C, +void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C, Fwd adj_fun_forward, AdjA adj_fun_backward_A, AdjB adj_fun_backward_B, TileA& adj_A, TileB& adj_B, TileC& adj_C, TileC& adj_ret) { using T = typename TileA::Type; diff --git a/warp/optim/adam.py b/warp/optim/adam.py index cce2eff6..fb2d0064 100644 --- a/warp/optim/adam.py +++ b/warp/optim/adam.py @@ -50,6 +50,26 @@ def adam_step_kernel_float( params[i] = params[i] - lr * mhat / (wp.sqrt(vhat) + eps) +@wp.kernel +def adam_step_kernel_half( + g: wp.array(dtype=wp.float16), + m: wp.array(dtype=float), + v: wp.array(dtype=float), + lr: float, + beta1: float, + beta2: float, + t: float, + eps: float, + params: wp.array(dtype=wp.float16), +): + i = wp.tid() + m[i] = beta1 * m[i] + (1.0 - beta1) * float(g[i]) + v[i] = beta2 
* v[i] + (1.0 - beta2) * float(g[i]) * float(g[i]) + mhat = m[i] / (1.0 - wp.pow(beta1, (t + 1.0))) + vhat = v[i] / (1.0 - wp.pow(beta2, (t + 1.0))) + params[i] = params[i] - wp.float16(lr * mhat / (wp.sqrt(vhat) + eps)) + + class Adam: """An implementation of the Adam Optimizer It is designed to mimic Pytorch's version. @@ -75,10 +95,20 @@ def set_params(self, params): self.v = [None] * len(params) # reset second moment for i in range(len(params)): param = params[i] + + if param.dtype == wp.vec3: + dtype = wp.vec3 + elif param.dtype == wp.float32: + dtype = wp.float32 + elif param.dtype == wp.float16: + dtype = wp.float32 # we always use fp32 for moments, even if params are fp16 + else: + raise RuntimeError(f"Unsupported dtype for Warp Adam optimizer: {param.dtype}") + if self.m[i] is None or self.m[i].shape != param.shape or self.m[i].dtype != param.dtype: - self.m[i] = wp.zeros_like(param) + self.m[i] = wp.zeros(shape=param.shape, dtype=dtype, device=param.device) if self.v[i] is None or self.v[i].shape != param.shape or self.v[i].dtype != param.dtype: - self.v[i] = wp.zeros_like(param) + self.v[i] = wp.zeros(shape=param.shape, dtype=dtype, device=param.device) def reset_internal_state(self): for m_i in self.m: @@ -98,8 +128,6 @@ def step(self, grad): @staticmethod def step_detail(g, m, v, lr, beta1, beta2, t, eps, params): assert params.dtype == g.dtype - assert params.dtype == m.dtype - assert params.dtype == v.dtype assert params.shape == g.shape kernel_inputs = [g, m, v, lr, beta1, beta2, t, eps, params] if params.dtype == wp.types.float32: @@ -109,6 +137,13 @@ def step_detail(g, m, v, lr, beta1, beta2, t, eps, params): inputs=kernel_inputs, device=params.device, ) + elif params.dtype == wp.types.float16: + wp.launch( + kernel=adam_step_kernel_half, + dim=len(params), + inputs=kernel_inputs, + device=params.device, + ) elif params.dtype == wp.types.vec3: wp.launch( kernel=adam_step_kernel_vec3, diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 5e3616a8..34f5ff60 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -10,8 +10,8 @@ from PIL import Image -#wp.clear_kernel_cache() #wp.config.mode = "debug" +#wp.config.verify_fp = True #wp.config.verify_cuda = True wp.set_device("cuda:0") @@ -53,7 +53,7 @@ def create_array(dim_in, dim_hid, dtype=float): NUM_FREQ = wp.constant(8) DIM_IN = wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy -DIM_HID = 16 +DIM_HID = 32 DIM_OUT = 3 NUM_THREADS = 32 @@ -63,15 +63,17 @@ def create_array(dim_in, dim_hid, dtype=float): BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) +dtype = wp.float16 + def test_multi_layer_nn(): @wp.func - def relu(x: float): - return wp.max(x, 0.0) + def relu(x: dtype): + return wp.max(x, dtype(0.0)) @wp.func - def sigmoid(x: float): - return 1.0 / (1.0 + wp.exp(-x)) + def sigmoid(x: dtype): + return dtype(1.0 / (1.0 + wp.exp(-float(x)))) @wp.kernel def zero(loss: wp.array(dtype=float)): @@ -79,11 +81,11 @@ def zero(loss: wp.array(dtype=float)): @wp.kernel def compute(batches: wp.array(dtype=int), - input: wp.array2d(dtype=float), - weights_0: wp.array2d(dtype=float), bias_0: wp.array2d(dtype=float), - weights_1: wp.array2d(dtype=float), bias_1: wp.array2d(dtype=float), - weights_2: wp.array2d(dtype=float), bias_2: wp.array2d(dtype=float), - weights_3: wp.array2d(dtype=float), bias_3: wp.array2d(dtype=float), + input: wp.array2d(dtype=dtype), + weights_0: wp.array2d(dtype=dtype), bias_0: wp.array2d(dtype=dtype), + weights_1: wp.array2d(dtype=dtype), 
bias_1: wp.array2d(dtype=dtype), + weights_2: wp.array2d(dtype=dtype), bias_2: wp.array2d(dtype=dtype), + weights_3: wp.array2d(dtype=dtype), bias_3: wp.array2d(dtype=dtype), reference: wp.array2d(dtype=float), loss: wp.array1d(dtype=float), out: wp.array2d(dtype=float)): @@ -99,7 +101,7 @@ def compute(batches: wp.array(dtype=int), x = (float(row)/float(IMG_WIDTH) - 0.5)*2.0 y = (float(col)/float(IMG_HEIGHT) - 0.5)*2.0 - local = wp.vector(dtype=float, length=DIM_IN) + local = wp.vector(dtype=dtype, length=DIM_IN) # construct positional encoding for s in range(NUM_FREQ): @@ -107,14 +109,14 @@ def compute(batches: wp.array(dtype=int), scale = wp.pow(2.0, float(s))*wp.pi # x-coord - local[s*4 + 0] = wp.sin(x * scale) - local[s*4 + 1] = wp.cos(x * scale) + local[s*4 + 0] = dtype(wp.sin(x * scale)) + local[s*4 + 1] = dtype(wp.cos(x * scale)) # y-coord - local[s*4 + 2] = wp.sin(y * scale) - local[s*4 + 3] = wp.cos(y * scale) + local[s*4 + 2] = dtype(wp.sin(y * scale)) + local[s*4 + 3] = dtype(wp.cos(y * scale)) - # write input back to array so that torch can use it + # # write input back to array so that torch can use it input[s*4 + 0, linear] = local[s*4 + 0] input[s*4 + 1, linear] = local[s*4 + 1] input[s*4 + 2, linear] = local[s*4 + 2] @@ -141,31 +143,32 @@ def compute(batches: wp.array(dtype=int), # output layer w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID) b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1) - o = wp.tile_map(sigmoid, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS)) + o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS)) # untile back to SIMT output = wp.untile(o) # compute error - error = wp.vec3(output[0] - reference[0,linear], - output[1] - reference[1,linear], - output[2] - reference[2,linear]) + error = wp.vec3(float(output[0]) - reference[0,linear], + float(output[1]) - reference[1,linear], + float(output[2]) - reference[2,linear]) # write MSE loss wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*BATCH_SIZE)) + # image output for i in range(DIM_OUT): - out[i, linear] = output[i] + out[i, linear] = float(output[i]) - weights_0, bias_0 = create_layer(DIM_IN, DIM_HID, dtype=float) - weights_1, bias_1 = create_layer(DIM_HID, DIM_HID, dtype=float) - weights_2, bias_2 = create_layer(DIM_HID, DIM_HID, dtype=float) - weights_3, bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=float) + weights_0, bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) + weights_1, bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) + weights_2, bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype) + weights_3, bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype) - input = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_IN) + input = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) # # reference @@ -185,48 +188,98 @@ def compute(batches: wp.array(dtype=int), optimizer_inputs = [p.flatten() for p in params] optimizer = warp.optim.Adam(optimizer_inputs, lr=0.001) - max_iters = 500 - - for i in range(max_iters): - - # create randomized batch indices - batches = wp.array(rng.integers(low=0, high=IMG_WIDTH*IMG_HEIGHT, size=IMG_WIDTH*IMG_HEIGHT, dtype=np.int32)) - - - for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): - - loss.zero_() - - with wp.Tape() as tape: - wp.launch( - compute, - dim=[BATCH_SIZE], - inputs=[batches[b:b+BATCH_SIZE], - input, - weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - weights_3, bias_3, - reference, - loss, - output], - 
block_dim=NUM_THREADS) - - if b == 0: - print(f"Iter: {i} Loss: {loss.numpy()}") - - tape.backward(loss) - - # cosine weighted decay - optimizer.lr = 0.5*0.01*(1.0 + math.cos(float(i)/float(max_iters)*math.pi)) - optimizer.step(optimizer_grads) - - tape.zero() - - # uncommenting this line fixes convergence - # wp.synchronize() - - + num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) + max_iters = 5000 + max_epochs = int(max_iters/num_batches) + + # create randomized batch indices + batches = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) + rng.shuffle(batches) + batches = wp.array(batches) + + with wp.ScopedTimer("Training"): + + for i in range(max_epochs): + + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): + + loss.zero_() + + with wp.Tape() as tape: + wp.launch( + compute, + dim=[BATCH_SIZE], + inputs=[batches[b:b+BATCH_SIZE], + input, + weights_0, bias_0, + weights_1, bias_1, + weights_2, bias_2, + weights_3, bias_3, + reference, + loss, + output], + block_dim=NUM_THREADS) + + tape.backward(loss) + + verify = False + if verify: + + indices = batches[b:b+BATCH_SIZE].numpy() + + z_np = np.maximum(weights_0.numpy()@input.numpy()[:,indices] + bias_0.numpy(), 0.0) + z_np = np.maximum(weights_1.numpy()@z_np + bias_1.numpy(), 0.0) + z_np = np.maximum(weights_2.numpy()@z_np + bias_2.numpy(), 0.0) + z_np = np.maximum(weights_3.numpy()@z_np + bias_3.numpy(), 0.0) + + # test numpy foward + assert_equal(output.numpy()[:,indices], z_np) + + # torch + input_tc = tc.from_numpy(input.numpy()[:, indices]).requires_grad_(True) + + weights_0_tc = tc.from_numpy(weights_0.numpy()).requires_grad_(True) + bias_0_tc = tc.from_numpy(bias_0.numpy()).requires_grad_(True) + + weights_1_tc = tc.from_numpy(weights_1.numpy()).requires_grad_(True) + bias_1_tc = tc.from_numpy(bias_1.numpy()).requires_grad_(True) + + weights_2_tc = tc.from_numpy(weights_2.numpy()).requires_grad_(True) + bias_2_tc = tc.from_numpy(bias_2.numpy()).requires_grad_(True) + + weights_3_tc = tc.from_numpy(weights_3.numpy()).requires_grad_(True) + bias_3_tc = tc.from_numpy(bias_3.numpy()).requires_grad_(True) + + z_tc = tc.clamp(weights_0_tc@input_tc + bias_0_tc, min=0.0) + z_tc = tc.clamp(weights_1_tc@z_tc + bias_1_tc, min=0.0) + z_tc = tc.clamp(weights_2_tc@z_tc + bias_2_tc, min=0.0) + z_tc = tc.clamp(weights_3_tc@z_tc + bias_3_tc, min=0.0) + + ref_tc = tc.from_numpy(reference.numpy()[:, indices]).requires_grad_(True) + + l_tc = tc.mean((z_tc - ref_tc)**2) + l_tc.backward() + + # test torch + assert_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices]) + assert_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy()) + assert_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy()) + assert_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy()) + assert_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy()) + assert_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy()) + assert_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy()) + assert_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy()) + assert_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy()) + + # cosine weighted decay + optimizer.lr = 0.5*0.01*(1.0 + math.cos(float(i)/float(max_iters)*math.pi)) + optimizer.step(optimizer_grads) + + tape.zero() + + print(f"Epoch: {i} Loss: {loss.numpy()}") + + predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) predicted_image = (predicted_image * 255).astype(np.uint8) @@ -241,56 +294,7 @@ def 
compute(batches: wp.array(dtype=int), # print(output) # numpy - z_np = np.maximum(weights_0.numpy()@input.numpy() + bias_0.numpy(), 0.0) - z_np = np.maximum(weights_1.numpy()@z_np + bias_1.numpy(), 0.0) - z_np = np.maximum(weights_2.numpy()@z_np + bias_2.numpy(), 0.0) - predicted_image = z_np.T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) - predicted_image = (predicted_image * 255).astype(np.uint8) - - predicted_image_pil = Image.fromarray(predicted_image) - predicted_image_pil.save("test_tile_mlp_np.jpg") - - # test numpy foward - print("NumPy output close: ", assert_equal(output.numpy(), z_np)) - - # torch - input_tc = tc.from_numpy(input.numpy()).requires_grad_(True) - - weights_0_tc = tc.from_numpy(weights_0.numpy()).requires_grad_(True) - bias_0_tc = tc.from_numpy(bias_0.numpy()).requires_grad_(True) - - weights_1_tc = tc.from_numpy(weights_1.numpy()).requires_grad_(True) - bias_1_tc = tc.from_numpy(bias_1.numpy()).requires_grad_(True) - - weights_2_tc = tc.from_numpy(weights_2.numpy()).requires_grad_(True) - bias_2_tc = tc.from_numpy(bias_2.numpy()).requires_grad_(True) - - z_tc = tc.clamp(weights_0_tc@input_tc + bias_0_tc, min=0.0) - z_tc = tc.clamp(weights_1_tc@z_tc + bias_1_tc, min=0.0) - z_tc = tc.clamp(weights_2_tc@z_tc + bias_2_tc, min=0.0) - - ref_tc = tc.from_numpy(reference.numpy()).requires_grad_(True) - - - l_tc = tc.mean((z_tc - ref_tc)**2) - l_tc.backward() - - #z_tc.backward(tc.ones_like(z_tc)) - - # test torch - print("Torch output close: ", assert_equal(z_tc.cpu().detach().numpy(), output.numpy())) - #print("Torch loss close: ", assert_equal(l_tc.cpu().detach().numpy(), loss.numpy())) - #print("Torch input.grad close: ", assert_equal(input.grad.numpy(), input_tc.grad.cpu().detach().numpy())) - - print("Torch weights0.grad close: ", assert_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy())) - print("Torch bias0.grad close: ", assert_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy())) - - print("Torch weights1.grad close: ", assert_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy())) - print("Torch bias1.grad close: ", assert_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy())) - - print("Torch weights2.grad close: ", assert_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy())) - print("Torch bias2.grad close: ", assert_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy())) From e1d1012621f12b58bc52b0ff8944d2cd086a048e Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Thu, 10 Oct 2024 16:07:07 -0700 Subject: [PATCH 067/102] MathDx 24.08 support in Warp + Tile --- .gitlab/ci/mathdx-support.yml | 8 ++-- examples/tile_fft.py | 4 +- examples/tile_matmul.py | 12 ++--- warp/builtins.py | 83 +++++++++++++++++----------------- warp/native/mathdx.cpp | 4 +- warp/native/warp.cu | 6 ++- warp/native/warp.h | 2 +- warp/tests/test_tile_mathdx.py | 12 ++--- 8 files changed, 67 insertions(+), 64 deletions(-) diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index bfca61fe..d7879267 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -36,7 +36,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/30/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz + 
$ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/54/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -59,7 +59,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/30/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/54/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -101,7 +101,7 @@ linux-x86_64 test: - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -U "jax[cuda12]" - - python -m pip install --upgrade nvidia-mathdx==24.4.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 + - python -m pip install --upgrade nvidia-mathdx==24.8.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - python -m pip install -e . - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents @@ -118,7 +118,7 @@ linux-aarch64 test jetson: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - !reference [.snippets, install-python+warp-aarch64] - python -m pip install -U "jax[cuda12]" - - python -m pip install --upgrade nvidia-mathdx==24.4.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 + - python -m pip install --upgrade nvidia-mathdx==24.8.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast diff --git a/examples/tile_fft.py b/examples/tile_fft.py index edc6c101..f47e0b4a 100644 --- a/examples/tile_fft.py +++ b/examples/tile_fft.py @@ -16,8 +16,8 @@ def fft_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)): i, j, _ = wp.tid() a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N) - wp.tile_fft_dx(a) - wp.tile_ifft_dx(a) + wp.tile_fft(a) + wp.tile_ifft(a) wp.tile_store(y, i, j, a) diff --git a/examples/tile_matmul.py b/examples/tile_matmul.py index faedbee6..57b94bbc 100644 --- a/examples/tile_matmul.py +++ b/examples/tile_matmul.py @@ -10,21 +10,21 @@ @wp.kernel -def matmul_tiled(ga: wp.array2d(dtype=wp.float64), gb: wp.array2d(dtype=wp.float64), gc: wp.array2d(dtype=wp.float64)): +def matmul_tiled(ga: wp.array2d(dtype=wp.float32), gb: wp.array2d(dtype=wp.float16), gc: wp.array2d(dtype=wp.float64)): i, j, _ = wp.tid() a = wp.tile_load(ga, i, j, m=M, n=K) b = wp.tile_load(gb, i, j, m=K, n=N) c = wp.tile_zeros(m=M, n=N, dtype=wp.float64) - wp.tile_matmul_dx(a, b, c) + wp.tile_matmul(a, b, c) wp.tile_store(gc, i, j, c) -A = np.ones((M, K), dtype=np.float64) -B = 3 * np.ones((K, N), dtype=np.float64) +A = np.ones((M, K), dtype=np.float32) +B = 3 * np.ones((K, N), dtype=np.float16) C = np.zeros((M, N), dtype=np.float64) -A_wp = wp.array2d(A, dtype=wp.float64) -B_wp = wp.array2d(B, dtype=wp.float64) +A_wp = wp.array2d(A, dtype=wp.float32) +B_wp = wp.array2d(B, dtype=wp.float16) C_wp = wp.array2d(C, dtype=wp.float64) wp.launch(matmul_tiled, dim=[1, 1, BLOCK_DIM], 
inputs=[A_wp, B_wp, C_wp], block_dim=BLOCK_DIM) diff --git a/warp/builtins.py b/warp/builtins.py index d8a38a5e..d3a9de7f 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5573,9 +5573,6 @@ def tile_matmul_generic_lto_dispatch_func( "tile_matmul() arguments must be tiles of float16, float32 or float64, vec2h, vec2f, vec2d entries" ) - if any(arg.type.dtype != out.type.dtype for arg in [a, b]): - raise RuntimeError("tile_matmul() arguments must have the same type") - if (a.type.N != b.type.M) or (a.type.M != out.type.M) or (b.type.N != out.type.N): raise RuntimeError("tile_matmul(A, B, C) requires sizes of A, B and C to be consistent for a matmul") @@ -5585,34 +5582,21 @@ def tile_matmul_generic_lto_dispatch_func( out.type.storage = "shared" template_args = [accumulate] - # Real - if out.type.dtype == float16: - dtype = "wp::float16" - precision = 2 # COMMONDX_PRECISION_F16 - element_type = 0 # CUBLASDX_TYPE_REAL - elif out.type.dtype == float32: - dtype = "wp::float32" - precision = 3 # COMMONDX_PRECISION_F32 - element_type = 0 # CUBLASDX_TYPE_REAL - elif out.type.dtype == float64: - dtype = "wp::float64" - precision = 4 # COMMONDX_PRECISION_F64 - element_type = 0 # CUBLASDX_TYPE_REAL - # Complex - elif out.type.dtype == vec2h: - dtype = "wp::vec2h" - precision = 2 # COMMONDX_PRECISION_F16 - element_type = 1 # CUBLASDX_TYPE_COMPLEX - elif out.type.dtype == vec2f: - dtype = "wp::vec2f" - precision = 3 # COMMONDX_PRECISION_F32 - element_type = 1 # CUBLASDX_TYPE_COMPLEX - elif out.type.dtype == vec2d: - dtype = "wp::vec2d" - precision = 4 # COMMONDX_PRECISION_F64 - element_type = 1 # CUBLASDX_TYPE_COMPLEX - else: - raise RuntimeError("Unsupported datatype") + def cublasdx_type_map(dtype): + if dtype == float16: + return ("wp::float16", 3, 0) + if dtype == float32: + return ("wp::float32", 5, 0) + if dtype == float64: + return ("wp::float64", 6, 0) + if dtype == vec2h: + return ("wp::vec2h", 3, 1) + if dtype == vec2f: + return ("wp::vec2f", 5, 1) + if dtype == vec2d: + return ("wp::vec2d", 6, 1) + raise RuntimeError("Unsupported input type in tile_matmul") + # generate the LTO M, K = a.type.M, a.type.N @@ -5620,7 +5604,17 @@ def tile_matmul_generic_lto_dispatch_func( num_threads = options["block_dim"] arch = options["output_arch"] - def make_function(M, N, K, tA, tB): + def make_function(M, N, K, adtype, bdtype, cdtype, tA, tB): + + (a_dtype, a_prec, a_type) = cublasdx_type_map(adtype) + (b_dtype, b_prec, b_type) = cublasdx_type_map(bdtype) + (c_dtype, c_prec, c_type) = cublasdx_type_map(cdtype) + + if (a_type != b_type or a_type != c_type): + raise RuntimeError("time_matmul(A, B, C) requires all inputs to be real or complex") + + element_type = a_type + # Warp follows Numpy: matrices are row-major # But cuBLASDx follows BLAS: matrices are col-major # So we have to flip M <-> N and A <-> B @@ -5631,7 +5625,7 @@ def make_transpose(t): return 1 # CUBLASDX_TRANSPOSE_MODE_TRANSPOSED raise RuntimeError("Invalid transpose mode") - lto_symbol = f"dot_{M}_{N}_{K}_{tA}_{tB}_{precision}_{element_type}" + lto_symbol = f"dot_{M}_{N}_{K}_{tA}_{tB}_{a_prec}_{b_prec}_{c_prec}_{element_type}" # early out if LTO for this combination already exists for this module if lto_symbol in builder.ltoirs: @@ -5650,7 +5644,9 @@ def make_transpose(t): N, M, K, - precision, + b_prec, + a_prec, + c_prec, element_type, make_transpose(tB), make_transpose(tA), @@ -5663,7 +5659,7 @@ def make_transpose(t): lto_code = f.read() builder.ltoirs[lto_symbol] = lto_code - builder.ltoirs_decl[lto_symbol] = f"void 
{lto_symbol}({dtype}, {dtype}*, {dtype}*, {dtype}, {dtype}*);" + builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({c_dtype}, {b_dtype}*, {a_dtype}*, {c_dtype}, {c_dtype}*);" return lto_symbol, lto_code @@ -5683,13 +5679,16 @@ def tile_flip_layout(layout): b_layout = tile_layout_mode(b.type) c_layout = tile_layout_mode(out.type) - (fun_forward, lto_forward) = make_function(M, N, K, a_layout, b_layout) # C += A * B + # C += A * B + (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a_layout, b_layout) + # adjA += adjC * B^T (fun_backward_A, lto_backward_A) = make_function( - M, K, N, c_layout, tile_flip_layout(b_layout) - ) # adjA += adjC * B^T + M, K, N, out.type.dtype, b.type.dtype, a.type.dtype, c_layout, tile_flip_layout(b_layout) + ) + # adjB += A^T * adjC (fun_backward_B, lto_backward_B) = make_function( - K, N, M, tile_flip_layout(a_layout), c_layout - ) # adjB += A^T * adjC + K, N, M, a.type.dtype, out.type.dtype, b.type.dtype, tile_flip_layout(a_layout), c_layout + ) return ( ( @@ -5803,10 +5802,10 @@ def tile_fft_generic_lto_dispatch_func( if inout.type.dtype == vec2f: dtype = "wp::vec2f" - precision = 3 # COMMONDX_PRECISION_F32 + precision = 5 # COMMONDX_PRECISION_F32 elif inout.type.dtype == vec2d: dtype = "wp::vec2d" - precision = 4 # COMMONDX_PRECISION_F64 + precision = 6 # COMMONDX_PRECISION_F64 else: raise RuntimeError("Unsupported datatype") diff --git a/warp/native/mathdx.cpp b/warp/native/mathdx.cpp index 1dca0afa..75a83e3d 100644 --- a/warp/native/mathdx.cpp +++ b/warp/native/mathdx.cpp @@ -41,7 +41,9 @@ WP_API bool cuda_compile_dot( int M, int N, int K, - int precision, + int precision_A, + int precision_B, + int precision_C, int type, int tA, int tB, diff --git a/warp/native/warp.cu b/warp/native/warp.cu index 7ae7b634..bb6bb8e7 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -2905,6 +2905,7 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); } CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); + CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/../external/cutlass/include").c_str())); size_t lto_size = 0; CHECK_CUFFTDX(cufftDxGetLTOIRSize(h, <o_size)); @@ -2925,7 +2926,7 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ return res; } - bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads) + bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int tA, int tB, int num_threads) { CHECK_ANY(ltoir_output_path != nullptr); @@ -2940,7 +2941,8 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_FUNCTION, cublasDxFunction::CUBLASDX_FUNCTION_MM)); CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_EXECUTION, commonDxExecution::COMMONDX_EXECUTION_BLOCK)); CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, 
cublasDxOperatorType::CUBLASDX_OPERATOR_API, cublasDxApi::CUBLASDX_API_BLOCK_SMEM)); - CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_PRECISION, (commonDxPrecision)precision)); + std::array precisions = {precision_A, precision_B, precision_C}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_PRECISION, 3, precisions.data())); CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_SM, (long long)(arch * 10))); CHECK_CUBLASDX(cublasDxSetOperatorInt64(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TYPE, (cublasDxType)type)); std::array block_dim = {num_threads, 1, 1}; diff --git a/warp/native/warp.h b/warp/native/warp.h index 045d5f0a..f913c006 100644 --- a/warp/native/warp.h +++ b/warp/native/warp.h @@ -319,7 +319,7 @@ extern "C" WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size); - WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision, int type, int tA, int tB, int num_threads); + WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int tA, int tB, int num_threads); WP_API void* cuda_load_module(void* context, const char* ptx); WP_API void cuda_unload_module(void* context, void* module); diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index 50b71404..2c8d7180 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -26,7 +26,7 @@ @wp.kernel() def tile_math_matmul_kernel( - ga: wp.array2d(dtype=wp.float64), gb: wp.array2d(dtype=wp.float64), gc: wp.array2d(dtype=wp.float64) + ga: wp.array2d(dtype=wp.float16), gb: wp.array2d(dtype=wp.float32), gc: wp.array2d(dtype=wp.float64) ): i, j = wp.tid() a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K) @@ -39,8 +39,8 @@ def tile_math_matmul_kernel( def test_tile_math_matmul(test, device): rng = np.random.default_rng(42) - A = rng.random((TILE_M, TILE_K), dtype=np.float64) - B = rng.random((TILE_K, TILE_N), dtype=np.float64) + A = rng.random((TILE_M, TILE_K), dtype=np.float64).astype(np.float16) + B = rng.random((TILE_K, TILE_N), dtype=np.float32) C = np.zeros((TILE_M, TILE_N), dtype=np.float64) A_wp = wp.array(A, requires_grad=True, device=device) @@ -57,14 +57,14 @@ def test_tile_math_matmul(test, device): ) # verify forward pass - assert_np_equal(C_wp.numpy(), A @ B) + assert_np_equal(C_wp.numpy(), A @ B, tol=1e-2) adj_C = np.ones_like(C) tape.backward(grads={C_wp: wp.array(adj_C, device=device)}) - assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T) - assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C) + assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T, tol=1e-2) + assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C, tol=1e-2) @wp.kernel() From 
241fd3e217adf55e7d41cdef35c3cc836e306409 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 15 Oct 2024 08:08:55 +0000 Subject: [PATCH 068/102] Fix for tile MLP unit test --- warp/tests/assets/pixel.npy | Bin 0 -> 196736 bytes warp/tests/test_tile_mlp.py | 229 +++++++++++++++++++----------------- 2 files changed, 120 insertions(+), 109 deletions(-) create mode 100644 warp/tests/assets/pixel.npy diff --git a/warp/tests/assets/pixel.npy b/warp/tests/assets/pixel.npy new file mode 100644 index 0000000000000000000000000000000000000000..c6bfb9e1593af61b8b490b11035f9ff57c2aa265 GIT binary patch literal 196736 zcmd44hnpPNb>_Vmftl%?yQ*`j&N=6t({%TAPtF4jav}hN00{;$ilj(U%(6^}l_gpB zTB0ReruLD8EPE}ly|#D#eSgsR-m0EPBMOq~&r7{eJ-2Im8l#!tIrp4%&%O0Qe`RsF z@f-i;>Hl)ud3fsj$!pGx+4+&=U3=3nnWzQ^TqyKF9p$z`XVR=LwGb{a)?gV1Uansow`hG(RBdgZ)M zKBti!r^Ux9$uYI$m`Z$1DSARF!gWp|oRbTViTTF_{NucZCl= zF^E5Rd}03h{K69ukAFCf1jLOW=ZbH9nDml4$RM4c z6?gOg>+E}G$41!#G`S<2GsuoZM)}-5df0uFV*Z!vu8)$%Kg)N*@8~Cppnk2 z#q)CEyo5h5l)(RktPQVyMoa`ov%PM!UHu$Q}&ad;zo5tz+zz)uObR6eferpp)n{B1lUMAq^$K z*r=0P3@V#NW4G$;R=wS7a99j(i@|3%23@9*+Z6R#;{j(n;?Jj(g=)J!-X3h98y#F- z-oH9rKh^H77W0*SG?Wf_+`!*yciN3E8~9hcEHal#>NH9jqu8Ry|5Lt+<{MNCI>mw( zzmtDA|C}K_KL1lIm=1oa2|Nsb-2YWvHvXo^>Ut8wm?aE3emzB#lKR* zZZ6AFZ^Faf)bVk|1NQEt>rVFG%s7~%{81Cw5EHL36A z-#Xzx>@m;sU#xaIBbNXDtK+>Z!?nY9ceQlHztib+ zgE()0|Mf0U&dw1JbN=_gb8_e5{-5wKT~Lb`zCiyE{#CLCTD}1ObxHy2`9*O4EdL^t z2Jnk4Iw4~q@PmAZS$YR@8>RrMi;=pl5~mec!apI3kS}#u#8#8oVw9Q;QZP)@BAOCw z^$NzO_js9b#1)Nu;z>_5?t%$;JO-CT%b5UX0SJb`31f><$(Sj-RqJ5%4#wbMj1H^O zZ8iEFaQ^1I`A?Oroi6y_I=i%YWoh^FU={qYl=GDW>;LTzr_I4IHiOklS0WW}VQi5t=aXu>2$V`S-*>@cf(!AP{;S{z3aMjekJ+Z1ge<{5|6@ihu5N z*bhIZlRWVV6S$x1FN%MibY3H!qs4P7(VR>$ClP!x@sCo(JF0#>)%V6)L?v(jWz7%>;H!q{R_GgjKh=xjE<&1SH%{CgOapXEQ| zo%R2zO0C-)Z4WokF6~|!ZC@I!oNRaQ@Xt8yR)>|b8yGvyFlw7sWw)qpW(*XJRcbMd z%|@|NCp6Fky&6*hkCx5-QvT=XeuGN^kPfgJ0SW#AJQw^=rvM%s|1ZEIc*ppHAa@H1R$I~ytflp&;4DcTALok?5G#F1Jd^TB_eLpwm zzDp~9LM#3fOn|e3S^qyv{^KzJ@b{2(V^-7nUn@mKdc;3}PRJvX{3A4t`SG6Vn2Y~~ z2>wNUg;c=t5BORB`Fa&3Fi>3B5Awx~VdD1T`Qg|(kqFa^JP+jSRxdO_!5{KB{P~dZjXgCfk@0vCW3?~7ITF{HoxEE za_Q}Mjn%4#Ay_Obh+$}(O>491>~@30ZgSYoPR8W5nf(qk=l>&K68~o+{z5ucf&U+F z4mZw@!2i~z;mXNQd!>@E7GvQ|&|`L)%}%4msb!p$&7rp0)eeTTGnCDyvN4F$#3q9X zKly}zB~K%tr-%t0e?YU zlYfP1UMgGw{~Xo7)CBH%1K<~n1agT`DHEz#|9^*nHGc{}oABO=dAP7=)&$%PF$dgd z`Ip&@QmbBS(Me2*-Zc_hB~Z#hK3^i{i^P1PP#_fY1;Tl$OsJ;hCbQP*f(bY<3$P}D zt1}#R1VW5IX!Qjw9g*1^-C@AQVRx8pc9YZgaQ-Wu?qGAcerB|N zd1>?FaQVb6{}!Lg;?-Nd8pflvd(;lM+UcYm4%+Te*=%xl3Xo#jGH8T4gg`337ABxr zprmtb3FyJ~pL_V{vfZiMpALRi|3AzBbUYd)R)=ms9_*k_spV!hSxH zif5ou14O_tVm-fzO?6S~nG|-|@SY3u?_2>t$Y=Bti%yL1HK+wzl~ApaD5OHMh$n)b ziWUUI1-_662}BEG3121`QMBA_(K(!Ei7l9%Zj-|a z_>DIBf1BBBL;Uj~|IhM2SUpAHUq3$>9dz3(wcN!2`%+GCf^o+T?ugbG)dwPaf7sv; z8GQkr#|JA=SS?6B2>*JPe@uul0Tjm&1wE|(c{l&Z_&n4_Nx^eUJ~#MP`JWPw0muLB zo#~jP{r)T7z}>uu=UMEhZ{Qf4ULR;L><5zMcbfk`4$uD){3HJr^M5)2LjEK=LbdZA ziG-8*M=cZ5DEO#|3E*!H{*mwk{-Zh%(pwq){G^^k(pw^jS%!cI)`7PnKmz&rE)`i2(dVa)pT2D9k8nq7>;idweEuzzpDL;m7#M z1gsvv#qG7Cc0~Ahn4NaB=RW>X|14%wRq)^6?5~^}Zd@3yog0kyyRAF?C%eH!#~*8X zVoi6v0U6^4J2CLXvOJG8l8U1%E&MUAO$w5MVJCKDjua+P|FboK?^F;f>Ll? z%0DjR9~Ukh6U;vW2^YwfXaVEnf+O+gk4?2O4~rm1!9y)*{+MXtxKuE&6!O%fW3>2~ zMvUc#W3=cAa>Yx@aV=ZepW^y3;Af`>j_27chx9DZxWj!W2HaRq+-A? 
z77HPfNQjt+6R`pTPXzxj;z=cZgnbGQN<_ z2NMWcarO8tNEuu%v%_H~@xR^dvYWk5Q^0LP{U_qHMuW~o#FtGc5&yUQ>%HZZ{q^(x zwR8Q^e!Dd;=gNghC>QXQx09u;K#m!2R?- z`p&VP^@;@gM%<=j2JpKX?{IlfqF(dGCAE^OPB3O{}=jHe_2^ZjG5G-+j4%P%< z6vw5aIaEF=Il>;|v;n_`CIJlV`Qg|(_(8sk75GPmQ)0KsY-WYkq%a#4Mx9)*mC_m^ zMF~_0^_3zeVjh`PE|JP4_)jF2ig9JFfJFWz_$PS>59xqhCRD;$)DoJO>U1)LL18w- z3TT@{=WrWbUNe!;;tE(j$OQ=hHjm%R@$Y2$hY7eG7O%?;{;~eW@gI-)GMOampRN9S zcjaVn{oLZ(*~QUbvo$Ve%SD#|?xk|~VySnjym+O$biFxxsx`jR#&vY9J-pm(pDe)y zLV3pN)0#~(11-{00u9BdReVZ`RYD{Jgnt!T0-nQf82kf!j(?2daPHC6D1@l_z*vq% zoIh_)&IkXzUxWW68cEtaYYz9{pS?33PuZU3fBIDnrfpL{3uJm)Oiwi&17qab7ReYvo#vjH1N~HD96P z%asC|LWHPZCX-4bTx1f6GXarE0C6S&_@xp7oWDvb(P(7w|9ZU~{2Pr*qe%rTFk30c zu4NoLn^W&_8{K|0;U6=C-|F$AKx%P0%}$2}ayzX)mjxA_NAurX+G{jN#Y~Cgf9cr< zG=8qN`eJ9}<;9Jc7T3Sr+jzOZ_TpfCYjJS3T|dYtx&cqZWVR{vIuvigzX1GW3P6>Z zmhmYGUnQJZfPVp2;mH&L$T|3NM}|KE`O@9OFI@e5l`=J!x5UI8uwKjtLZoVncPgaw zlmd}0vRy>)1b&B6VKXACCqg9?CcqW*@zQ3JSdC({USc52vJxHKJ}pzLr3#fuq7q1y z04g2$Txv}qsn4r`8TUg7Gecv zhK3cmyhd-(g80W5KnTRJ{1fYyN$*j{+C~ELu;@1Hs0)S zy*1o?YXEJ1b-4M)Xzk_U__==fN+Yux_Z4hbw_2-1Cx(zTV^F9PEKpb|k@0CMpMqBb z=dAw+`BM`h{0lJc9+L}M3D|m(2=VH1nedoI_=J?`m`ngYq2M1w9-$N*Q_sXaT=36b zx%hw9y-&vvV}ABN-psw6l6=o#_RaT<@y_Y@P6xPY#NzBs=g&PbPpSS@2X|S4e8D8= z-7kZjhyEWLDVX>;F#$AUA%+pp!w}?>IY=Ry$4ixDo)Rz6Ql3W2rzK=SNfwmsk}>PA zkR8DJ&neMLgq)W`z%N13(ry6%ci_K6J_&A&^hnax@gsm8exxrAjJSidp7? zJ&{-;7Aqt~h=GoRAHYA2f-sg!gmM|2|Ac?NUTI{pM|%Tgfxy2>WiTmW1(*ojJ`?=E zKWy^{Z9c!v>t)<7tJ7fx{~njs5B@!tJMn+ipUsl>ul8W0w{p0+aUSsx{C~4KE@jK5 zXc+PT#@maVZ!d1W)8GC2(%yTcBaL^yw!Hb~c=_di??yYnn+nvNjE~l6N$`UyK&!?S zz@y1>*b)DjRcH90`hU3)4RWMXgnB8f<41+lCuE|D6v8jb`M;sye@P{T!@tYte}VkZ za(fT?Q@roNe)^JoKVHs$qfxqGl>evkZv_8xP(4pf0KZ-l{Khc~=`blu%&Dd1N{Q$2 zRgGjGZ4&4d(#p^;M63WUbBKkI3$U7(OOLCSJp4`dw8)|d;ROCE|HR?5*n@ne1E_@> zS@O}1Myo|?rC6a5%M}v2Tq2W;C33M?CgLQKOE{4nfaM?LbK<4|mVW{GN2UM^I*NbD z6ks+eEe3KWGO0{vwT00*-9{fOLJ?ai>I_DlfuO_Zw|m^oX{B?dDMB`ZhdpXRoDATf76X3x zZ8-_@#Yl09xL{u*krE;5Au$hkosxeK{}Qo)i+{jBl3$pBPDkQ@lU`wFg#nQXt5I5y z*Ak4_BMDb5>5V45;ix+hbb34toWI+}JdXdrkAF9qKk?D($&Xe~ePbOu{A3N|6CbV| zyuZBt?r8b-e)m>0eOP|13$r~UME8&-Dlt9t>gPJER?vMLO{*OQMt9O=zXQ`eY z|5E-5`6Sx8vl_+5B>!ptf5Ja&0%*+z=Q`Ouk^((x-68$D5P8Wi_Hp5pLBWS~GGYQM zk_#`WkPb)%aQ?{Y^jNDR?QKGMc$-0NBfZTNhi}qg6<`8C21YHZaH4xn$vS_yedM_k z88Tg&LMnr^Cp8~T1rieeLsGWv1M*=G(?AIEKZ%8A@?Rto8m$cZuhF1@^XCNrZ`6T* zEXx`K0b3;Iil@DaOdydC#1j5c$OZlZ|F6z}xeyL!eD0IqScOh}vV!X&#-FSle2nQ} zwENz0lSyQ?OKlz*9Msl%gi`PJ?)O%zbuU2?nulZ!@)`m<0Y#}$S~ zUZNraY1CpZCDE!Sx;v8U)Dvk?H&aQpDv=JgYZ>VZV=>_CG&4Zj$0Tsh=BBKUa$kDu|5P0T~-qVlu|t zlzb!JNd_XLQfyH0j7pwKg`0enl5bKi7?le~vRMF213V%H&m>!b%t!}djBw^?QG?K^ z3FhV4qcKmIS75&e5oZGU5-h@q4_ENme@lQ_gT}4_>70^?WWFlIv1);i79svYuL4={ zA`)74Bo{WK-(Caeg=hoAN~qBw*6Sr&orKni)szG#AV>T|h===!5lCbbF`U0tOt6;{ z!dXcK5~*+^37UTZKedMeR5G1LZqUh1fZwQKj7qDK4AAOnGAgV#8Y^O$dtmmXL?*|74;8S0WnBW-^s(uQS+MTs_s_yfBIX zTT8`ksSpWf{H_zfxdI)0V|o9RF{j;+mbO0_Y<|7B@@8Y{rAqT!rnnyoHEd>|T0!%r zvIU7`9+FBIIAJW7%!$NvLJ{_`kis(Hmy7sf@tjmXuR)irp9*EoiH1EhaOOskt2p*l zSA*4!5YbAo(DNkAPJh(ma2XlKXtfv^tKI@JdXrUeW{ehxnQ_@{kb{-eW^>x@E>`H@ zbJ{Hq#)7w*XtQ2!(nzg3C8MXTMvc{cM-~$qGZu8!>MUlh*{rjWfzC|ES}eX}j8Cva zW+JnZG8rf{CnJTIlRHMrV#M1fpNwzegl}Oi_#Qa%29VXH zV@$Z>LsZzrVb-Zl8p=e|Mp|yv$V?i!k(QY?kkm{|%#_$fiOp(AWKs!?aNu$tB7ZU& zv77^PkWb(T`7HLZJ|irSSQa1Bvsb)i1?xjIKO76Ug0rBM&&j37RB|4rL{5hm7a?dT zP0q9cGSMOKTRl2bg1bY2WsFPfK# z7a$BokZ?{YJT4G`e-wXE@Fa`runt-;u?2O}ioGxjHun>Y7t*6^+2tGAwWsszPZu|y zE^pkZtX(UO&gMJY+45o{jyWkF@P)jPC+Kkpy`He&8x8prkw7XDOs7Jr^hD`&=t#Kl zi^sj;pv&uUIxScSaXAnT;y2^*`aQmY#~<`U+#nG0LEfN$Dt{<2l`lZvc1QSjPSczI z$)^Q^G!YGdE;c|q2@+V95>q)T{=fv-sg{TJeKh(S(D{RmK*>&^Xo1Gg7qm2B 
zy9&0hSkb7XFj^RUqlU})1*?GNj`1Q`&k9!xs(L481=iGbUv+wrKe6_d#<<9!`R{vzZGA^YWxo9CB z$*04)R4|(k<#Ul@Ay%y>YV~BJooaR`YIIXair3rGNM~kf zR=byR_^piBVsRJ^Cas2|R1&oURS2F&i4c*LAW(%yA_gI%)4_+Ug<5P@LOg?d4qP{d z-gN{pV#uUL=@L5!<>*x~8L`qNr8GhnR;*<6h-1}qbpCTYU&w?4!K_FZw3_6_Xl$^W z}L-9&8TB?PL)ex?wR;bjElt9f zvY(4`*)@K*KIk(>z`V~K^qYerYdB(yrQE5UKT{5a{7fT~YeeDyBS{Yz|DXhVKmYNN z7yRc4|3i-d{>FvD2Iv1d{=M0N{otd~!ADDb9}IWjAMAktulF{;)>(hMz5Zr%?bX`w z`C{|QQ~~jS$7zjd6?**NK%W)*%t`kG2R~VGXAMEfC-6%FzX-t|50iyGY%HxLM%OEc zUtN0gcQ&8-Z11I?o_y`+hhO>I!`J@i;FZ7F|MH*j-u`U&#&>ovezbG&>ek9rYxNWT z;z~C=XlMHEbRUC3VZ2gU*(j}USJwAyTZi@SQ?>0=)yj#D9t!#gsZgo<% zIu<+Y(3l3QFZ3|66hA6h98txi?Mm)MuwAkV&&d;jD;vv=4JIr=Dk z=Gs)nA;#oifw(dLx!mFk_XFT($n?h9?l{v~O0@>@W-r?8h8v5a#wgTW3pcj|&0Sx8 z%TrmiXBMrox;a#0!X+kDum%biZ`SCH>Fs`viBV9LR3?(jP&VYtXQY%1RSHs#S1WN9 z5~)NKq=v*A3cJXq2Jmks`$x=HB7;StGs_X9s#RhIK8oyo=TX@FPPWyMvks6ZlU?vg z$qE1Hb9)H?luV(V#6PqA!wNVbbv8>BvAIjvX{j|Iqk&rhy2D!&=d<; zqCra}WQ{~^(YPa#@nnm^Ts4ww#uS1q5DoWS zOjNHhBC8>NhP(w#0i@f6Z8$+QEjkaek`Dd_QX$Fu#WJ2$%XcQMjqU9AbM4C?Zanvg zhp+zR+#5f?_|<>7@XkM8`078Ld-H#sdHrXHFaPB5+3%mc_PZxfzq7yb+*aq@xVF_V ztt=Lo`-SCUVR>9$->huz*0v7nJEt3a=UN9B8vExks86+frn-4hUfwA5m-3x%uGz{o zTG{quzBei?t(V4I#nDE2d3#pXoF z^x`PdTZ(kXk?uyUzaJl-jttL+`iFteo~yiU%k(U_O*LMc?LM371?CRdf1qEN1p=)gatvf3#I-rG(a zZ7MyZ)R^SpUye^8i!}>s)&#I>24R;6;a@E<=`eA~nFsN&)hgAr;t~AAB3O~t0kpx1 zkN$%tfPc9{h$LFCS77_C%c1dmbm4$88Zsxs))*9FVljInaUUUXaISdfqiC8Twn+cd}ZkYnov8SoZ_3xt1}2n!-2nNlFt z37jcwXTNax(&Ei;?!Efig?Ik?>eqj9{r!Kw@!`MTc<-ODeeHi;{_5Xfc>OQWzwpDe zPk;N&`S(w5e|e{Oan#ryRMr;DtNqgIQfYNu-Q22e@6~q>n)_#4Coi@SFG0-{7aAwd zPqcrwv2&ujwpALAioIUG)y_3LxlX^>9~VdKm6fgX@@92)dsg+;-3O|%u|K1|#>TIy z`ubjNZMQ~DX}h|z4O?J;4E!WpmCad=YIYVzp+Lt|DO<@_-Gj>z|8_DzCPH0ySMpjYxqpLdL|hicon!&Gl z0j{2R6kHsZy*oO8o`L;;G`9*N3-%@5CE_cKN-(yYb;a z-Te4}-}>a=Zhic}pML*;U3>fQFTeIz=b!u2GdI3__WXyZcV9Wtzp_!=8J1ST|Dd$I zR9;!FZSK^z59+&z%@gNaCogqQU1=X)#-MfL61kqd)I2y}-#M(TZ5M~*VsA0u>P+|_ zj7#GU5D4~aYdf==g}<@3cSjq0^&{O4|9*XQzrl*UVe{y17|e{hH{e>|*sredRo9^1 z%Ia=;bq6Y~Y(vH6joi{&8t`{VvF=)8c#v8-msz`#T)Proz7QImayK?@`9)K_VhE%S zzL?Gv(PD3sGpJ$Q3XMT5m7|9q+Z8bd2t_bJ;U9(|#^S()f3kvx#!;hQ$*}(4 z=hg-NhA`rvkR=LX3V{D-`OjCv#YU{qPGp+VbS;u3{QH80f5#pE11NhE{yG1jLjIe} z*IUEx(#~N04B~&n|8S?-7!)V*kL~oQ`=>wMJ@s4Lr+#zm@EhwVK86WkMi`&@aQXCm zql0$_8?SWQmvX7)fGcm-I2A(W0`{syY_~0PUUtokO@4_jkTox?4hSW3smi1bS6zd% z#nW#tJ@=i{Z+w2`y}!Tl@xR>sqz(sP3Ip;gj8lV&z-&_)%&1`kQJ!3~04Z?(4>kgzSxzQEE})U~ z4cG`k_*as%5p0y*fqmi{w4#|UG>&f2Xq76pT%nMY7Ej{QNp1`HIa`?gNMs5i69BIG zkBfrvR=i&&!%i8wL5ocUYE(PjPMzPY5BLnBfC-5Ju8{~6PdYMrU$zn|)T7`(+lr-X zk$52(N&A9Pw=dxE`W(IhPEB!!f)oA|etRP1CH!ae_4cqkV)@@Z-(Nl58*Ddgi{(_V z5O!z%wu|37b>Y9BJo`KQ=YD(d?5EpjKixd{>E`)QH!uCx)}`OvJonMsiMPk2o3+wT zB2aVbyebKW63{&ON0T{Q^O@xzYXNf-3D(6B|Hx^JI^OiHT&kV@>hcTUIrHY{PqO^q z`ptiP_S66OHuTBA-uU`ITzUPk&)xp<;q^Z_dG6!Gz1I#$*LQ1s%f;1UY5D&u{_ik9 zg&c!f_%WV-sj~4<{`1Ql*`?L=;&P(1oLt;WEuBb=&qv3XqN9u9!C8Oz&{^GJa(z>* ztcB-ycr*^bn(?a4b{VZlS%Hsj_mQ@CUM#PtL!(!#|EYApB!h5vQ@i|JykJ?X=aVF_|fyUWHEv|8hCm z<1*deH8lY?_~HCN;h!^sS^lxI%`xtKT{K z)E}IJZhq&~jc=X2`rG^GKG@oMW!Sq=Nex5Rs7|3n8jnpz$M|g7f7bsKzl3!$^g_tQ za6i(e8G&KpEe0{F)XId(OJ;y*(fO>q7S zH7!@F2-h6{ARj*ou1Cg5;AbU+kq|q<3pNYp{J&f*q?DKfaK?hdY9{lG+eu8o>(Y8$ zS}*tyVdcvf&w4WDP@au{j{JWB{D1~Slh|@3YapLVhNAspDY1P`Ns}Zvegs^aY%Ux@#HFn(Wy_i1LF(T z^KY#^_pP(9{qV`R|LWSie|Mdjz`xx3#=qS9=%1eY+TUG#^{1zv`N8g$Z*8CcV0Y)` zo#EB>2KZn5<^10XfgTk9WZS*m;;1y*_(I~J>P(z7b@KQ7|Ht9KFy6=y*RuVUbZ-HkG((qNX78l^_0P@!XnFW{l_hF>-E zUwHnh31EEWbqW7mMvM^X4*w7H{}zQ-D^qYw05U$#KY_UT2hHtM{&5r`NdZ{?vG|2< zd<6f5e@e#jkGc+!#?Nr{V~|_`u%k6dp_st`0RCtE|1AG*r`8P<@EULyTqtIb<-DnK 
zIM;~hkK&(1F&NGGS^iyq68v~V5qBiyAp8ex;6EPpApR*J{%`cUL-hZgAXC8jM6b8j zs&vZ9H2BZ?ZBPBt<){Dn%FXXzx&0^CUi{PRuYP{xjh{aA=3hMf_D^4Y^YiCk`Qg(y zzkTuY$0rY7UK^ZmB|1TqODn8-y)7Vy*u|M2w3|8(p9UtE9dXBS`o{P5QIcQ60$_UW(hY`wTS zys}!~9~ak_isNB%X;dDsP2&IE(~X0RV7`0m8sdK>0vK~@-ot;hjrwP%)y?*nic6^f zkn+nk{^thwqyTqb9tHo?EPxx#@;?jtBgXgpe+2(falD19Y<6)u)g7gKtEs_uYIGt# zJ{=t%hI;$H*0!UxWKB2p!6fZ)t4t=P-YBPaQWcF*9&1PfY?~#gH*oy(r~G386JTcn z0zXPaxFQ5X{KJ_59JS6!;(x1M&p1i^Z?RJ*MrFh*fKIMN$&_?6k}dEMf`3+1{!#xV z{-3d`7@Nv!BO9{x1|`QoTl}2(|7kLU$p}LHBbQ5+N+by6zbXH$rQjk|NU&fh)oZXM ztzzK+8MVtnrU2AJolcF*qsMOdP{J9{`%)FGkS7YAWUd`g*P}_S|7HW>xHk|&7c>^a zPz!QL!gu(O1>C7)q}E```B#R}Xumy>4fvS!q_1De#{U*q{0A`t2VQz4&K0 zUithc$ba|eFTVG8ue|@aU-{Z!z4FFSo`2zwpL+UtFP!`8{`z&q}5l_Kvlhdb~iFaARL=Q3X|`Q_lxkGD&s?E?INZ=A+LSQk~$ z^=N-Pw747U?D$)o?%FC-=vh-WeIQPw)M7DG*bt@B%CR{dyTrwa{t5Fi0aA88^8egm z%0E{Z<@m=GFf{=!+U*!EYXVxpZ?>t8m|p&a`Ddp9#8<=wa7F%mG*gfQs8S}@sAUuW zEl7b;1tsy1%Zamqv^em}k5hmWu6RC>szh_mWTBG+|CvS%{Ks>FNYWPydx9ar?+Zsg z(TFo1vL}M}WWW)h@Sm^O+TF!fqyW9;0~CKc?QyHrC?^se|IhvC=@BG-dE?Fh{^CoY-F)`jS1-SJYWJCy=2jt;_P~<}gg9-0 z>v@>uzij-ofZ~r##z!?wK`9ac2a2}FZgTTx=fc~YPknmu?DtMS|0kzj`5abo{`LQk z>EOb1pB+B+y`2lcvwr%+jlHjIuHM|}o?EW$EEQG;h0%cHzp}Pd-Z&`loT{<>!}D{_ zpDX?}_AfQ|FWy!EndN`D2KevStU25@HLUHTVtKzR>$~M&QTM~28*k;Y5Vo|1CF4wY zlxh!?ouznpCEQ&Pb~k;^b$4yWQ5jisZF8cm^GCHH0Bxft1C9O|><1> zfHoT(Fa^O1xGaD(0d#3<^h%Re0~5erTH^Tuzg6)c$3Nm9E(>5y0PK_CAN(IVe-V}e z)N(26KY9)NoK-d}R|3)y{#`nU3rC|Hec0`lbjJ#TR4tZkr3&42zLNm|2!|4RltBIA zh&L4WMPh+y+!u?w5@BaDz}XezqEAj-LV`%8Oojn-dS8C+Ru94wVLmay`h9}Nqm zQE|LlUfU{f?i2p^&H?rI$tzIn1d5-8d;LOy5tY|x^;ouWRQ4p5Pg3?ALtD8Kf}gD%*Ra%H ziZ_Sx_A;uVkJqU|z{uNOQ!oNYQWcf!61f0Lt=h3@e2Ap$<-M;>a zEu3Coa-G1%QZ-3?U+h6_Z^Y8uq<&XaU zwGaOOwXgl`r?LV~tbj8L?j?y; z)+Q>h@0E^(@dE|OAFS2A;>vzuc`rZSg>s{v?9w)?t?XzsGh9dU2lfAGeHd#jB^u*c zee7+FU9}}=v1`xOY{|R{C-J#FI0Q;(v7*kXK%X(rO2%dqb_(Fu+^0UDi@I?owjjc} zf|JwLgvbJr3$rF5M;8QkSDDnr1gs?TnM?r|^d6&uQ>c;)AWANTG(_a+Hnc#aS( zViSs4NxJc^Y6~pLOdIrStwu?y<=DN89)1Z5nrp@3|DjAHp031_ zg=j1tibMn8KN<_f6M;m+8;`qUF-JV=h=(1in6H5S@6|@DyVM0o*SU+mGoU}@1+T67e9wp@crm8`L>(I5UL{Il^t=lnrF zS9lQ$$f7WJv>@Pz|L6FZU^z@F`qlX-r+^x0G28!x{1*o=QxHz3GMOm7Ud8e+`&Iep zY=k}bO`ukg_#f+EW&=6!!GX4ahYlv-avHGV(d{({!nQ~f{@)M&QT!?P2>;n;g5y7) z4kPk`3B=;T1mWKoPq^c8CuRZgpNx9*IRC0rZ*&IT{#v)c(e16Y8l7q(TSx>7AxA#Q zT>hO4SHE@n+V5Yw@!iX}{_x4?e{k)UAK!faC(nN6^XFgv>{-Bn^S7V8{NA|}FYPWr zIV^7EycxGjFB6F2_K)4o{{kjH%xI+ErIx96Dr{;p;CSAM-Cyw}2chz2tbIB?ypmgf zy1e#mb>-RW=vH-bqtv^S>s-jSP8Vvs)%>`Y?4j+s7OPidje5M*PAy^zSk13&6xOy& z8z)Mer^?%Bpwiaq^44iwi8fCcR!`rL_I`T~df4Xdq=dIBcPCC$$=b${?|AYUd)?yBR68XRoP@F*gFXnO1ALPRXu<=l; z_?7v`_D^#7Im9 z@XU9wJoiV}p8wvBXTEjqsc&Ao`0km*SN7MRTJD@|CVEkO#46WdCj%*U&c;7*MBrZ{ zMMo1>>18U7g2Mc0(VE;Qdz|r=TsUPkHx89HBaQuJ`&71lD%U=pZysiA`>E=7y116f z470IjHdsm{X$ci`;c_urt0mfm&arK~p=wfAhJ;;ytbBi0< z_ENUqO;?)9Qax2h=YKEN9c25XY=0Sh2Uz~s@`Lq!e;v%J5A6 zu`Gh+ALQQ)|6nr(=f6mTpGbW%QtyRpolv>uFVvmcsv}der3%(q#u|R6SoiY`x{WQzaH^kPNjpDli({$rMFbuyJoq*maRauWPQG{i1`A^t)CCsF`Bdx$)TXz~yi zrQU$x2hJb8(=u$}Li|t8IYHmo-N8R%en=q~!$w#$K{`y50`7|YI2T>R_CFv5LJEu! zh(uY~p@0K_ENC>+c>|UZ;+aAq&Bi~a#Vq{)5&!X2E|y3|;)!q~8BSq$Y$}i>{JWEJ zS2F5C{Ez3~mCMy;W3kg&?sQgL&0(|LsO2){SfCVOiav8`r&ivpHBWW=7YC!OORG2^Qe9;RDM>BeHJQco32$pYE! 
z7%P-wXJS5kr`2))r>Rm?@5c zzh}T@yW7*SX9EA?dZM=$>n=x|!*IPDth9W^s<%*bXLGJp#+688vxg-ZGJ3pt%!3A@ zHn!GJ>i?7!PYmMLzu0vyuKo}5xfEbJ3*5s$$mdLejsNfPkK+GP{;NRy{gi(i=a4Hz zNBJ+e|Cg8mo?C=1!`MNCU6VMXM61Q$SB^s)06z!zlz*_z;mzf|v-|^i&LX%gz7ft} zgL6Ev1x6z$Xa3;W6FlCCJne{_f@rhf;UDoo{C~Nhg9#A+Yw=7uk;)VP6G@`vL-?=2 z|AYVbxZPf9)d$UTy_U_CV*XOlTJoDBt#quLP7U(;^mdT&D2oD=Y>1>{O`j9BB*H zp8MfXa{PBzB8@?)*729>o_xub%{h`u2evtdqf8)ZV&TWreJGs$ha)G@98YUxDhdTe zEUBXVjSDpqZE|jZIuT4v;BNm9{wMWMwN#^rw@2#^mcVFk{maBCQ3wM6YPFa`f~*v2 zlsLCsgvXx31Q7pY|0521#c^ji@sgZ-sTFH*<{=LMp{1zE;?UY7__<>~iF=>Ie-Hnh zrQrRr0&YgY9R&Z#e>wgQTA4|Q6V%Dm?zmHM(Pifen1iV2u#rS!e+f&t2E0M28moNmZ}8U1%m^(x%eL|zr^`Z_@76MBH>?#-;7MH zQc@I6X|+EJ|zWClHU1QQbg|2XhD!0}&?6_MKw^5DPFN$2XxYz0j`@ni;5Kr~78;P|Ifsx|Aq zR&%M<7`1ABEPmC&f5cx2F_oZMZ!_raMqG7_UTf8AEE?KO(-xXyXsu1Fv*RY5vx-6m zS|?X&BuezkD$%-)fAkii7VJvLK0$B3K z+D9yobHGC3csLXd2Eu-S$m!)>;^4Cw!v zv;bih_#yn`aR3xKUjkK10oh!P=6AN66b(g6Y#Anpzmfel*lR=fBV)%6IS7s%^~v=- za92bE#2L!5On_jCh=m`=K1hhLHv#S0h?X@9bQ5WH2*&X9U`mps@5#c@IQHLkhMp59Un7}a`OrK9DTht%j4!D+l=zD$Hoi~@y7N=cPEl?pxH zdKGO@(R#TW=N-{f{9nF@f6fH%;U8;`EdOYcg`bBBsA(vhIN>iv z3P(6y%4K#1+}>y)n8ZQ7DPjWoR5%+CrNV)j-ycTfw<{2EhOp~_?Evuyac+;>>vws4 zPRQqX`GP)gBcfh}NyI8qvRF{?frn~Bo>wG^x5;wq{=1peM~yo)_w z<0yh3@ZVSrRGPkG*^|vU;xQ(I=Q?;zWYrTV-7{G8B4=_LjI>^_)@hVT5mm@Av32|o z|92cd+j2Sca>75tendXR^J`Taom!8>!SKLPrxE<)Abq=AXLD)E78pha-pB@Fl^BBa zpZb4Vfm6QuNQbd5ra&tlx=FFrrN9yoVV+a`Ei{uO8LNQz{BSwmp5hG!(8etMFq7kpP6F9Mi4VF@keyTZ0GzZDnFxgs4wP(~B zqlaxO7|29D(%zr!EhoBTR@m*rZur8QFM^-eI8<8E>fgsb2er`?}Bcm}9s5SV1^2PXn%mv)Mz+ums1G#E3-B3oG(dMzcLOx$S984!d z*%YiGl!*nCAz#Gj3Bb?WJzjDGyw_**crmuSJvPXN1F>-?ybI6AVQ@GeY{chxphX;^ z5dx$@)E_`hmB8n)2g+sAI3o|oB(a82f-PWbh#+M^;#_IOtE~i1?5TAUY~qk06U1Vw z33Hf;SVU)Os#Irmq;aw{=9ovRKhb94Z!M8E-!_Urqfo6EC^dY!f;*YEMIzRK&y16v zSm&>?S_u4z{B@WJ;kywWJb-_4uIBWB!5MRy@(<^)P~n6t!awK#EjGQ~Wpa8fPCPu( zYk&!mDZoxTOv%~Hfd6j(HA);MBw+c+Su3;rPxjX!%4O#Qvf~T`EPXyO{z3B(*#Usi z2pw^x?FS`L@@WwFySYKw)XkkJ9XPW<_{{fqS)73$-IxLnK3$TS; zJ)I$a%<)7vp3KIk{3ny{Y5emD{;NS}C4eFWLpHNvKc@}n0@$X~+8LwEX7b>(72_)iyRFWL7n8{#LfGtD{g=n!9 zD^=p9TB6cOB3VGnP$9N36?dvtqP`e!^f5Ke3PzD=FC|(dD1Mig;-u`q6siq^Ly5Rmo)073ZIDX}{pU6R-+%RWO98^=js<<#3hU_csvxZnr= z9}y;g)%cz3j5Pc|f*Dl~c(CsZM{2CRz_Y4q?*O5B04JDbuVzaZPR;s22W=o!7$#W-jnE*$*s2K*MVO`e0p!S6IVo#e5w4)E{b4$#xP-3DR;J_}|FA0C>3 zgF=JI4%}o$K$?uiITj8kVxeR_$QeR56UK3cI14e4a#I7^We$M$nh<`Cj zi?KZ!KQIgyr&Wk>cq|V&1%*PB3P+OSt^WR+T)HwcAjcBkQV(LLWZPWJWRl63wU7yR-e=2u^XLcgWaGr(Rfrb!oU0dKlmq2P1Bv% zDm)O1oqvzuhY6qxfkX`Y|CoEP@JP=pTlnh008ZWRsGOzBIp-Wyr3z9-l`5)oR4M1& zlDgG7=ip8@&a@-g#tCC=3>d=#OP(?KE>P&c*SwpYJJEm+h9c z_I~%f_u6aYMQsC$5^x82$M(Io+x8Il2%fuY2<}8(OKk=Ja-iM`u~0PB6sj%b5vPNOtJ1O6P#-mM zG-fUgzM$>q!54H^t{&XZFEF;&&LjwLCWcH)5lXwb{5CoR-vg2Px3%tRMuGlQ@gMom z_TAfwBLItp9(fS*IRwDI82MF?AbP%cYyK$Ipt@2`f$E6jX9s=`WE=sF)#RtOa}VNw z75_+n2p6dIzy62#kH5x!e}Ve}fB*aUzpjY>5bZm3uq$3QYDE&YP%MWDBxwG~ zAgxLh3jc(Exqz>bYcK*}G+1FKypR+1A03;nV>IdL^);QXwOws>r1l0%M-%oZ(a9|g zax0VC&SrFSSzseTRH%)C)nYgWS{geW>ziv3qH*!5>Ob69|EvUmRF5huG!VoF_$xpB zN2~F_9>U-#zLA?rv^FZMi^hhv(K&=sSuG@bV;iX!y_mgFq(s3R`A;iKpluxtGMU4q z3AuDIB1A}3*i;S1YETqLD+Txm?(fR!5EscWHKO4Q{3G~R4Zj6sLHMoVo`>Q;^iSGJ#N`*sKY@Rg zfSPtgga;algbP&rzvzGAMP>QI=kp(oux!bgV3-T@?Dzh`S43*`d8^1>LGTlmTg)#L z`mx~80RPzO0M&P_xq;|3pe7MU|q0f2~c9{9oOpP>JVl zf*ePs7eMSX#s~no>W^+mv3EC8nn%$-XzHw`V2WQwk(!xm2T$t~={*voTV(bKO>UvF zm#4O}#2P9~)ZR|5t#8|1*RrPxtL>?s9Sj;j=hA2RXa8+?!g@s z#QXX^+v~7JtAbyviW;37M=6Ojj_ z4P+?!Q5065$uD)p)!u@!zib&8=?M>8BIWMTkRg!Qxf80Meu>U5;H#-*4g|j&Td8fG zOllXKNdXZ^#3%4C(3Y*zUBe#>V2;(mPan)^YZ)`)ow1aR56j3_cYX}Z)8x|XRN4w?v9S~ymO{%?>Z(Q@3ThKeZDOj;Ou`GSglW5(Dia8wRI!50yNmyN 
z{1sa&J+FKC$NiVUe{C(QWEK1qRrk+B{R`ju4F2zV!GpEfXWg=c&~T_+Eh|463|ZE; zH`g{o9cAx!@DQ-iaqsTCF7Rdeuf+c^K7astpdC*fPg~E=;h&gXhC2ZNXeASU-YWhf zenqN>>@$Z(=-=Ltf3cD&Gw_wbKa&4X<6oi?OMrZJ75_Z3j8(xuRluPD|1yIAsnlwL zf0F^G)#_wYosg$tgWb@gr8P1PB9>9a>6Y-UQlVWgaw;T!Dw$g?b7_=5t=g~E_*80_ zT-GDx>sVA#R~x;dws|Kuy&{o&@atcL;4@@@h;c})z##H}i~v;dk9oHpDE@4#Z`;F= zP}FvbEu``0OwkE@ddX8*A1H5zMm7T@2mC{8uKbcSIcXgz8v9~ueVI+L4rUiv za1=fwzX*!}Xc7VKOYlz|0UZrpq$Vl_b;K4LrHMvvq!E=LkjRk#*AU^Z9ivormD@oh zYDB>Q{Vsrmpy~x^pTV}g07+5R5$I?iusL)N1}(TW==S4p1ry*tB4mifOtFM1mNF%B zrc}w0AylcEGA&cCWx

rqaMr8i0AG23|lILusTU`4=llJYhQoo*;AcY5b%9Nz{KT zM?f|HBZgNSPhSXlyh#+??%7uG-+HI=fd}^1K8(E;Ev<|1zl_ zR{)7rLzs++|6hiGy3uAT|TW)rwTd*S9^gWBgqm5lB|3|xE`|JZ`|;GUXoUG!#& ziPsa<`t#=GY+rHRH*q35cOkQUHNWqAdi82z{z7p2gnwkyon3T;J$%j3^O< zB7$PV4lEe`;v)d>Z($xUww$H<`RbqU*!gg6{dUalwxKLWX~I$oCbNyrYG*O9Hl?K# zx`>T?+ZsSce6(TL_WDP+)ouqf`HS%neX}iiZvnmvdr1B4-1%rl@B}$9VfX;>Uv+^h zVdMWg{$*OJOe6h5{L6(rrCb5j6ZqE|jq1Dj*D%Rd{7d`Q@&UCnq)~=dnwUl#SL+gL zQ(9{+7#-#AzL9R%u&J+Pu%$G{kWA|2v9(mF`tNE07mJwu1q+vufH zND;Sxy|(dDwwR=GOI#UKY}z?|aA5juboFL-^O^F|7sgJ!JbdEC;-P0V8@J<&mqVjR zec6@1{t=UTz$h?CX>vYIDq_fFOt}I^mrLkk;THZoNbT(%&Fvk;A#NxVbof9}EFqTQWc^)zfqA!cK{Bm$CLM3D(8B2Fh{^JyLRl$P3# zhTScDA8mqs*`trtf(}8r0H)N5L6xoie+&P>{1(*l0=8=xCiQnzU7-5TmJ1;M1NqgN zWxT+bI7g8GfPF+vKDOYxe>+t^eIhSPRDuniIr zZ~+L>K#m5_@K20j5G?3NzXooJNKG?Glmlb7{JL-Ed~Egc^r1ULr(T=5`0nhL z?@wQTZ}QxCM^3y_-2YT+_EKc{XdpUo?@9ECtSY8POjCmVtYoRcPf#-C7=acc_Xee> zi$<>EzoVmxLV-|rn^M@NmovK6T)SRq*NS@7f*vKmTh7r-;S2C2vKYT&S~Hc@K*H^q z=y){1eX4C@Jhu*lPITPRbiU6E?z#ZBfDx;~u$7ELqE`$rm^7>>Bjd@&C7nUWiNNO~ zVWSEKbg_gXmNCR~hD1S!7f96%xrPD!!wV|-ujW5wzPOXk#r?MllYYqI+R*cB#S<=2 zS2+&g1vPaTqJYLMWY+GY8j$`%t496`IgQxH4g>6A_;>f#pu1W_>Zs!|+9U#sR>ths z@Ek^AuUXh<5_YTDa$cu|)y^R`(GeUPcDL2+Y})f^E!J-0I{Z2OSMwhP_N{0Sr~~s= z@K^D_j#Dc#&1(zf6fP0P8dOS6A^5@M{bv zmD&LOb2(b(-TcQ~vRa0%wy_@jgvB-vwR3&ntkJh*2_JAI4!hFF+^Hjd$^FjAVvl#i zXiY0LelbT!ZJ_LVU^~h_m;%_sKT-cABv9_+|Lc$J__NlIT{0u1C#f1-=p8)~TDg@z zc&Bjkjgd?5&))oa<(q%L^2E=UuYWXm>AlGl-yU9jGCOr~FtO%#WcwseJqv)=D47}! zTdifQ5keK12m}79999>d(n%)bKepX57#)aXMmg1K;Crnyzg6Ke$=rHrpH|$f71>lg zi;`=QGSy z5+zk2A#wTb3}!2Z)=DC^pg4tZsk5`Gqoc9C9lk&u2le2J?*xWFe*}ES3xNO6fFF)i zTerIr+b3Hauq24jYB#F5K8Gd|u;jzGv1H#=#yy+!O{P7AF0ENYm9g4*ltyaX-gc~9 z2LpM}!?m~$e+mAp!5=uU8eos*WU#6idsfSqmu$Y3p3r{B;1O5mw5M>szkG3^e7?VQ%9lOljxO6>BL-_ijfurpR_%j32o>lG z{)rM$wfS4c|3esp+xE4GAN@*8$4p^{NF z7$g!4M(JV^Ex@A-; z3;Pr6y4rSO;Y$PbQug2$fGI`fKcA_85<)jsu(A{;Do;KmPHj)p8Ip4;T2_ z*RU-JQfUNV`hUPbN}wF*|9@%zQ^mj1ppa{2Ql$v@U!nzkAO4j>z8VsU7y$tIRXV*) zp%;mOe;tFOqY?EV*SyQW;ti~M2R7WH&A#a2zT`=F;X+{iT5#%Sc=~pD@@8oKYOr+1 zpFZdc&RLuVt=x^8Y}2;An6>`ugWvcX3ZV~w1Cjr$kX<5>kMb8fAddk5U#)A~E;7>W zIaOxeGkYa|_)hWM+haF=GWYaft-knAhhF*Bp;!O)@C*OAi7#*bWd7J|BXc)%vGt%O z(Ij`8krq-FI!tSFKyU@kPsrzrg)k0VUw5Pchl$J*H z__!>RPALj{?ZsrEoN$e$y%Q-9pgxuCof@>wr+PQb12;DcuU}kv=16HY0M!79$2L-l zENEAA4VnTd1YtDz!Aky*A>X@|e`41Lrg|~_xC6^i;eANAaP!@^dl#}G+yXEcy!{^A zpWr-i-G8^P0Kh-~Md1HW^-rSyPvrkBNNo73I2mzYgcOR zDz#OivM7{hh0>%@7!@*|93@b(LLjQfe;&l&y2Kosg3m)`NT*O6a0DndI4JZYftE|t zvO0Bi^nV-ttKQK5K;&S5?1(>p%$qvp&7b#=JQkjRJi7e!;L@}4*{7mYkB3Ju4-`-O z;%g4?s6pK?pvYPutpm#c?4fUb9ov2%A(%g3A-*KW-w2Ke!2;UxwVIagd;_H?D^IWY z&0QTl@?z=2_r@Rp`QmfGTzlnT4!!o9gRlMDk(dAZ&~tycdh3(f6W<-4y_t`#N6l%M zz@g`wp@LH7=M(Xt2K>uqRI!*W;B~TqT?A-?e`X~fKaldv2xjsIW?;Zm{f zN`Xbjx55R`%8+r565IvY1|?UgU@Jv*xBx0eJT_qny+~Mi5RehDxTtZW;E%FD3v)E! 
zPIm(QUBExR3m~sRokj)DSxkVFfp!2;%HuFV970n74KltE1|}{SOIH;VzEUnwDFiB| zSgVyAO=^o(W9?B}dNe(DEnyCw&1rD-8GGG$p|Y4%W}V8c*LR!r7PHo9MA@1z7K6Fn zN$qNCZ*FXEsB5SL$ZKo&*4Ne4*Vi^Q)HUL8XsD~lZq%lxrk3W`_O^~L@L&l}tdLJP zYQ=#*Qzq1x8E}jyeKUFALNTy!IJhwqIy@0OJ~eo1CV6%y@xrl@H_wf}ai)BJ+V56T z#Pn7s=m1zCj%k#=kDw5`8<`a`x(nV#^&gb?D}>K`UADXD5h~vIS#`(@qmG#Ny}t!e zt^W}C$NblSkAKRi@DKivwnwhBD|L32rbnr^DplP|xPbgF{$*mNkk~FOLcbK^Z=Dhj zRl(WV$=j^QXviXREdC&4Z|3t6p7hl5mzMU z3Nc2_XTcZH>|(L0bQapdUGM@Rp9ZEVkHZuSSRx@k+r87wx7 z&*vy*BC}TM>M_Rr_Jpq|KVX|m`xc9#LsPM%(}TyR2Tx4LPtU|JE@W=5m7YI3`Rd8B zZyztdb#CP8gV}(AEoXPKI-98wWUGC&elL0f7!4+7Rqn?>QQx@NdahnWh}$>Zv4n{ize7?3qa6e57tvzdVpT(ifOD8)IU&zH2vjfjvxgu`Brx@c-2Zuou1(|MC4|9U+1L=50JJ z*^&^aRvmMfBS&AzUwU`!iBIOA|C@cU{LAL+|9 zh^|Kr2`A5{<(Tf`pV)1!Ch*UM3&}gs*Fz&b4X$?8Tgms9qh-(^aO%8X3O|2 zk$?>^5E7~aEU1dHd2|?N(AZK5BDqwrSDL$ZJ$7@i+v4(d_xZc~0v7jx#T_ua{6r*o z^lI(BTAQ;H;q7{RFJRy8@>@Lvc7NE>AL$E3+~HXNUzV1SQ&s4l`IqyfnKQ%LWVL5$qF@0qz^Tg)RODCsaJ~{Qu$>|pl zmtH$oc=J^8*+cn&o+W2@u)CURjl0|Gci;+uUKlZd@Hzb7Uwyq>fF(v+h-hD(g0DU? z4~wk0iv0fv@t-;|P&gkQx*Qw178|<}nR+6!@a*8)OX(x8<&M8qJn?S%$Xmq&-^ndK zmza7im^-C@JiY6Y`JqtnQ> zDp{|++heoXRZ6E?9mUFt z>|VgXs{%?4gI5$mf&6OPQ+9R&I9Px(Kaj9Wk>xDZ@+J6}LT zAz7;BDj{rXm1%8?yZMhrsWK}SCL;fl>lHGsLZT9hBw%S#|CImmIBJm;`HxHG)$pTha@$hEQiw1KFk|_{pfAs4W{NsBexDXEk|JD2NUHoqYfvWitma5$x{Of_||9A59({#0ucgwj=KEtf9g_IJ;C~lQCSs|< zWG)p;#KO_gK(OEAarN~%fqcCIv=*JgfT%Au=~Zq=x65WU$^~wnBIYm+1)S3{_fpDp zpxA$GBzR>p^Vmw}+EU`N#l({v`InDRymVyv#e+jHZWf>47ys6B=&j?~+sk3U7FEzz zHmQ-`vZsTnf^M(hv#kar2xvfq_FLh7ej)yG91w{R?gjU(3Vs6r4}X^V@PCMZlt4M` zb{-%5PwDsYFTEfCNd5`@o0LkU3RRxZ;9tz4D0m!|P*TOe?u+nWz7n3im6&}pwfJlb zdCvMP$%C(Fj=qyW^;UT)+A8;^qqz>BDhD(#h@7p#FpW|4;E>N7O_Y*c~hXho4}XiIhSxm{k9xa`)AIb@?Y(U?my(yuT04XHkT%ntl(0^MqlQRURi ztty#OCR7QTQVy9%>tKNe-QGwip+QVssaqOq>i6tutlbUWeYgXPOGt!NDyhz>?M7Vf zbK3j*Y))s76Hk}D*W>o~d;9^nf507x_`-?)IACA!Cx!x<;b3++QkaOBX9B4aZ@e5W z%p`}GV5y1K)a2^m$l^e5GBPw9Ezd`X=VRsh!J*l~;$$p8Hjpg(qA6cA?u!ft6R8UR z<8Xn2aKPhpyWD+<|7NpM1JtWkYK>Yd6&dv^r`2H5$&5;Y!ypUWwS|CfEZV!8A2>dl zxVV_Twp_TfkiNc>y|FKQdo}y~q2X7KPu$rYdTJ%}#QflsGXu}g`rbH_y16hgpr@(% z9XwhK1KJ~?VOW0vNQm<~zYzZ#tEzhcRhX2D`|vM7l^gn>tPVo|;eW#a_>IB2 zCo{{>XV+fJ?f-Um^R?`ew=yTbmplD_;oL_A*wp70|IQ@8k_N3Lns5Yc{ z?C$m&^7gN9|6kvz;Qs*v|4(})OKc93%MO5ZkT>(qm$a(8ZdLat$r^QxxpZ9#A0z&gaiT( zV5U;Zk#C#KMynMWhz;g+I9x7=+w1WM!@*c;U@+G|SR6=>gtOz(!enxIey}v>iw_4< z6N9D2mg|`rQGq%j>fAbQ?_ur2PtoOd=7Br4q43Y%pl7W}QsHf^uq)PU`DXCp`LmpnD|T zGneSwKNL7QIe2M4bA6?FZ8>{uU*Va9BhMZje);(13x`LaTFKs?jolb^KR)VsWg~K9 z)*tI;o8>eakHjN4QXs?v$)H;B;wn1lJBSU|6++J!LxNHQR1Jn}!_A1+X#Kz|wjH5N5eAf;KQG^)4a{}%p5Y_goo{_o@e^zGR4 zv)PT8^M_t59C@>F^zGd7@8wUuUpW24;<+Ce&;Gc0{Da)#w^OSxMkjCj(s&^T!0T7Z8oM^7}rSZX~}qhsq~ zX13PEQt4?D8HvXyb2t^5s(FAfB|+uhe6429#VP&^li7elxaE&mBR+-j*raWq*toM9Fv^qv#1gt^K{z3RPZn5 ze9JlSdZB+K?_bY&4i@^4mqS+j(B$K*g=aU0?;IFp#PP(q^Oeo` zl^I{s&g<1ObrOn%+04XLUCZ9q`kf6RJnpXOd~Sb;m_8uL|HKv}Lh~QP?1bDOZhQ~! 
z0MTeC6hxpYva7CkcRhxVG4YK}D0_$72{A2kznqM$Xw4wfk z0lkNT{~Zr)+x@_vhKK0lW_^GinK1$Xho4Dadb{-W&!=Ag`=#&xX8n8rx&QlrIP~7{ zkG%8SLvQ?g<6D2ddj8GX^2tF{)XFn!xF$WvU}WozOpSq|(Q))fj={n)^sqHXx>P}d zlk-r1XR`%to{%M%v79DxzTYt9QI`AUBOcw5%TV%Jb3R+jZOQo^DUU7WF!oz?ev8(r zlbPf~gH&i#pkT}s<39p=8>yqCy}7Be9@f^{+)!V;dl&SLccIeW+=Nl$Ru<&(WFmte z_=ld7%F=CcyBw}Qhr`~3gv9Q2_ImxkP;4NPjbuxq+(@K6n;u_?m&S3`#hbxY5pPDz z(}SZ6gQLrXWBU^08|kUd3~YKo-iZ{a{J1Qnhhn**WWG=+!Lpf5EE)Cn_nAC)y#eGF z4v9@^W^^@BJ8IF`lH)FeyE&I)QwxI@WqH81R*GC$9y+&>yRcZeyjs4tZ{*tQ@VV*q zsqy%c;=rkL==?E+Bb3xnSrb38NadG%=i%%msp76q+byPBrr zxACbJI#AQDmiisFkUicB!ViKQj`&Bw{lWZMh2souE28X=oQF`&*}DU<$I6)%9e^m#Tz~+8U_+>;fyh6O)gmdIq%to3GiFdM3G{ihmBbQwVlB z3d}}{+DgoS5&f?or3&?bt4h(WloPjLr9!Vj3rr;8VpfRPzxkFi2|}e zX#N^y(9PoMfPZG2hFY(o)~!5~JorlC*qfzO-z%Q}LE+2?g)<)(&wMm=`bYV*Kgpf> zD0|}jRs2V$Zum0<{#Cj-2l(IH1qc2ravl(&A0n7PpU3|{^yY6izW@6JKlrc1@BiWGd%rvM*1v7M^vks?@6L~(O`4-U_v4>| zoCg#joq=nxat#o6Oa#UY^bZNg=pQ* z+L}FewKa_mH8g6MQXv8U%_bdOz}92-`&=Hkv)2jycXwM%Jr27!7z)Od1BsMxFdfK_ zB!;I@*a;>w&i;_iAND7U!NPd3G!-76kCYb&NB3oCHgj_aQ&a2V@~kIbbOz&rWDdbU zmCNNzxrxc)(UDv(6^#1rZktXcWeZuI)b?7W<=|?nr5qXh7c5e@f){pZM?#MEQt0SJ z;>2|F&`9{`SnTw4>hyH_!hHVP^3bJ)!u6G*8%u?Y)2VZlspCW83zLZ(bLnSSa^IW@ zKQZd~)`I_)Be_Eb{Lq9^JKt?&8zfy45Tv^rDJ`G_?IMl>0)?F~A4R~23vA&ZuR-B}%Mkc)*@KZKN>?2NJAbM5 z|7ZAD0sl%1TtM}O_!ls{q|kZei?m|7PO8vhVpuBC3%ObjS{&}?_!DX6S5dR+3sfAD{{vbbdy7Qa5qZedwv!<+t)re=_;XKPMw#EJK>eprFSo12Fo#y=(5oGK zja8#Es}(w>OfClPfK8>4Iy>4B?SXkje&E0A0?=zjB7`hRAr}Mrz`xzr4de$0a2<8@ z_4PQNR%c(2%iHG}@cYA|KrHGXOn3&fkz6@h9FC^4z5T(y;NSof`@(o=XgXM$36_kau;T@C(EIeCI79t_%~<6w@2(x&A4AYmGWv zq!F6-wm?A(DgBN|5$!9aeuB}4{0EGPs)ie$mCOe+R(qNnacB}QP{lvq0SynVB%r;D z|E=|W@Cf3_sNV_MpNDc*8sRRB&f%xy^zq$vffLdi$D=cb-N%fid&p}y_-XUSp z6kMTNh$$JlmKYZnYXxi#iv;DXD*g|?l0Es>(77L!FMc$1@h7E=KPg;*eOx^M)5671 z3KxHpKl@?k*jtG;0{`C3aSQNoOyd6AR!0RXgup*Rbs$^-%ir(jKaW5Ma>qji{_7r~ z^6PbedSpU7alG%~lhMm>=AQXv{MCP0eCOZSfAC+MAO7#7AN=vy`@i3O_qS`W|I7O0 zKU$o>nli`!l>b!mPg7}_S_1z9vz@2!W+~K6kZPn7p#t1*h1RIDA-C~b)8XEsNN?WX zJ>*B-+Y#&04s@$x4g>1{KBLlYRQOD4U$>^e$LQc$q@*g0-28z^m7(OJjpysA}I*kPU_gGDc{H{Jbz#okacs)+9&*Aksqk{vnL}(z= z@AA3a0gpEvbohh)vBY3H7a{7U31l_#(nPd0=}!-P62rdqcyeqdH?vWk-=CgZOped^ z2h+WQa4eHeWYXzuVq&6H%15WhGSkB;v~is#xlIo}-!@9iZZ5T1%}eX+1V`eAH@IbtHZ$m6C0yf=5wbf60oxq@zXQ(}yK^9UV#KkMwUmRZm{a7|F^po0l-a^VEflL*n)bHU?`_z-2YKOM zkUTNug0ldN&bJX7hE)Up5&yUF4=(`zkpjW0@&7aUC*(=7|7Z`?&I#Eoi~wLp1K{7n zKkfp^|Gyyrai}ylfd4N3l@>Lwz+Z@e5wcc}0Qd(BLMw(`gh=Bv z7d|Xs{>jj#j{*4X`H!;~ewv3}{3L(zC%JPUrcb_|-2Xyk?xsI~!WvlAm{NRRPiGTz z_d`3YHJ@)hLg4>T`48}qZ9BmKuKEWl{5rj#9-YvPA9WsjGJNT~nP+}F_Uhlyzx|uF z5B{+Kqdy-0=#NJ}_``wk|8Cz~|GNIv$BT>C5~he5`47*e=NgTyO8!q(s&NGX{)OER z$o@l|jjL2a*g~R4Nmr#cYwdm3P{@-W^cM%c!!h4z#FKS*CwonUP9t0(U{(fubYZ6{ z>a>I$rhv`pH0XLXYMV}LhcrBj=o&Q^B}>rm;;=C0f>QvA5L^Jrhwa(DtFxm8_g~<@ zuh%vZK%(!8MT0Pp$KmrkgCTD=mq?~#(ZP_{*XQy$Tt2tO+2;?1!m*e)FyIY@qnSc> zbS6KwC!v)PJe9$PNnULQR(l{#Dso*s){n2cSS4Bws&1OLx2_CLQdcyiRazi69@8_ENk zm|f`6vrQ6`oYlgoG%!2rXl)H$P=NyJsBRCEcA&kw{Gtl}O4(;Sg87yU0Q^<_L&R|_ z|3TW1;7?E`D>`Y&ruIPp>Ms8G0{=8JfqxdcLWHAYtPl9_g0P`f%~u*FDvJ#Hzs{-F z+SLgD)%dT(LEf!a8k90n;G{fo50U?~X;=Z=v4wwh!8Fi{fbuPmrAGX}m;W4lBYW=q zrAt35U;Sz6$|t!?Kh2!~ICK7|nF}9hFa9)p@yFS7AEZvdlRWrRbm{Rx`LsQ ze_Q?V_xpeJ$HPDRo2nAWa{PN1}K$-0$=CIeQ)UUid{Y90|ESZbzRx5K9y$ z=SLPd5+g)is5G~KusoLq}CUUW4T}| z=Y<8qm|WJD`vJS`DCA2lJgoC^ZlD+gB#`0q0z|kiNWLJv18-0L!;3nqfwj! 
z7ZwT^7Ydgb3TLNMXQvV;$D>!5i`Umi&rGKemjV~31}}{dT%YWJYTEPa{=p}ggJ;Hj zk570GjJQ^d&dI^wNglG7}rH*qm!+SNpEt#2g+ijW`P-Tzhb0yN0BSE`+l zAf8uU00#jKuL%*ynw<##=oH+4|HYSx|Gr>4dOt_5OQ5{x!C$3ushU zjoPC56aIO?KbcMKfTD?zL6NcfDuD!5P&KrI1cD0w{}lgEzMZ@H;n3BO%a8rMbnVXz zSAU+p__GY`(kI!=aEFgG7d}j%c{g?FmFUWofzfl$!HsTvNiKCUx&%!-u?u4x%2|Y* z*QfE1;l~Fa-ubm1wcj9fYjhq;WK=nJxM%Zr;L;n(r#~Kk_3vih`SnT#|3`lE{~i6& z|32`+?^oab_3DeiSY5l7G(}7tqnc|(!4s8F0{=9X8u`Bf`%G|mgBG<*C$oSBC*~N0Of{cjkcfeQ zn@-;YT@0<>q}5|YPAOL^?-~Q6xXW ze<;-N^LxXgU@Dc2M#9lZC_Wg8gaSUV+wE~hBf-H~BrxE&_x3uyf%Ndy$m0I|^lD*d z-}L&4^!RdqdSzm5|LE#UW^yc^&!!96(r|ugD3i;C3TglToy^+S<3D(E4eY3el+UX7#%#lR64&pd}U+&(rWqqV)5*J?%YBi`Tyye^x3&AqWxyR z|HNqQ+*IRY1iyVCaedlzVcK(g+JAgHaBM1g zWGc8m?w>35=0jRvH@90sQ}8+j^i~!WL|bcGTJ|Dn*VdxWU)_QP!gF;O80!DH2;4mg za7^Gtz?WO#2mY&8?U&(w{3%MH)%^d<@Xr^~B`TiWAXZwW8pqxE5BzIv8W_siYAiL? z-;MtSeUV9GQQJ8n!P3c67Ej5;!Y`2ua?#Ki;WAasP8IEL{(t)2?4^%}9{Z$x^B3it ze_6Wzi^A2v$X)sK{FR?qo^XL5WzWB#Irds&?V0f8C3ogvk9$<3>F3a8t-Bk+h$R$_ zDlPyoz_*6_XJz3I`oBBAv9snuGN)GSCWl89qlYT^e?3Lu|Ch7x{CeLHe}CY|f4mR> zzgd0huUFS@$MxaL@Jj{%3^HYkb*Gm@f&>slgDTAdh9;0Co~XBL}TfABA3dP z0G>iN6VJv5lKs8m9(~$l&HEiOhtZ{xc1t+Wbg_X8s=zpm#A(pjwCWz6#*F@nPG{2T zHEOK_YmpTS^i@S75xA6XZLO&M?B0bySBtAL_+}s;#A4yWcsL%9BvXUPc9MX7G!_hj zr;rSW2JivD-y00VDLgKhGZ^$IlQDd<*VSvZSYqky?7ofhrG2I8`T6~ahi8{YW*27m z?HgZOC{2y#hw|f7qa&k*smWqC6&TKkmZmc^!_o0vaCI^_U5tkN^kzBBAfX#XGz|45 zJ;uetz=?(8sm0Q{<)Ksaxnq;@lheubONDFeW0zOTkFAeBzBzSvCVi+F*q`z2OY|NY z>OVCRyRulgwp_Z7gJ3ZO{GS`@dwwzW;#%z5r0d#T;NpDn)O6tJxc~U%z=1K}@~~@p z#5<98B)mGi9vu@BpWaMEU!c9N6$=U=;)i_=n6gA^XlG?epP+zNh0Td_C=4S1;Q6C# zKrCiL*`MGJK|u(8B0?`0;{VXMs|Wree%Mw+g6ta^_uqOZVMPCz$bZ-v0i%ft{EG^X2Nc*5OA0A30 zgk}=SSRfbm4Mp9l;D1glpLCc7*t3Iyn#x4yNL<^k6KR2!&9f?~lZy zfncE5<#hDgJ>Fh#zuSeLZ8(rf6Xi@)@VhO#czj^}!1ChyTxq;GHZ!uYZ*FdJYIS{S zd~$eddTe5orXlhcE@4vyVCIR4oF$(skJo;W;n300u!cAd(mQ}$@(CY4kxg%Bu2?zv17htkd?wF+pRLNqP~T%~}o z;_=iRB>z+uy-SVZFG{VBQgia1leef|7rx_zwwKa>whtF?H5DW{-XHU zUzDzXQoQ(K{`ecIjpt)CkM)<1Il?nqYm6s0b~dzQ?e|xRaxT892S1H}tVP)I;Evr7 zchc+Cz2wlaa&*(ac`Hod|Hosm{LTE^zgqj?x0@gR{^&=4IQ9V=VgI@Bo!_j#_%~}C zPvz7h6Tr_fXqkGDQM4$2atu02Pq(JeX7ctLdTly~)#B;(Alc3&6Qz7+sE`}VgC`kJ zC6R=e3#rjuEFbDcOUR>@feBU&x-&QkT)6xiz1`|wz074)zy*4Aa)VL~s=QIBCP>6e zg;K2qsX!nS^3h!2avGW%>l&b%)gi`>$LaL@eO{k8kxGPc3gFPmCd09aKQPeecDuc9 zC)hzwo7>w*P(-7F&>(ySM9y@voQ*{Ly*_)noIZGHW#iz&^!(8HWd6YB!p6qj#@fux zRB3v$G(J%r9m|f6W=G4(kwSECEW0qCogRv>OlDRlk_Q%whZak-rBHsr8tjp$y{467 za1Gbk#r)aD{F(XmwT<%ACuVOS9J_sJ@>^&2J$rKb(qi%GQ1H|+QR=^dJ8~&-e7OI_ z$iVU8;Fa0TQ|qI*SMoQPQdg&gi2S$b{7)>0ZY+haECkQa^dBDSTSyp2gX)}Hmh~ty zUUkSJ>oxFAN`_KMf=nO-^75_qZA~=|sO?mGTZGnPWiZgeR74c_mzTVT;On{#R4GK~Nfqz;9ordC1Gvrym6#tI<@vl>u)iRw7LZG<+ zaxsTZZsTI)nbs*_(Iq?%24R$3uA0rP;J-snYpUY^=xgz_-^*YAsCfO8q3b^{U;Ek6 z)lbS-ep_{txs& zK#Vzx_y^@Xe=+s?KQ4d&H|rn%?$8In zJN*7{*WUe)(PcsYR(lW7%AKc5-AW6&kQ>`_1xXpC#?KC3?HTe+b(QgDy+7&y0(( z&!T|^d(GY+y~Ct&5&uEgmfcA^-WDTP)i;>T8o zPOXicUK>3)n;nfg_Kn1j%%o1vrLXKCIkT8W%zyF1zHc6#dh*!R>yI6J?$q+h@%TpC zePTFtZY*|gGz*NazYqc;|!PppQoOuKI^4P0Fu zI6v2aV#<4D+_gCZ`kZ4fX&H~2iv8NCQ{HFb8DtcduuH=3Y%=6t ztC4>T{K)fB=K=n|@c==Rxu5xl*8{qG3s|WbWoefknfqxp5+2};@ zXER&5UlRW{4vog4)i`u&yPh}#x;0z)=P3y<2tXHq2`rW_K69JY$ZP&3FX zTDyvR7yk!djGuTtd-mPpr4LJ&KPuxcaN&nT=RX)e`@zuJ4~iFlSU&&$(8)Io2kxYn zZ^y>Z^=CJGf>WxVL7v16{Nup;0{p}Akpexu9m}U4XeHOmx~bljGQVhDJm)+4T>Qp6 zL(hIZ`R%`5c;{c2zW3{W-}}wJcYeL}-CxhW_RkB?{bG6Jsl0Z;!qn>+S{*LGDEkPz zyQMwde4|EU)^vLvuCU)X7>wqU*^y##tehVyp;Zd1KpfZ7Y-)6DczmRg2HxztxZ9lZ z_MrYBvFUIH4%^TL)<^NBL!b8dr2Mu)Pfx1fnGE#C`+L)2Un1!7_u2ftc7sMP=CdF^ zCztTeCavFxREwy2qUIlo1j3Plfl#2&-P`L1DW}_LG@8vOknr7}Ubn~DAMkj*PK(vx 
zaGE1geTy2Ts*aS<;==AuN}O6V*bjhg+oi(wb{hPFuo^G zF5#IOOU{hO7bep4W69wRZlHl<%ef1iV;2vOU)&r!wwOJ$TDo{(?DSIZ#^H&_kIy`P zYW~jol_!o)e(UVgx3BJh>iEpza%i&{I1S7%79N|=UZ2Z6w}14hjq;P5LpN4)Pi&N* zSSz0$4L!b^yS^BIaxHy(HGFm6e{sfhan65kuK)CG|LOU_iP`>Rll=!ryo(uds5FC4 zslQv`(6WtEkSe=)5JiUaFfPK4yXtE(fJ`L+Rs2^Y|JEfJ>A^RM^@s%Rvq}$pWE%#D zF)X|tYrkstVCU(c_7+GpH30mSuEx*dzwJx$Z`Xj%i2Mh=Ot?S=|8fZwKlw}!eGC8X zJO)XE<v zVu|>VTfjG3I`@jS6lX+{oU}|GaUZx5IsbC*_Iu+m{A}U%f0%#spO)YJmz6jEY5ui; zo_qQ4=AZoO^1`*W%GXWDw2KzkUV`=`>h2bIV~wv;V>fzZ1Cd-JUQB043Wd>9X{=lv z9m2JDY@!So7@HhhURfL)DNdHNOB01s%p31Bd-W2xR^&6tLRNLO;s}1Dtlun)_3ASL zM>*!5%7&+M(Q!QUvGE+}3*LAD$ktg6N@tJ0-)r~uS;GMobNtCfI2`g1pqwA{J9-^p zfNFKn)>fNL29Ku?$alHH1G4tHY}OvVtw)zm2Mf7Kz~2+{+vcVU`xZwJu1y?UA3k?v z;@au?=Wc91cjLfwkF8xfI(A|;vp60Z&wI+dSC3BI zJUM&y@c6~`p_@mio<6tq)S3C`FDyTQVdcfkYfqn=d+NmOE0@;+{__i&lT-0?bLn$4 z$;UTFzPUd9^nsCE%gFW%k1Z#!EF@112X3tt@Xq6_*{gGrC-x;S&-$*+!vdEV0+*Hs zE-nSnE)JZT51gF~9-j(qj(8Tcj`3J`VZab~tG(SKGlU*FI0CR5qOKjcVC-=q7QsXE z^{zr4T%d~ot>*)fN_gmNcpp9jPpJx7;MAzA!OrWNwq^qV)%cI#&!9Ea=}pLgs`)=e z>Ucsbw!MS9qwJPq1fIzM3FNCQ_}2peDx03jfAn%9|H0HA7Q=NmvpP_-ZR68B#Z0P< z%~Z0ODh5?U@6^yherV8=YqaFOQ&*ykw^I9_&g}nIZvUO^#+}^8ox=LJitBeuumdj) zZ9HFGej+<_IWclFl3DkMrftr=M(-E06OfEEEh%OIj7l(7>*=S*~KkV)C^_oUX@q8u}k9rd^e?02PHFqH3@%ueC zo6Tf45nN~sh(PW@zh^KO#7W_D_4s^_UZ>g7qb+2^GZVR>uP5m3KDciJc4*(&k(J`v zgCmcfp8eL1{jWZG_|A>Br!OzuJ~MG@Epv1!u{Iezy*ebQ z#~wd2`NFy7Cyz|t+!(sGR=l>9zBD&@aV~mxDs*K&d1EQ}?B>V|N5^ihB(Kf|FHCx_ zFNLlzhi|MzuCGKMTM1uX4qaJ+#V#yF&dxPmd}dVKaq#crl%Z)WChc_*BSv5+X&D2TMtSKRcSwj}}VfrJ?bm;feCt^yt_G?f{8$IiJlX%fp3IDVt4% zQ(;fsYfJmAahEaWHV*}RhX$NuA=hFiyqFEo<$~*DsRNUl<)PTv|4-R_Kh1&d2q6g}lqK73x9#b+J)QHt_tpC+-a26S z^quLMs@Jva_<&`1xoYkG?YNeJXcd$F8g8#a*l$)2+4TXFQq7|Z=EZ&Rp|V)=rm$$YZLwl{NoO#v!jK}VZhLO#RI`&lX137rL>Vc>jYe)PRZpl+S=+1 z15VA%sC%N{GUSoZ4(e~rdvWPIZ}tc_0zCM_Dr#XLXVJ~R;$qJ@I1_g6poQbpQca?E39W(LQc?@1@r;Bt z2r9*fM@58$VO)rb@F7VNyx_7{Ibl#n*7qn!pfEs+&@-h{{i!_!xrdK{!`I}7HAoS z0D}2XS|acdwsIWuXC=kvB_|YJPAW}JF29^uksMc%6j6~BT6O6{)ur=%KUF%wQjBuc zlN|LljH{mEt7k=;S+ROr0Mkwhl%rg6fXQ>xs0Ko-xTTy_nN^h?lN@vE9PUAbWqyzd z=;#pmW8`?j-~51o=w_b#I6CoMX3@p+mdp-LIa}K-^N^JRx~hjE@iDn}D$PhB>ss3t zb=91zoc7A(Dy-gCr&ZLZ!38Qn{#IGh)=U+#xoV*h_%|q2X0^d?u(~aFug&4L+uT;* zU#F2`_@&V*j7F_oCXk6ZS}B)LY3-`XZ!JkDROT=mO2mYE6NhNyQ{7T#uTeZ;kxjTY za{<$p9`mq8>QgX#b^H;BcFbiM_n3y=2B%J>5!3Vv)?kmh*Jso!xM~?k#HV+*W8J)@ zuC^xV1C4bQG691ya5hLJT)7M-I87nv3AiLSy_G>}k_pIkVqIlHieAnb^yx;rb;BO% z?128}lK;`h;K75*=l4eT?hZV=)4y}gersI0G9bP?rQcq#ZOz;6PMWso9lO_kPjB?@ zu6A!Ny5B#T{OalAt2-n6Ykm7S`af(>z2BO8eRuTfO3(Hc@1w=;2Q!|B3*9g7O+C9g zx_@(IcR8@X+W+iY;K`Ej)mq=Dw}xM@^zFktW}J^FZM&1Ehhv&My~1@Dd(FwZ+bhQR zjc(q$kF(-tA}N};QKv1`DGP1HKeJe z48^nrN~T9HaG2x-p^nyQ@be5KBqgydRWFbX;P1X5nRM zHdPU-YsmGL)cP_~T|rx8Zc}|`Z54P9Q!7d@SCpldS7cOx?V%yPqCNvN--_bm#tIUT zB-HZxDy~Q)QW=#-r_tqixcv^N$LjXkk^Gp-Ijnx=#;a4I%FNnF`sd= z$J*ymJM}`?)R4z%k_lM^7PY;jwGQ`&I6P@>YsM)gjov8~G37G0L`0WM88Q*YpytU% zL=mTrNv@ak$ub_fx-gYNsPdVGeGc)kQ#jkJyt(Lou-^Of(ag*3iGzE?2X_XZZ}_$s zjkhNhyNi|=Ywn$Sa+!Z|2}y@10@8TDRi%05T%Q zgAv`+8S9g2?X+ROr!0M%M< zgI?GsVbsyuikd3Y%OEzI8Vk1h#Ne^mg^1Aep-}S0b`VY`5AEy)2rvc5!Ke}NOAO2K z;C<0vl;#Vlx+inH^v$J|g_*-}y6UR~3U z&VN+}&iU}!ffAtRa0FIbREGZlWoQ;b*X43-XhuS0*2UPoO9_QZ@x{q;#g}4BE=83j zhLa+D;1D+3Wib5w&ToQM&{5OC7dq1Rs{CN`jX2wHPdI}4%zuv1~hlC`5$id zKi?jI{cz&-*4X|=U~kRw>bCdkb<6X0=et|J-7EToW!r~a-S6)9y}Uc{*`uj{J6QU# zGxhTR*y9_0&o@WjZcTmhc=6TU@ttekfcpN;;n$lJ2Zz3}d%b6C(Yv?S|Ki5*`@54b z*M=U=x;If2^y_Z*X*Y*-yVKUadB^iB_MLIp%%pn^9VL=iY6Iy{643A)D7QUp8#78;7>e+d4D#6$;OAQ~dY(eQ$>xY&qyh`z5DX|woa(rve%M=;1TA|;nbQz@@30*0o`R!WdKD4ez zHmzOACG%M%F_*5CvKEtj*r%*Rz~yMq&~Y;&hO$;n`NA?rt+++yo+i< 
zhtI$oaq_Q^n;x$BJ-$7-w>fxlZ|Kdv!Tojb?vnBSX7`)h-nVyrpWp9!x#oCx)BWML z5BUGEJ@)OZ<-_SB2bvCp3@yaf1{1FtqGo}mO@4s2ibKf2y?f5C$< z!AD_z@ZH^s7wbb0rX0_f0=EbB>jBlBLEY0?$Fn)-<0&(O|Mmz9Vd*VD`(`&MXg%CD zA9LA5gDpDAvo>Ttqya6VPebU{kpfz>Q{Jg&)rcwOB60<{tDM$a+=k`^6c2Kea?;{c zlcSR$G6nPorECz-p<^F*WU(>fe?JHKyD$9U0wHJc8*IuQ?Tg|wcs>vviY4@maX3eX zpb1FglXHu*3WNAB!OeShYZY7|xcP(qpN^`^qbWde1>!LN2k{@D5*L;neLgiVEEBt1 z7o+nMqYD$Gi!MbKCx(|?3@f>Kq2l7Xii>BHtDDm5+cKL8S*=}}ZN$uWV#c9mwvjX2 zC|RvAT2>P!y`Gp_)0tS(6q{cfk)9rMG4WhPBsA?Y$UDS8o`C-j|7iA}#u_D-r9xux z-;)v#p8WFiRFIh**&+T@O3PuXB|%Fo1OCgiD$ANE?J~2%>@`@tdaKuH^IGhFJK{g^ z?>PiNo&cM}WHf0|{F)3Jq(C}!LG=nF$^jXdgDwBkwCdc1#=_*rg5<{hBz8-gjNE|u zkNVe*1se@d&m*EhR*3@NnYAoxu;gvoG$Ay?QY5VP|f4wGUx`ccu64ob%4Ch(3ESqFFidvdlemSMWHz%TUxB~pLVy1a{30R(s(hg+E0QiE z*N6ulWnN)cVPzri7^@n~g7|N(s)7OkD1rYQ{3k`7yBr&m5f_n@5Sf25s^DT|abiTt z#n94(kdlNS@PE0MoY6?nX=CSg^7070TmmnrlbhAa&F*AncQSK24y~P$*-Fi5Bqmq2 z#TQpa<>ZGZrJjw5hQ1y8{zu0zhh7kLfq%^Wk^F-t61cQG?*5i(~M@R zPT)6+`y6tYp6xO6+(w>5%N}qmd+gG=Vf#ox=d+3Bybd{+Afz-A>k6v#;%P1UW^wx@ z=%?oF57z<*_lG|`8hr`yuQ~Rv8XsR#zgn}rzvcY$LHCzi-CsZK#pk;_?$@^iUp$_9 zw>9?W{>X=ilb=1FdwGBS-OkMZosqp;!#k_J`*%hkt@S;;-h(gC?~XmWF}QbQ5T(J> z)&A`(p4(&QjY0jATf8}J+`r;^zTkPW=y`s{@pRhsa8$Y3E7%^EZVifV2H0yp#)^jq zTlP?|xhdDZv?VtaDbR$4I%FUY=(@VqU2b`YOV(lJHzErXQ!9DIa(Zh?M_oZfd3HrX zDs(im)8jKiADR-A2JPHKdCg>KOC`l8flodW?4*$~NRXm%VH&(30M2nl@Q;QKYWx_2 z3L@-mWK>983?zADQjaJ&fdBIR(%RyRhSI90a%4ex2v7@@A6@}t5e{3>!T4XEo12?@ z82>|q_>T$6hzZY$j{x`!FGdt5gcS$Df4(gKe0joI#Q%&&PEMgs~}nobT?i1R@^ zc7)AVkJ0Wmd;8puZimBfcX(|UhtX`)o9%imkD1ImtyZZ~%Mkx9X06|29q4iQ_|PI! z+EpyKmgiP;i~2!$uV+O1sn#OR;HKpVw05uMz>vyKkrbn2ko*6k9y24A9M(NELz%;Yk9t14?|rw~^V!ZM^8YvYhd+HZ{qEt^huyhnw}*g#fd9$O z!8hA8`10)b$m^|XIKtD7!PoaDQ3BkavfZC?+!`_688Pk7c@C}ywuTLx1L_CEnn&Zh zhoh?7-Mrh~ob4g;U8Fr;`gIrOx|<68U-uyXQ-S|wFLMzt;G|F4>6144u$k7a3A%tq z)S~6o$>~*mVg;kUl-L-20%}V#K*U;AFPP~3;y8(7tTWl_iSVgYT?M_OL3qk%gs*92Nz#ieo0kfSxpgm9LnoUaLro+ zt1l^OC<$HxH&&F?6c?7}AiqHUn;aW@DeCOys0*3VAvtm3`3V4jcu_(~as2txc;NqR zS^SyugfltqLV&-Nsw$^z${5;Gx~4d2>LR)dR!mnG(-p-uX#q)?OJJurQ!iDw#T8da zW)y@bTt0Il0*kFkf&MA}gWv~mbdIL^m{s6zpY1COXCY#=ZE~vw3cbY6# zgI=#j{*V5TQ4g&0#5}rG$k55yIw4U?Z&I_{4Sb@3M=+uU5K++ou}K(8W(RiUI9&}q zN^9^>$~g)V!>UvG9j1WG;>18%%D0&`R+G9f;Br~@8aW?}Y#x(*(5ddTNCqr|38!?f zTRG*E4%>OXCNz%d-6r;xA?sDF|F@822TzdO3WIR^YcxitiPwLQJH5(s{IF!gkOUWY_gXw(%S@K3$!p)a}^ScjXo z!3CHTcGj?k(PyA~R9$9avx-$KB$YGUN=S``Z8dofWm%O)m&*z+L5Qj_=TbpV@}XR9 zYF>6)PA2pNli>w;1&|2g2|$vM_>W!?7zS`;fyOD0EO2LpA8~L2;-HX-b5U`Stc^}h zfgdGzfc@^t~6+8%HmH0|9PEaM1Fw3f~hOVnkF`tS-MiTu7qhg zG<_*kS42}4kfb@Cg7juaaxLLvX+v~&N!X=~vtiLtxeFdbWB&VJ;y>_zH0cgDdqd8m zUJi$3#pU#*!t$)zmb_*{VLPF)gHYVw2`g%8gOvuYwG#BLT!vUL*Vr{0s}eAB2kb7N z-R{Anuug4KnH)wmKVf#a`4InlJ<49KQX=8Am}G3F34)YSolH`TklLi=c4~QD5=s-M zy`ru)^&R@Og6xw&{OU4 zJA3^Ot4`|Fi3go3ETN6qMN=;Ej8`(-EgN(42d$hQBP(EJP4{ZoX8m(`0*0(THZ+9l zcwN=Rnq0(xT0_>5op)oxyt~%(WFzq6ZqJvGN1m^kcW0#ABiw@p<*Q}w&u%+@@u=s! 
zC;h*C-23gbfiHLbA0G65`f&XDtv(dLukVe#*_wR$U;-}i^1&qV|8i^U$R1qp-n%pM z^w#k1djIy45ADzgSFji4+nlgY7#Q<5{<2%N?iHZbw!i2qkTtSe4t@FKuM8?(?yEI10F_Zp}!WtV~1B*lSPM=7Pb zh)`eHQk7R{#f%k)Sm+VQAp^8P`rB(lgKQ% zTvngggq{+&kjyV8^GYe4atf=8Os^(Us=B&r2pt^^l1MGqSTtJ1d6&`IZAaKQ+VyIK zN}-dR945El>GmT3x7i(LyWNE3U#*toEDLieG7&qNOJjHrFyb zTj=c#)Ye)`Qx&zPn%+^*AT)}YM7^BnG%AcLkwU~YYsBdGxD7IoS!u=uK`wGzjBcyh zW4Bwh>TZX1Vld#bYMchifJ-^*mX165Q+~;8k8;c-8glUZtsK9eHiR;B*1a-mpBvN< zIi(^}EwiN*_-`soW;7T2^^AL09J|Y&H+TD9-}HWZ*Z*Q!|72GDc#{9}it5dZ>1X%7 zzj!k6?@xw)^OeX>R*N^7kJi4;K zIk~ge57@uFH}NFc`S0Dm>f2fJ-k-KDdPLJE=A4DS>=0fD&s(=>(ayNlBYH5RcsQ!K z)x){vW8C&LZu)6A4s8|4_tIDW4A_c~vFztA`S@4doH-|Z#>t+rF-Oge0RttVCHCk@ zPI)_?00ByXwqjyaQF~oM6G+R-(xF=geX)w7w35QKA_#`#8W$YenU_<6e+ZgijDf@% zv}r&GasC2|Le#=RAp^)6!0-w{K_(>Rd^lP_vCy=Iz5^7cAUO-=!=UyucAInaf3$*h zpgq4Rv!FN&;_pRe1^Fd8Ir$*22Q@NImCs%bJCPiADn0B>X2jW?==1rp=L%!b6vmz| ziaT8rdlCj0$RbE`N%BIPx|pFYVQ7mP!ThI)rY@u@3uy8Jsx)Y1QBEf(tBsM`NJ*+8 zB$PDAW>-dD%03?!cl`Jv{)5cVLH)1)ivM%L9`D((@UV!OxR|8W_~00WQ&nka1CZau zrxD2Q^_|Vt^sZ(uty3f8xy)FZlNnV)n^xq~OZ-+%z;5tbbZUt}EfbqGnqHT~Z8CH_ zEQ8$+l+{jyVAQJ^bBVB=GuNjc_ew@w!d^2A_=guP3~KK$_b!bZ`)vXhy9M}Xx0QEP zWU|^yERxQxCHLcH_vhQgpWW+wcGa+lw0}vpKPP!UtNiSi>+9XVe}6vq{o|qE?Tvl= zc<{rP|LKP3?ba~B|89HiJqm%xbMGI`q6GMCZ|TL>^v-(U&U*juje!@N6KIDX+!}dx zXY}bZ)qZafMh_SGU-a_kT^xWPgMe`>j5%zk4;U%EdXh&?Fz_2CI1T75#YPaS zfR5VymI}B)W?e}}Wl?%*VLBKq@^gZk$yu4HX{kvm$+)G6iHV5_4+F!@=?kcHfq$?q z9jbWXGb}6w;D^9RL?krJg33*oQ$cl{l9dIG&6HfoY-S~AWv2uU+?Cls7z*C}pojy& zpOKNBl$sugBbvmh(}`inQ$kN3;{S9`^x6EFvjs7y3u8|e9fJP||Jm)5T!O5Cq%5MS zVTEKB3_s?R6nGc%iPC(cB)?0P)6UIoVP3AMBv!P=71Ty&6l0ooJ}l<=@smHq|KR?^ zzn~Ws`5#w|~KI#$- zJNdn4=75zw+oRZC>06t!_gndDHb!q%%%&1Csa8a+@~9}AbN1b9?k{&HzTKPJThc$8 zmONij9?Z*MFX_Ix>;3Lv^!xqM-|mn9@x|0v54vC8cD%fYX%~vz@wX2rzT98@{OQ7{ zkLTY$oH@8RzO&x*7(=wXqbP@8Y+~VI@a5gnN7wu}M)a!#>h(d*1C+pHrmZpanoqjX zEx$jk-X7QO&EZGZnwN2ZNVJ9|$I-R!A+5NHNP(`oh^T)N{MQ3);QxAnf3;h<=;hD3 zInxf-l!HCx;EY-sLl#<}f#TB;?XnJxgGH1oR!14FwG@RwXKh{!s5{CttBTXgaI~HW zD%zkzEb@Ppz{r1&>R%kiqxeM%6c%y+jbWci)aybhQow_TFcA0C&7Q0E)@PjCeqE1*9C*>w>Ar-Jzp z@DK2Z1i^nHJUSvOAubNGbD3EwMP*r4^|^J;c?}H(O^s#PBJON$B6YUms)bJCVs%X| zQ5uw5MCV^6l?r)0Cie3|#n#^1SYKUH zhSQaT+;ZfD#RYH#;J>Y|iqzUbX=`k*DC(%IAh*>?xY#NpD@81WQV6pfX4*uV#L=JM?DNvcG0N*z9|=Gx>UF^2>vzFZLJE|9^RJY;U9g z+0FhJo5QGx5AF^JZ5GTX~Y4Fw%Fxy_Z?HO1+N_`jT+ zm4s{bBk-gDgXACMEfl{;_y;vyOmsNt1z^aFz_uC{8H(2-4CAn<5I9R>V$|j2xb(|t zs|GDSEk5l(TLKcEtW4iHz`*S&^smqEF>V zpDc(u4l9m5RvLS}Ebc^l11qzMlikY8Y31WHtAz{8Z02M%Gg0lPH__7@(fOyO)pcF2 z?o2Fgjw>jS%E}E-x_luj?##IhXnzJbKaYyvkAHF)|9=2Kf`1VFSdK-#9~TvKDLx@B z1rJzSAtWfua;vHeYAef|>uNi3%-Bw3lIR>-5c~>}T#I8~mD;FL=#&zbRHjiFkOcXh zcCX#(bKnU;{C9cm7Ml^1U%6Z?K;qA#0sP?PY-?%gYHRLjZm6#+2l(L#<}smzQM=?EnutMUbTv~u8^s)oO3b!srEZJHYty+cT89DYy~d=HTGTS9K@Oa|OtNmL zddO#>uyfju8-*F02A+C?=EVT~=LyFN4Ct=k?Uh~tC{~YH3%oPlR5AlE1FPwJ=Q=lt8;T0dMe(s2k zHDC(j->;=~>!?lzQO|FcQERz`;5eYGJ`eN0y3&mD!jz(%q}(wcix8JCMy6bfMm>v}LV8McT1s>( zt^@vVhd<&*J}or?1Z9^mCqNJS63+8u&n1MNNWAb#^7&6LUpST?b|Ne4WNy@n{OA(^ ze^K1=lDOkX_)o1Pq}6w2G!S9wu=+0ih+kf=Z%?UjyIk7}ORZ~3scuZEtiM=X8IzkE zo|<|gKK?9p`>@X!6Oytnr{-ls z=rF&cq@)hab=8esP3^Q!GMhr@(Qz;)d*n+D@u`&pLquuQAx$NE_@E>%6 zL;Ra8dbLI_m4a}DNv9+AZ)s|*ZK$oP!ntBmejQlx>Z*fTP(>-gkN6M#lUwS!q;@Hr zq7X2oJQ|1E!Dn=-B^;k!KicORAMnkN^h^zSM+5d@pLxU&%-be=&7*$(RKPgtH_Y_g zW`|s3-4@*By0u)dhBfJt-kh>-Oj@oDYDR4AS-)g;)PN2TezQ7Znh7Y~N~%*v9WaZ` z0)kE2WtVi^nYKMzaz9#de7`sM>;0L7CFABG`|Yyfm)pJH?Dqfa?et$hUqt->u!2FY z{{3y&+pU2QkEgzQvG~iEOP}qGeY!RL;oiWzJH4-O`d@5#_gCEqH~eqz48Gk&=0CK1 z-M_QsetDz!4LoAKXYaaid)~IY=-isM?9AI9U$O1ZS+=Kiw+2L4o#Z74anatnVCz_R z5SAUC=>MSkb5#E#{;vhtt385i0TJT=q7Nw$ciO?3_X_9zq6rso(8}yFQu{5)_c+}; 
zxFq z90%Sq%-WJJVGSwtV$|7$kmHHxK0d_%C+XqGvZ7ApMjg+OK3))etSJ7IlDJRGVm~R5 zJC;`8nBLfw(bST8Xc>{hpg^kL;V;R%}{~=5&uD6>!aY0Dn2SMA}TQo+enu(lhX6jvoOW0Cwui>$)d%~|Kx zyz}0a>HEF;KfGFeJfnIzCj9({?blC7{_V-Y?>?RT+n3jn{J&W;yj;`)|L?c@zj;3Q z-Rs3KcSnD=J@mzc{?9kN-{17Sz3F*>yZhbk?zgvl-fa%Py4`=U(f8{1(5qX606*;M zvUmH6{lT0S&%yH>Juh$eJzlVHPlFUiw(MhFbyJsIq${?L#Y6C4b97#FbRqu_j=;Pq z0hnvu?A6{M@V|sCs9U(;7tOeMlTOZrn?DK{u(5iL*odYDv=omLTeaT>8!aXNfAI40 zsQv~1(cMDY9|?*KqwPu6aX*W4+7MI=TaoW+4KXL(&&FQw;>}J%z$p67cMWc~A+M98! zTaE*u!<+}n53I7ZsJgTm@t@FCPwQxA5?a|^t#U3M1uqI~w0)5LIE)HEZeCobK9704 z+b|W-j(L^S-P$Ridc-anF!4t$q6r&++9_BWwros!mxgRJ-TEPm(66MA_d?RREe9ytkrGN5HoxfOGMm0M2O7&jl>7`QWG`1Z-%A73x+ zPOF~HslM6t{{F?}_XlHt{Mq9F`uXy2_C}x2s$MK;-`uc$u{->W=hMGFbXB)^L2l@qL@b|tHc+X<0{{N@U%ZKf1M zhb28OBRM)XDLN@3{BTDa_=mzyP+a@aaQ_?Re>gIHj|e>-b^)X)cp4xYdld3cTzgQ^;pAWzfv$z~~ zuN!9b9%exv^gmJmI_x%!(V$nzWZ(hnXm4w*udl4Y@dLyh^U6vJU=^hW4OOM>_0@#t zI(kSgwu{(Ay$oYql|v`R@XKeA^q8fX=k{Af!#2s7LwKcIx!kY4-ltpc(Za6w zXl6aqdAInQU%uR{zA!;naUJYZ^!n!wOe!A@6zuvvpubi?nmwKfKHwSR5 zvWk-Pil^Vq8gYqzdgff8VPU{L?vktx>EGTNxi_i*_Q}K_o=?BOZhp6@{rCIbe|$6d zo3}H6`C{q+{QTN?kNY0aNcQJdU)=Hi+tZO>y_)@gfASYQLqEUQ^VPk;S6jVb@AQAO zJNnI|iLZ9Xzuukr>e0l{cgDYYJoV+H34s5_M&HvF-_z^fC)eB$7pxEFjIix_(~~9p z^JUl8lx}lOd2d{KdsupNK)4z}tCPLrqhsI&%mZ!CYErSEJ6pNp}xMhrmCX66mnf4 z>Mt)VtOK`IZ7}mEw>2wyl4CoxL z^eR^Z@|%6?dn3lJamz!bE3=M!W5!$ks+Atu%~9>~!HI58+Ix*mmzvaX7hD-IE)7~1di2+OR0pd)kLJzaZ1w-) z`S>??U0<$QezVi}=TEQx{=>qbKFt5^v-z*LeLGXaSJ!mkZuNcteCk)P=f8b6@y)}b zU+xV50o5z!1?Tmid?0dC=o3Fr=W#8^qfZw*WXx+VP zd%A4jzwSJ^?%u!Vd@!kdIAhqEHQt}l+!;~a9+utcB4$q!(v!|V$F&lf(OdGbq&p#E zrYC?bB^6@y@i>-8oJWK|3Gj!691jaU5fySWHtckK#MuPge?^?dcZklzB14Zyg`J3v zJdI2~G5XvkxIpxIC=vX3mWZ4u4m=WPag`MtdOYsJv4nFU0sohN#D6X_|L9|fZJ$rd zg2)eoA674-G|O-n&g#^#J8>neV|M8o1U%X` z8~ArP9r&9^_-E6ZL>yu@H8$1P)>T*3R+mE~D`>S9gqB8fM++E(2rYHwjz$i>lf&rb zGdiV=PNSI7Z3tdL_M3&HHqoS0ywI&!AJ*QRG(VnqVncp!*8Ui6lS%!~l;P=&ad%Sp zU{tv^soj}1JeW}58IZ0xnTs~cM!#rh5=-^w&o=sQ^hlniuxshWF*V_r<#BV8yk!WPP$|d3MeI z_=;&~R{vm1gNNYmsPxvLaHF5U8eqc({%+U0Sy#I`OMcE3AA8=5_>cZSd)&?*K;>&7 z^&62K)z%bf1puQIqV_PCNl3N;S2)xTPYq5+dUnM`S?}Nx|TJJ|^M}@E>~N z*o6zn!a|NmhMb5FI~9vOC-N)|Z$VTjz<)9_{6thZ;{U1ms53zqi2A_=p#AYrEg|My zEWjUiIy&q`bjT-h=RX1dk^fve|ItzW|L^eMuHg{$JhE9pwF&7C0mCU|y7-Ksap-Oi z-NmLm*)%(gY@rbiGRlKPK(uwrB8`S%w{l2;Gb$~Zfk02X{c>&1SfQD zb3=6phG}h$=!K#zCbl;b+Z)N9&BTs+QfC97LC^_kW(nOYqkGisKBHjVA(?bZr`+O6 zJ8Rm`Ty%0)-Mo8!vYk=YqjA;VykUP{|6<kc z%^CJ)^slelFj}24ktd9lD{gVWmTTp8x}+q#n4qE6m>6}VI_9E7bgM_cJ*s(q&Gw6} zz7Ol>ZyyDI`OyE{z2QH-UHaSStAG9M>Th0*fAzroe#7$JBRmk3e|R(h+gEeHeKGsH zmvg^+G5ec?nP0yM+P}Y?`ToW1ch6_O-JkyDv+1uNPkgyE`m={4Uu+G3jt0@Ko>%MM z*EhTetB(Cu$HBVm<%Z|QnrnaA{`|V*$)fquykYl>5ue*L+IwU2JHukQz*-OI=pp#w zIRGzK4at4P&7u|qze&jgO-dE5 zrMR^+>j(U!{zZO%n3*H_NAeSTK8Sxj0pTHt^QYhxFr-FDrNIyQk2o0}c?wTJJeoN_ zz5@Rg{P^XM_&*aJ4*VaBJ@;|^*`Fqy`Dx<0pZ?GICz|AsWh`HJuWSXf5_gQ`1qp}VE#n@bK#!m7@q6t)kHhP+*s=6)RI5}x5UW8Gx}yWpA9jRx zz@AKKp%L4tU2RNKC#9pA)ZR$#Y$Aa^sI#6*Xk>#Im)U9LlWkI(SHtmZ+5INokX110 zmd>~YvkvZpleg^TU3YRSf;@klY_pKc2HZU$nnmupZ249uM;F23Xr; zvS)L;R}03clgdZq>Z=Y`pQL3#MfJ;BQbLVOO7Y66l8zEib-Jdb1eDA7`{X+#iceP^ z-);pyy686u|!9pRNAs-4);;D{xps_0`<( z_QwBxcj&vFfq#D#{QU0m*e{+;;5qp2+4MKNV_$3yez@EH{!aJ%Ti$o;uD9#1S2(DI ztvg@fbHn}aj{mdGUR+iltT>)7nPE>BO^+9h4`($Gr`21N%KPK;+e4!DUM{@g$X0r} zD}93N=mH($e-={!7k3&Zu!}eDU=NsRi2r^K$tG0`7Ij5r}SOS+r zd+svwAE;>q|3N9e!{q<(`48}q?~!4rQ0Kys8bv{F6{T@7{-2DDJQWjlIyUM|eDv9b z=yRw7kO*Cj`R5k@L;ODzgI5Fj&-qV)|D*VyeBr0*;UDKj9Yg0IZGR*`HqiDXD38W3^(9IBn#r6)9t8!BjJ z*|iz*8PR9MPvPbTInaq?pgj5su6@s;bBkV9O5EkV)U2|cg4%-8mZC~RMLn&yjn_g^ zl6Y2*#4QxNghDf)sbbO4krcAYJPrj0DrG5T)z!G;QW-$+6D$GX0sueqA4j*(h9|(| 
zGJ_&ngTXLZ4p}rZ6eqgKm>zewQHdSEd=UA>R!UbhlibcGwR0&QY*H(>-Kgz#EMlX8 z(IKR@Y1kcRVekqJ`0q9F2Q8vOn{e1J8np^0Y{D6vaMHjTl#%U>T3Ji3x~tgAZLkU2 zto-&K4RhWtzdvDnvE)9OH$R=$?oO%qX0U)Y24wP4=LSwAC+rG7kJ(fBC@s&3)Ieo&Svky7rg-*{|;eK0`3Sf=o>FU_iJ%D0;bQd%fcO^k&b84gZ@p z=c^Ul%N6sRHS60o%i9$*?A@B}?G5|uHOJcx*N40Q&+hkpxYPY=-L-es{OpGxc0ilS~QAkHCu_n8*H0>zq)K<3H@5;J@nPN#LK+E#VC) z1w$&ym|8likxi;(Q!4qCQa+`SPs!zzGWmp5HYSk{iv)dKrk6&s5Ze_^HLUX7#>|AQ zApXH+jU9;N$3B7<9you_;i5S%JTW~n1A1-Mg(Xd;6@;={YE?6*fgo*X=qUmRN95)S z99))`O%zc(xl|ipUO(zk` zSu{O|GH#bWn0D_idiEEsuddk-7L1Q4RS!m#y`pAWW1g&~SVO9Fh=^uEr<&Q+T9QPn zOz$ebY^K$1_6iT?G+%7ke*2{7cTanM|E%}-d;NcUKJxn)<9~iT`{#Eve|R(XyO*%} z|NY_SA6~2-Ec&lGg|`N^w@35~ZsGk&%guiEgofP5Z@Sy9cr>biGNIcZmTV0RcPEAW z^OEQDvX@tsua-1#SB&r1tsibWa7T%Q$`6}9j0|vA{9-luC>*TV;RUd#OU5Tx3~&Lw z0!L2(3V|Dayj3)Xg7_D$4WM!nUF#Ju_ytGxFDl;=D;-ndK{Ep*056^ZS-U~dq-0bH z2}SfKD5z#Mm8I4dUp}gTf&YJu|8N0xegJ-0co;-xPeK3m$RZ<7N8ky7MV*R_J{=ts z#D82Ys8N1EKEC`@!=DLS^kMxQcH%$B|NjC1JreGaTrjGVOsJ((YU#96I<1h+DCE;J z`IJ;PDUnZ#<>Mmxm{2sxVRzGL4q}I@xt1Noe_Uqt>2Neak@-Lh?GXQ`KROo%s)q>B z?;`jY)s>XBRaBG9>sd7|!bYO9on|0$Yz(%IPB)N=Qc63_&;j?pz{OpAL8Ew{99c?{9CLREIo^bB5=!s5R^^sVi5^!GJ#2IXOY`6)fHp0 zjoBrowH-3NVMm2v5|IspF1ws&li~!1Y7J~vU=T8OQig9Y_B_guf<>-+P;z#sN{{`j=- zPtS(__I~QmZzlfuYWy$nX8-o-)xUjM|MK4Cl2fc9)Yhjbc;xKqZu6*1dvm7m`k=cX z*J{kR85#MOOT0U%-s%7WD#c@J8G`?mMn0{Q%_!wF3fZhoJ|mS+OM*5fkxhuj!+dTJi|(R!XxeKzRk@AX zaaqwP!;V7&>4^FH_@Sclg|M?|`yb)IxS_NHRE^Z?MoxXZxS6Etpy|38CNjfFBC81m z5wV?1Y(rn3L2Cm)GAdavmjm@Tjap?eDD@Zsm@F>m55eDe2!2cf+%BurX0(6?P9YXS zUxmYA(y;i?rjii!0Hwrp!QmRAFa4DG% z1;egjn8ai?w^Kp`L3$mxvr$B96B0X`%1Y|1YxTG{bM@=GXGFFUzSdt=Pq}bDwx*(< z$KWt2M4gbi*kgJGMyYw*{=Df%KfR<9~fO^PA^Gzd0EF)BCyq=c|pqWly)9;FFPQ z)uq{SQAROmajJiPX>5Mfe|5ZX$ZmAfJN(3&``sGIxIdcEZVqCljr(|7bTBV@bw&DW zQSs)Q=FN5e>+6QM>$W#YgjTJuRxPjBZLekc_D-QQ>C}4f%iY zOLR~U>`V-TKY)#{=8yRQpTFQwCd8gczbG#JM0`lF|B3t`_3u&skKu1Y%&~vW|B?T| z1Od6YUnL*aD8@DNNsVGkt(Z|MW|XoynQTrPbb+83$RraI@ra1q!(q6XT{=P?w>Gya zH$FS|WZ1D!PD1wmUypzC)04-7yuMI6ive>HR#Gc+k^EG4R@YGLnz)S};uf;1gP|ib z4P?5GOi__Y5^@KR*vbTHIH{dZC9s&te^^qHRIODT%xVxhW5(3fz z3Wg`ZZp2+8SbnfI!DVvUbS{g|VNkeqtTcD=DD8Yon+VFkv{s0}8@WU+yB)Mwel@p8 zFYu|DE(OgjBI)@Mi|mv$Tcz|?9oVIaFE@@DCGYP`yc);z#k71(IEcgBTglR9{>M^ zf7yTFU-J+6KYRjakO=*V|Gppb&#TL8{?GVF6@2Q{ao|5TG5T`u<^1Y`(&n(5O<&MFMc}vtR-+a58~^BC1_3sZ~gA6I0t{v^FiP)5Ig; z=J(LZhlE#PK4DlS6b%PMk5(D8g-fU&(``z0h_~QGO0{~P-;}mWHUWh5O3a}ax4cljG7!qO)jT4 z6JEe-DrqdvXfDleEzfN!&Em9__9!|Z%^1G8?f%Qb;9p-3{Q61YPj9Eb-5+}}rNb!g z)BD3uSN%PDqD|J})v|i6GAVC4|m+P)KYc{l1KV7%L-mjV7ApWly@a5ARmQOd#?{B~tOm9}rZ}1ST1*-tW z|Ggz}&FQwLlt_Mn|BZej;y+vf_+RN29pYaG{4WnHmj-0>ed5`Gc-+AQ_>U}j5v=HN zN?UDW+{OQY#Jy#7+jq7u{Q1n8LfWPjaLmjMF*BnKk}Sz$X33H*Sxjb(DYj!~isLwr z9n1h4xM-1FYo?aX(pX%=bpLuyzA}SOG|ZOd?t17ji_-5t* zNW|o47U!f!h5Cb7m%X!lavqi-RF_rM zI>eBZjAET5lX_95Ay->!pLWVm%*S z#VH2ae+dH(np-?dNhvDDs1kIyo4PwK<3sJu)oLb8%*KTpsfEKv@ocMdp;LLdS9$aR zlsXk>h85>VE6$Hqo*R*`4Tw*|ThPl{@8_K#6s->k)&>Np`-P|aMc@Kw2IWvSTImrV z?G`O{0{`b8Y~jEwFadKBTP`r5Cmt}6`%Tn-GyOm{vlpI#YG#*(4)||Tk(;ZKR{&hV zEH2S-@~W8GN=CMnlnz-iy(pSouos^VceI00Nq(8JUWpMNQ9*~0L{4xmO*!-R>T&yeEh))VmG#J@A_yf}R21jSODe*^zzI{Gi+pTpRP zqyUUoFeam!No}B0>u5wXxwHz8;GdWrlJO4z))Ux}1;ggoM+p8s9K6GQBT}QHL7S@} zDWN0U}1 zpt(jsnLX-csM7~rzdSp>z=IEJUVK)y{LP9 zs^#wS_PGxANP|*N&X<(sRj{y10ZGmy859DQly1=q8!YnXdR1>{jlN1i!{wJ`#FwV* z?Nsr`>MP*p%WRu`u}i+vUvYX&b$&{7akA>tRMn*k#f4GXxna@S0sgrG?)gF9*#X|# z0q&VT-YK9)2Slg)r6>9%$My@4?H8?diH@}M7hAc9S~zpi4zo~348(qY8T_kS@!w~r z_injBcQvEa$Y|400sl=(VtoZ3$pQq08cr@qqf}5cg!m*jIu2d|Vs1D{UVsY}B>H7U z`y__BAr~a!qzjS@AcH#4^Md9V%=y5l?JmTuU+i{&X8RNV|GQ5B@E@nI5d7N#{y+Qg 
z$A5yBhTy-J`*Z#eSul&<#-t<6he2N(&iq`L`s8ib|swrt)!llCnY>xkSmrsyTQK7cnIR#7_#PRHPQfFUz&Y ze^Oy8E-wk4lUNKl@=)0{0vcPG8xb0`V+YKyx zCHQn4AH}UMW~zNdlVYM1YqV;WkcCQ#Ox^2?5T0mQJqBSk@2~)`hyc&Llo)hoOeHPr zV5jWryy@7bg*T}dQm9SbP>}odD z87pK0GO;MV6rQTYP#q6HSSy=qfejbM!amuNPU&i&MQo z0eNx$X_21sAubVq4gnrEp3X1}2g>Krt_9eC0%o`S?HB8J2>#aiZ)HK?1^;aCzY9o> z$ni^S{Qm|1^DFgF$bYT)r`2$N&VQ`wtJfu|A-3KM{w}sqpxhfa`JHwj17&+NlYkANkV6&lx9I4B#V@t&&)02qKbKt z{1)cZQ8~nX;QuMbSyue#pi3|mGLge%2}EK!?D^`pz^_&6z@YxAQfjyGUj_JwL!kmb zhbdq)1WX#AO6HUCYBSMzg?{$AKIVEabFG(ss+)DPn*(iv6Fq=`{z?}g z!T&xEg8wG=Of7BF0##6QpMiY9NJ03Ih0$%Mbr>j}@M#3_gZ2@)PXGuc zD(D$vd?FVe%PNkhp(087;bl2t7+^sOfmzW$$>HwN$P-}e1NsaOUjd|Hyuph6cld`d z$PfPy@DEP_nC<6)f5)#scKrOe*7*md{`2ej$7`r$Ba2$Y{wMsi>22&S{IlqdEGjgG z>sTZ+m7pb-low~xGWO<1xP`fHw*iSmr2Yf4{~y3zS2$22@|Ac?FLakP+R0^e14t-%E>^`z2Tozn|5z;9FGLC~UVPOkc=sZ?&7P}~&nV&?? ziesSC_*hUo%mK|S2>w+}H0)vVahY6PCcQX?T#!gaC6?tUmFC13XT%Z;K`aQxCzkLj zST+bdV^Hbwk*;<-cYXdbXcqVcg~cXir{>_2a*0t{Wl<<*3`USi5Je^zI(vk>dxS#$ z6{hzxlaph@f?~o0lcGao_l87-1O^582K#%52l-|tMVIC#!JOFAfadgQ)s>m5+l$)G zdF}ND^R4BUJIA|58wE|3#7c5LtsuEHBfgSLuHX=v=-kqbr0nQ0INqHe?q3)mY~Wz} zjr^f{X@9+Vq?SKk!#dQ&S!rXQ+|OJeV4oZ0oE_v|80KFdmtL8!xH6--I;+~4R&Gqn zua62Z4|31LE3lt&s*7>5iv@efC%U++2>yAniMfS;&Y@=Zd_8@phB{)V0{?+00E+*B z|NUlqr;*xWqP6R3t!gR+f5-wF6lDN@Fn9u#^vp_XS~)S9kBw!QMAHl5{$K=jgmGCR z=;WaMSikhWp7Fshp}uzh9(&xKcK}?Gj+M>3CjfZ`;BWr{|L_F-qPYty$ z{HMeA|8)H9H_q1j=fA+emP|I$snsk-J)7CgVYNWQz91#BB+ZKdbV3%m zKsGf8(q0q;Rl-4Ic(_s?5zQ(qp%Kw=MVif_!|msCIae-|DykG32>wX?uhQ#OdaX*Q zQRy`*U>D#5Rmw`aOePje1)K`lRp)agEQ)}F<=~6hn0$6gHej9#7w+;BnFYzrqBIaZ z1lcz!8Kvgnp#H2<^dU5K&3E+&D1UdniZ=$7IJq3xJKy=HT;)MQOR9uovhN z2qWBfJ8ZuCa?h?kE>7+~e&KL~IV=Gcnpzr^Ru-CE;vE>{=@StY6rGfqk)K@DGeo>WpUdQ1y+a zhCwrVZl7$rRnY;bPU$85(rj45$cpgIjtoR4>?P)=$mzHim7w1u0ae4n8h)>dHEdx` z*D;Q?vY_Dw#OA_?;L@0IV_bX%Y(jEvO13$tys@CUHm|)ht-LZOxdcS$0DG;6d8&(k zvWt1VlfBx>KGwxw>Etc9bB?re7F$?{nwbY1=yP>6Xao!z$^B3WvOxT&_ipi@P7}NW zw0*D(rbZ0nKk$E(5GF#=YIaTqB}Gn772^}Q=xBHX=!H?hf*=vbq=um412Ut$6GC0X z{T%{4_jtJOge`R>Ttl9_8@2#GK}MHhbl(Q-GjIVI=5&B!{yT$m5PXMEt3jO;s6G@( zw_3w6{{VCdX^_G1fe{E8fkW^QBS299|EZ6^uJ7DZHENO2-VQL~eJsFpkLV(S~taM^dCOI#UT7+if@B(s~h)NW(@jN!3%O&yQ z@U4O+1p#p-Pp(A7eUPxP1GcZ$Y1LY_TC36`pPDM@1uH?84MebDSDmfkGb?%Yat>TZ z!f;`#ADhbp=98C9&q)CM)AJKJAbyC=pcf_c@!1L{M$0G48E8H}mx;?^l;+SeSqwOC zP?U_#jKpL_;IpGB`EhJaDifVTM<qmtddgFL}t9+(aF;=hWGl)Ah zf-aNvK)q_H#W2)nfE~=iI_Xfg;DCWSV5CjeGY_}1k9YId`}yZaMd!yP=SQU%$I3Ux zrJGZ-+cWYz^NPENw6_i!zy&T%DdAVaO-^>xPIOUEbkad|Zl#L@_ z8cw!?mJTk!!^W}DF^u9UYQbJyb_hB>v;bBKLGwP$Ei%vvj>!AMkzd$kwEYs9YJ&++ z8>m~tBET;g{L(vv-}%nq5B|*TK6i#R0f^IXWCYaq^WQpd|G;JYhi+ef=)L13Sbh() z|2Wd|BV_)~;gcALPh%Vq7x?$_Uxrp-v)H-OMIhAg`K9fbzOBN)_w;3CE@yPY0^ zo?*$6aj298bXp=j-lcFlH#5B~H-}tU$iza}CyMDLF_R?b5RrSgVw$*uAyR!7d@Q+Gj6{o^A4-PgH1plGiYXGfssO8|ZnV?2O$Pq)ag`CeX&0v=#F$?3Vd6CrI zy{w{G7CN3=n#{qa(J^V%lI+}s$Ov!u09Pkp7e`MQ2M<>lPY{^&433P+EGVL)(A1cu zED+{%_woq>*at)K508K=+IvI8!a@T4{lOPfk`go1lHhO*44?{0d4o-|5j@tP&jE39_bXH>XWPwh%O9^E{%#VPe`v!Ra}{@yf!7jIa{#>e)YX2uh{!1bAFy%svKp#3I# zyB3DVK@JoRcQI;ZSd*Yw%g#6OPzGL}nuTP+LL3Ai7zai0UzQCMaiOT>z^qu`q;U7G zjxam{cHXYL+#SA#TIf3$c!%>}`|=L_@a1RV2VeLf#Xmd*j3zKPB){-UYd|JKNi|fw z9>IS$Cnu&P&MOkZ|Hm)^jm-a8@&EB|1pi*)aHk_bIj%S*5t|P9PsL|tkqV$LU&6%W zpjatnkVGt!kV_H>C_)KCEN4j|_`}UDwNwQK&-d^Tlm}h_y+)_gs5L5;S_$}v3)c!M zzY6d#U@EvY;K}7IoP<&$fzlM5G%d;C6s9my$&8|84kiOe-9*H^Di%h|!x=?o1_7ax z4n-lz)bh3LB0aBI$3m&;xf*(ohMujVWy$bKva;mzvScwnnT<|j6v6rUJYqox9E*>O z2#k#g-Wwj0n3$MfP=du%i4=ZbAt53v(Jvqb1pd6eJpFzBg9Ab%!otHN;1FA&AN=Lx z>+kCm6&{ie&4t1YsOp1i%CQmi=}{FFYVIuRuTLvM|LtfqdDeuPHRI-yz)PKJ;T_sv 
z0SoL?ZF0EzIM^f~Zc~l|@_P)Rc6hK~ceqalD_t<>e14?j+<@@Xu;ALHF*z{zPDO`?|9SgqqSG(wdaNhV7#L|zq0${W0-(O2oNF!0jFMe z+wbxWLhv7xpA=h!;6E9ckx>TIKE*|_s>>!3;8wMWOBV4dVgXeori;oM;tCF^+=9xp zLM2tfr~_0x-|`=oR-@5r0RI|@{pu>UN}*E9lroW0!c~gcDj`F~rz?4sN;Y0jLzkBo zh_N{$OcuW+ol~63E=d8^DG@B8z^D%st>IwILXavqT(iT zaf1L=%gM8Fa!qV-g!D>cvW$>gLCzK7vZ?4yLP<6Tm64sE0GnkQ>8Uw6dD%JnaS3UD zfe~Kb0X{x{9$p^yaNow=BiJ8&BqAu--_z6G+uIxXPf&orua_I(zc4R_j?dj^LQXm^ z_sQ0VWLKt?H|Nw#&BSSK!GZ}rZ780$V8_j+vn}-L7Uoz3eW;GvyA^T=8U$0I9Xe!M z8nYZ3x4==bGZTjMuns?=yf7@jIxe|2gT(wBbBY@a>RSuyI}572hqd>Y%`j7R|5WS! zGaYwNv}_(WT^O%e>*t^7VjSP%|6BOyzvDkY!+$+(vW9~6!b}wC0zejwbOB8C{YDz# zze7i9RRQ_ITf`+6FQ8tR|+_7!7oKJ&jUBCKw24B|4X%8;6PYjrE2V;EzB304CqIef%L9 z9QC$!+T$JK^ArA)ldu_SxNKlP0DcUcgy+(U0xnf31fz*WG@+OwDhF{lwnQPSR7;c) z|6#%hrXAE!{D*=kaDE^^h-p=7jS_wg_^*bQvK-g3Dsz^T@dTGHd}3je<#ytjzSlz(7wgFBdm= z7hppkU~bON_Ivj1ba!>}_4Wx14fO)CFLzhq|Ng#yAwZ1$ee$wX@R&R}B4bvtK_Kf$ zuY9?i50eSE59*G0utqC0r#1NpjU@-Gv15k9aSM8~7C&Oa3>fkKu%Bxz8?9$7?3b;K z8&AzQtS#+33r8_#tJf!W7bdDUCM$2uf>qp_SKeOKAUicnx;ux}_m=e!R%;%fYJGUF z`_Z}H+b7yCFPPRx%w1F(Z%Vnn(|L=qO*Kq!8KS#)q?RF=q6v5Wa zu5GXaWB2(74%>h0vi$?MFFx@4`olmQkbQ&wUu5|gWZ$CgKZ)5&fdT(y6O~rWwC2A^ zz(-tw$!=zVv6|?JHPRUM6nZ@cMnKDqcuW;KPmmjrjq{K5+3EP@$DjQc@c+rSkH9|v z{8Kx)yB+2ikrIXQ|B}>He0C18ppcHia)<;Ty$rNX_yVdxL={MALJ3_cWr`};k}96G zN>rgPM{YVo{`(XDxAI>I{)p+2^j8aRp)6O)g(@jmEn#V)L&T@3IK(OzPDw*o5>XX| zJZWhbADza-q;j$8aPy9w7so706kyY;XnD1w(pCkfTg~WIlY5k9`;~;AsgrNIXc+e zJ3Bf7=3QOjyA3G&!4nW15EKv);Nt4+>gw#}=>_~J(BD5MGAt`Sfl5S+IaFOaOcC%G zdn%!01yedVXSJ7yW%I_8@v7WuEoxFzFs?0_G#1X+l}=Zqr>pT3)r4^iezt+S*eO^Y zR;?dwytvZ6ak6jYc-QsSw%e;sw~tsh52`okRd){S?=0!>EgNnh*4$cD-8*W$f4uJD zTKkjB{ZBTA?w;wovQ&3&Qn@}Xg)Z>P{p=IntmD9cw(xI7{?Fon9c`+bGHyZ|0YfJ0 z;1>SD1$xa4cmg{0)J{FHAZkMe9>Kq$M9<0BQZv-lbR{`WMo1Q7;yFdp?80bT-d>|vv=lf`{_G-_e1#d?uYQ@ z{hzzo!5@b;F{lwi{)?=D{Q~~&Vg3_Y{)%?|#ESno=WVdw2>7QX_=h?tLUo|#4`UAi zeh#=m6CDijPp=~*E>K58wqR740#RN6nOa(@mV&G& z>^Z)R|G;{5TCHBIGibHIg0xjSb)^<`Aml=|l&h7nxA0HZ@<>_^9`+42-~wd@<+yAy zE)!B(3Mw9#8C9CGmsSubC1wNLZ;;~Km57qbfS!(+jyj+r_bPGw<(N)sVTS~@Ph3>b z%d6+-R&#TV>})+NPtPjSFiNB>yod(}0(dMcEjtbRVgYV0ZqOIA-M!o1#>T|~H23U5 z!Oy{dmy?5SpkIKOr>7&x(I771>gC}HCuu>Iw5T9MDrE6lRIQxdsHPw3SDomWtq+!8 z8LQl!QXQ@<8&l;Uv|wiq#k2b2aaGPibMay|W)4XTFblQ#rDn=uUOs~?_fetcp7v#TS|Hz%K7 z9=dhB_0pUUy2ED&gvcPoe$I(*?&?qV9{~Tm;E$Am>S)ji9ygOmO_cZW&j1S4Z2DqP-dNI58yw>35q|T#yTPM z&qM>6Vg`7#n2iYjw_Je1XraH4e`W*qC;XF1WG`@t!m%hcnq>@)&C6ojHRnkh3 zH-QbuzsG;bdjb52={0)j1|V|~GL2lQE$8VZY`ut~<5P4zqK=KzF|k@&i4rzk$^ifA zd~7na1O#2;$T>08yl6QoyINSp$+OOy()ZPRcWsh+gn-ETZsmF z{7yNhMN(KV$gAP!TlhsaaIsWEQI)e~QocmUBVciHQE^`G-X3o55c%!)*x1_avb6#2 zvR!bj3Qo4UIK#b8phF&je-Jx%b8`z042s+v5gQW$BOW3FQwk?$DtK*X(UCzNEGJ$Z zsn{5;xH+R;2EuPBp3)T^G@%ax;7!Fd+WdKa!D2OTxq*0cAN^!I<77AY+^F*M!TPH! 
zT~|*JUOPK_>-@;=_1>GuTkotkfpGer)rJSh8y>AzKRs3V%w)uZ`b6-MO(?eQ~Pl?2!1(0siS;?#UkR@gCu^ZsAt{m%G>suJyh zgQKWpCd!D32Jye&2o#9XV*vgS!Ji8JA5`y}6lDz+1gQU;cqk(~SIfv!lG0$zPl%4= z7Dlm9k@UO>Qg%pb>K6WEyi&tlV-ft@2K|rXA2NuyPXP20zW&&Li~l%p|E=2>zxDe1 z1BCw|_0Nx^owgzPcSOJ+@A7GxkwUh>k}tx4n2ii(BaPKe{Rd+CdXSC#&W|DfZ~JijXPB@Fz?;GiR1wu7-ZDD-HwQ2d1Wuh;7g z2A$CWPN34O6gs6$S0OUU_$CR;VceID-Z)a;`1GgJ&ZEYMJY;EB&*uBHi(catJ z6U^S;-pR=c93dn$C^inx+N5Ahi|Zd$TX^A9{Xg?!lFj zYbV;)XLYAXWoL#&X9i#vj&-b$x6&(IhAGf?!C}~FZstNKV!oa_S4Wwxr2+*47Z`yj zz)0;k(E9Y$UdVsV)-C|a+690t81BO8x%ql_9+;Y*B_lxmkL49ba!?U0R0JhAn2-@r zlH!{i>ya7dkr3h>>1P+@vD?>er>FB*z<=BvAesH;IR9%ue?R`a0DW@U?h5||kUih| zu@mh7e(_sG0^;-kbp7%_J$HQI5BgX3pF}u*66FZ^-xj;I{DRE?7$Bt6p#MdyW6MWfAIy#UVZ84?d%^L8l9V*jLyj@ z&Ces16jKRU7L^1FCwu@u55Q073&EIT3E-dmbNpBQ6aIC2Z~?7BuQlpaMvW2_`1K&# zDdPeD%_6EvKr-_PW-fw%9ivD?FQ}sA$O&muY?7cjft4T6DM(NdbId$AVnymy(E$8| zM(&V-IjEx!YRiUnIOO7?qOh+LtY}D$9@b)q;1XgLx<`rYtRl6k=#46lSz8WMsBmZ_ zJuTVY)fu=xs9C!?IXgQbZUB>_5cc8A9-CdxPWJwOK0aQa@H)7;xOjVc1o->J$3=oX zHJOB?Q}C5yhEmQ^%4vgb^3{QgjWHztulJWvR+o-wQBVS1tS()&U=JFL<}JlXn#+#1 zQC8dOr@EMHFy7F|zBnSeHm|v}Qh)nY&+YZm`xho2UYdNgG4bfq=)H6OU{5X%y}UU5 z{L{L}J7N!dZzH2?f~uT0nr3w4R|vk&&z?c=(`!x`i~KL;@X8qR<1 z=l_@Zhvpa3|I(9*CMu^X~*YK`~<1bpoECH{8Uie%)%EGP%vmZ5zhjJA2wCM2mD*X&k%@^;4hW& z%H;9Tyeu?e605?HS}Ek2NmlFgcpa(Pwp3h3i@&@eGZUsE(Oj5UT~-eb8H{wWEcI^e#Y58=EVW_#Uakd z1e^(|x_z|z{@VTr7e=34o_uxLa4EPV=A9jJwTNFq~epdypm*Z+-9ZXQ9W96!7nU$o}3StVlfS@Frg$hUTpA%G? z5m=by11T_qe}DU6PbB_B{_ElN<-dskzn=5{{k?Fu|Lot5|5p6tbRhCW1quW%fT*3+ zQE7EZsRtf{e=ztVy^cbuArMU1VmT_4o*t1EX6I?M{p(M@_yP`je1+U>a<+8~^axIj zNI<1S@Q3ww6a|B25X;zfDxX6a3g`kp;2+kzV91jrDd%qC-+Ici{MYdh%`d>eS+BO} zl*X!Z!xsLlCCnNzwMIm);S;KPr6xAUz%16$Q962omY%1f=2j9i1Vu@_f_Nn{yM~Wx z6qoktm>{GuSu32cm(0}(XRFyW7V3Wz%K~oC+GP5e?Lq zcUvkN;Q*4DSDc>#V{E=2E?#aB{OzIM>14mhihn>pWC3ojj?n!L4)OzEaCLD4{QG#j z!w^tfYC>*qItrCpT$oA3W;4j>DmiepLep0a|S-! z87s}?RoGkli~EZ&A1u8B7kF^w$+gK_r#i1K)m@&^ zUVub+SaNbubi7}@(kngMEm``P__wA2*7y%CfGz_S@DHlLpa=w2pjs)mMpREm3!-hg0GU=tMgaewBfxiX4Qd^+%!I`#iZU7Ld$YqGeC)p5g& zXtxVKogHl8)4_J9hpR()NB~^?ba4d5Zzo?*H?Zivp}AQpsJ!%I6bKWg7i7lbG1+1s z352E(cge4eRlyM2u@0EY5ggqIlRW&je#ts4r|jphw$e_v)6euW&-OC`|JTNOH)cdP z56W*XsqP)OJUZ9)WMkyT^@*p~CLeB$KfE&ec=O=XTZ@lxEkC?-?AgQ9-#lG^@#yrM zXKQbspZfOc@oyfkym@^5+sCUHL~!ngu1PM#Tobztf7e)(#z>}V&r z0Dqy0wa~~o*Z?IUDwKdGEmUj#2mBA17=2suf4>Rof?2x&K!IQi2)Pa^fMf9}11Dd@ z$OinE;}anZU=>DC^FoMOfq?&_WWStP@8oc|*dV7+pFRFSfwrCiaDn$(&`%|uzeo6g z_Kx@c!pZhiC%dhZ>F!Su{{J}JGE*s5S)9eph|JyV=xhJgZaD3W;NQk^ zw*#DN+8Yp=0a9;y`NR?|9gk-c2|)fiObV9;3ZG1n?B|O(LJ3zW;Y(!izz@M+0cVYr z8YQ>@9CxfzLH@gy`ho*!A^$a-4AllrwYJh+B{f$Ht7V)TDWgtItre8j@bGnfLY;t6 z%fp%3B|2Jxnw$-$B4;bgvecAZP-CkTVCwiq`((HwBXg>jw@}AlY80-t@K^V7R$CY= zO|+x+wB%*(wm7RKKZ(OA z1qx)TpdD;0zdEit)5Blymz?eaZ7bHPUh&!>NLL{)aH@-Qu7`iVp9>?+fdA{0f;$Ig z_m|WUS4~gW8lPP1eR_T3#m(7gH)bAPoqDh_`S9Az(;ExV?kqofaN^mc(=VQ!{qE() z53jF$_szxcUY>pZ?DXrWC%=Do`o*nJy)%vjP%m#=h6m-Y$X z;vY$Y{{jCDz(34F_rM?!R6&tOFw6oWPe4l*sR62>Vzf2>Lkdtu&9J%vrzq0O|8bc? 
zC8|3aA_^AAC`0fM@t@70a9A`B zX!`Rx0x?e{<%`STfnOq5$SYy6Ujs`o3WXXuZu%bnq5cEG--`blU4^AeT3soqk#pvRX}^gh<9R@SLz=4u0VrJi=Afpw@sINek+(xmRTD61<3w4$uE zy#e3?kpb=@J}%yFc3!S_-foWGZVsLmJtoC14gx-(;j<%mV*nZ-)=M(2$+d%xm@v9FY_}g#)jpO#;xPFDye|&et<%^%y zf2{EzT}1{Lz-g%jJ)-$ZvXIHu{|f&!Xn|R9I5j2@Mu1Q;u7S=wY`@yM>nj^L;pb-W z9_SgGxHqXVBOl5>6fBxrilN{@;hYGYx?C2W3&Ms12>yJyUGx*}0q~bsDCHG!-$f+{ zWs?ec2-FG{ta@p;nqN8tfZuEc{MVVZb$Ug$%8GwZy^PsZPHUD@8bzc!cm=q)Y7VxV zgQ?+SYIx{cKDv&NZ4{R^O9_pl(l#j$YW@>7ytyXv;TFm2KH>3wyc7G_CtB&J+v)3F z%yV7rvz@H*EyU z>k{DO>f?^M0DSs;xd8GLqQm0$28RXu1p9i02l_^b2c;!OrN-|~jtk34k0Zb`OHq1p zekz?%43aFML@u5?r5@-kGbo zzo>e8-28IA_4Vcb-`t#het+@#y+e;TXYOxIJh(jmXk+r(b;x-ap5I@3`RMrf-(33r zt82f1v-#IQ-u=rTZvFB5YrlJO?)9S+kFU?&IXigcSo@WQnhO&e*hF3%l>`3)Ih57j z^5gsEM>|BnhW}v;bI{D_w=j@FP!q!c0sqhifGG&*0=(ltkOBk$0aH@a0srFCcvjI~ zYF;QYCls3j8}WfzF+M2~9x*}AR{UEjkTnYi{9B0-eEDb2^ZrkMj{i>__H1*q`NRpt ze|CK2@Z|^gR{VeV8<#JC>B6(o295Es~DLGTJts(}Sr(_t9< zu_guZA|Octt(HtS6R}m;Jbpn;ajbi|``30~?b`kI9^2iHuJ+!+J`u@LDMgvN#G-sM z8bv}Eld%|D8Ic3^9|m0r`&t5y7=+!Vg7R{)w4z+5kja%`fd6-2lrZMDHT0=B=nVz~ z68SAAusV~z&Y-GR%d3@=nhIXMoYg3!H%drg^jL-xaioB;Fyqrm=M3c$a~=w;BFro=$)Auofxw>H7cw$C7KlL+eFQ{Ghx2e zS3YOLt`C%-0ZoPh$>k~K)mioRInDK1)rHaW3xk5I%geuibMuei-TCo{`+xcK z!$1E1)*rsT^6KI7M^~n9o$k55+;)DrhV3U?Xd; zfd(T$)6fE_rH)rqKIW1}KzxnZQN5Ns;Meej4Mt9SkKaE2 z{^5Y{Tl`zW51$n<;8q2*o-ptL*f$uBNaVNTzt*I$)dT)3YLt>Xg`lC54ft=8k^%p9 zA_V_+Fy1M`L#o>#EUo8bYB|NVoT64K4rqT@C2mkpoo@t*1JQE3_*5tVY>((-pX5@X zf{m?#V9cu`b2o4%J+%W~8=4%f*+ZL}W(zYW-L93SSh*ViR6)VC-lJ|ik9(#J8>!zMP+HPqcc z+SffZIy^fTis5NhxEw`(uD4e|l%}@x5i> zKfil<`G+?*e*FI4pZ@UhZ-066r$65L;q~QL502kDJAC!nzH3JsHxAWYoHw7JHJ%xR zP8%M+E=%^jR2zfGM~d$~bZ$fDE(HLvS+242md!$A7z;+@?n4P~bEM z)PJnWFYteB@JHf5Hjaaeq~?T`!60Z#04mWhJH{&o76Aer!@TYNU3d97@9=Q=!p$D( z0s#II7kF>}=l$=$7XMxLeCh(3uMP4F*xP(;x8uV-Uu@xj`!C_&^^<6~Pov$ogB6sQ z70QVv%B?IwN3|A$ww?g`I~Ulhi(2bK6k08XVj-8RaVSwy0yfDj%6o_1*HHZ2!oQ<; zkau_@NS&ua{+mrG1pEUFg8C@+~Q_2 z7Qw$1J7l2G)bauUhnhgfUv|1jw$>*M*=XwO|J>oNc zmB)Kj%Uzm79lD7Ig@IQ_%!w;W4lj%k&5I4riwn(*4+BSt4|Goma!ZTwE=me5P6;hc z49br5%m{Z)33AL1bIXtP!XyTj#QW#(^)8MLE{qC+`yuq?a6K+-AG@%PmO3UcoK_dl z88JuO=%)`zE{s(|@(bcT@C2M509g{&g>hJ2mtUPz-9D;+aJu^Gh31#ndf(ih_~HKH zKfO5hm)C3H0#9#1S9j{swV5ZIbI)%ce0KNnlehT4`2FkaKYoArZ-09HUw?h_=O6EV z|LW53o}LE$Up~@&6^_U(!9hLq*=haxDZ{DZs$;#fBVA&o1-y@csF^e0z?`jv|7b|? 
z2mFsA@gD}^U>yb&;ojk&)?=V|ZB4-e|8K0o*TWzj4+YEzn7JHcV2&t2_c{u!b1j&`2{lmA6~-xuEh{&W0qv)}oV z?U(;)_xW!efdBt<{C^Ms`BDN(Ml7y`EWk>EXhahYYN3>B2H0B{c*}*hTwn|Ta1NkI zQj&yA@s9S{X}<&TZ)3C5(Z$X)z%w*1B0et#@DC|4ii|Fy;sF079-S&+0sfgHK1(d( zN+djqL?n?(wkXg);a>~**XaTMCin}2KY+is#?ojpHkq`IdS$b&qE#blQSw?U7;P2Q zb_Km%!Dy9JTVIOg5Zfv)ZI=-asA*#s-k6a!qNfk*V5WtC1kM%huRK4j z+n6+OOc<|Co3G7)nXXS8Z%k{iPpGfJa_*S^?1<^ikY%OUG*%}w^YBt)J`bHv&Wgdq zd9>v4;)Kw`_z-+*xS%*mRFVWk7Zqh`3R1cppIBZRCn<^MW`&Rwyl5#tg~5ALUAJd> z+Hf=CBw6w1g4jt(=>c}updfS6gj;Q)o!-w|>la^|P;E@9VHf!7EDSK4?!am=NTVDv z-&?7BbgKEudh7GcoiDHVy}mv6?frw_JwEy)g8%g&U!H#RVELQ7hhN-21o(&A&NmN^ zym+|s>dC1eUS0m~8%TSv{`vR!|DQiU`maCV|M9!4-`rn)wmEm>WY?tyB=}!CXgN1y zI5Vn-y8u9e5FzM&{KYmNNI(GoK?MTAe+^}#nmT4d*5Tjc-$eT<{?qmwXq{Tf0;sSH z3?)!_0-zCW;GbLfv|Hkq2{|){@A_pZS7gdlkRX~Agcs;EQ?q8Vb&=K2e zih&nwJp}L40ca4w7MPY$1RD2*H2?U3-Hy9xJAi<;NPL3bt>uYm9!4{te|vOP&(wKt_m`Mzhevh0~&g_95<+; zPFc8%&C=Bl<@q7=h0*G@LEYuanvEG)iK@RkU%Ppz;l`l`a^v`RNckN zhSfgfShc88N~)rwc!deD_fN}=pk{=VQbWmUq2(nB_3V5zHA`2PTuskx=I8B`pjssb z4cyEIdU`!Qy^fYzM@!R{CSU?>8F2yX-1rGb?zJY-Aw}_YW&W{7%B3Oc`Qh@7N!8^^ z;QabKhb{M(tM4AM+&_xglN0q%*YRX!Z4@l~<3C{qFf1@@0{?$_VGs^VUYIkkPw4>v>l0ewKPLwi@B{$)2mH@BvLN_F<=+pyV;ktU+uQLgPhkFl|E-u0_y+?T|IU8V5#Wc`-++99Ke&6FmH#-} zA{5AO=Z8PV|Nn6L{69RtMowS^*=-AVLiqm{`FYEK^2DSq{8KPhR0RJzdYO^Fg?}d5 z!u;7QPzQ-HoxX*Cg08H%9Fs=O2uutDwR0=~aex}2pL=M0L=q}J3y*^Q7vu;raN+JJ z{8RXB1pfj)XghO3!CwL=ynX@ykpF@U0RAEV1OBbSA3mEbhE}tt)u?RK%iFc$b`=-S zY4xa>h-o>!8g{RWxnD`^svvb&k`7c+M)VN#nWF~av7*DxvZZF(a%;t*2FbyC@m#&| z_EK8^>BNFH{5m z!Qd_i;Q!R1@_3&d@?YRTR`N4l11>;^=GVBHhTz|XfFEqo!Wb|!VG*FmK;5sW!A0!9 z$M=GDYx4q^6{~rFSMHN&`6|Gc5$Lrt(65~Dmv$j|e69zLt+Sq2%?lY;{4HaDm z>3$vDQWOm6x&1m$zn(jw;|_x9Iin{2qy_#DN)NYI9_>&ZZk0`$dBYmogn>0_WKS8` zhw3HAyVYw0hBE`Yvm?ffQ`J`v*56zP5zdzDi}mmjY#wUdTxhyF-?%nxIMgB?GqFbX zj3E`JUr{!wB#vsx^JeC9gWzbRV5ycqr^n7{v9mhdtiEhqT{@&FnbeV{bmTb;W4@Yx zxPf)DOS0N7TBv1pi;HSXGT%Pd@MyjB#m3<4o73OkJM_bol^-2MLb#?$*JAKW^6^WxOq3uD)hcU)VB zdtnIvPmimC{KHX<)dLkE9}W126ksd)Wz5uC@lQwaZ=#Qw0Q`($Gn7D?R{jI{hx!lX zzpY3Kh}eLf20#=+K?@7C-~j&>)bw&v3coZC7U9dXLon$9s3f2CX!nFLr^q1t5N{hu ze!U#N^st8ucb__K#e934ZFV*v+wc15oxO7e`0~CBy!%@R_=CGXev5ybt^D`j!hbp7 zpN6TTm#P_C_-7K~D1g}-|KH*t0e>B61c4z>0MS6gR^qdvq?HzK=VlAdUwivKQ2+4? z^omFXMX0y+p8`?|)PIU8I4FVQ7;xkZ?)=h7pxe)ZYo}Z`SHRW!SuI-WRIhZZ zX6u9#X7-SlI;f^h7}?Vn-n5B(xL$I!xni|bxz?|{IBLE!ReNcw`s%^@&BJg)xAxlM z#+%D6x0YLPEw$ZTY`rvD16lBqb~*II<}GZ1{oz{nN+Wl*iGQMryIjLKWW*gdV~^Aj z7L28%l|{qyqCt7deo^6pa?ETs>tG%Ga5HzMU2t(kwK-$CG^$^!6`V|S7+P4z0mRIa`*49_J0cn&&{## zZqNPjVChfKkN?-VXaCptm;d(W((j+Hy}7^o)<w7DcZ`e!wUhG>L{x!V!yb#LS_&#mn&esc$w8>W5A2!QZsL{jy|i$O{h?F zhLVG(lF_Q10eR+tEU$x`T~E$xrss75JCb6i%rMfRx;EQ*b8+9TrH=EX)l(|!x&7MP zGY!`!&Br=Ht4eiu!SrC!^59tAgVp*cr<$Lyx7}N*d2*`h*;?zX^Bvz@*#G-m z^@qE&e|miQFE3XA^3BP=y;=KT-*5c&+be&0v+>=NwO98~KD&MF!S$tkR}VkFwRHc+ z%-w6VPi{eJ^ys~d^Y<@Ie{MJ8GeiSRnE@qSy4iV(wfd2#e2O&g=|BwP#P|~DjNg#*FKt-2jLy62kD;kFXog)2hVfY93zkFPG zdOCpw3P=Eb2D()aySLfz2KN53?T!!MvyW_dyfyeGdjR=c+h)HDz>j=_!7thE+A{dE z)oliD^ z-->@)Ex;bY|IQHnml>#l|4b|Xy&P=pcG^4Gz~&byUj-*bfQ&fE$fgrYG6?9bGHezJ zjhthl;R>k)G?fSqbQ=>dkQ#n*qFEO)4;YxvVOJOuwn)=V}3 zV1wvrd*#U<&B-3siT%nG`xVFcSDfgPL5Xj^Ih@lP*~ z|Mm5mKfhZ0)2s8pe{tc5XXjo&IQ8o8@t1d2pWiF|I+rB(QTjE+V{ujoSD)z%-pm| z+N5DT5GgD$_1~FJ*%N80W*|H^Bwq+S4OO{EtWM&MfX_FLY&Urq**WYS$Ix}bB z%=4bL_R>bZ}9Ju z=zBkf(ElD;aQ?xwo<9ii|26-TGUN{!^80HA1GRYrdK3)s?3E~+WI{4I$})RO(;P*Kc)jhEgtrNU z&AC2$nwvJmwIan`nfO3U^3>6MO#<+r-mRjK8VW`X!X6C=(yP5FyGu#I&yj<*$s_ZTJpX4x>Bi0egN)f}seWRN82^P}~75jC^*pIW+KISuPD{u}-w{!s!dLh)NB z(FK{8L5xHEWfpD%R-?L6u@oTuHbP)x5Rlu65X{28cEIz>{`(j9UO%zz^fw!c;{UH! 
zeEr2@!uhXVNc1gY3gSP4|L>dykpHjoe+8qEBfXFU`j!6x&+H%{N{A0OLjE`Khd>KK z5xJ%N68!ss{{!nbf3|GplBLTRWAfX&uhwrryywtW%sjt(`u@#RcW<1zkJ-vMFM8a$ z?0FY^8Ho-MPY+i=U$0;!g2DpBB0_)VA6LNa`xoY%q-JCy^-rdg0Y8I7qwyHzJZ6@V zl`Uar$(ZRfW@-^5NluSf(qiP4NcjH-QD$XMtQuAQtSIDmDw)_=onR5CHDWOYC%Tp! zjhdhOTvT>N8Tc5OmSB;@0sc;9IzDZRwC?h3@Q=^onw*LHylIng(kL9Q%O9;Tm@rAF zEJfo+(P+JJ(2(D!7vL21Rpktz3Aa3}3m1nl+L{;IoEvE3dN*->%p89+C#abp+9G%^~SPi>fj_Lhb z(^lb|VfC9a?RO*U@5a<`Cw0#U%b)aDyc#pSnXTMGD7=?Wn=}4P$pt+>0LENn4 z_ZSOXD|4)BdYhJMDoU-%MGFjG2@$+t-!XP!pk2rZ&F{r1euMb=FaPi7(KjC4B{a_5-WwF9mf_uW3T`~0EJD1P3(c_sMY2>53w|A2p| zWB&`t|IRPpUj+E_FY^!o5B_KUzaM5Ocw_~6k%Gbhd$*6(NKS(Ua-Il;2Ciwox1+)T zwSDIgeYSc1(&dq~vr|JZEQRQOFb~75vkAbZQ=xB4Ckz>qS}GoVapEWCcA!%LL)^&C-lIeq0p`rEc)~@%6be#(bi9SUBYNI&EpMQPH(Zww_;DRfnWdBF!f|uqh*2_R5De+L z1IUr8$UTaTUS(R3Ji#Un??OIP7SUH6Ggy|;R~GM3#ty2}M=B{pTH2t7(FYw>m|+sc z>e-L#cu@vUR23_{G&=xqbFySlRiO2^w(Q+CbE zk@}aT#!-iE#G#w)tDfjA?=?y64Wgk|WlvL~wK~^a$*Wf}@e^|eLoZ8hQBhizS+%)g z;J=FDSMeMFUqqxm3I2 zu=@ysk9&jvOW(Mh+k5%gw&Qy@?EQN6*Iz6mls}fdtU&NP2mYTR`B$m`Km7v#PfY&7 zC;kbGK7#&PIsb#z*n*7_&_w;`Cp%XY`sd*1^Nwzq4|D1t=zr*++wnyH$1Oe3GdtLu z9P)Sghc5CW1wKggy%ptp-sk#Wg8wfTEnU2L@q&fR7NGwH&F}F4;Q!RU8zEJ}n~B#AafhC`JN?49Jd7GVrRa!&=xY0y7ImGqGscE6TxM_HPRcp4b0 z%^fxf2KBsgqY(U$m}H|(g_D+|$IV4g%+jfb{P9}eFw&X;Apv|uAYTF(xvW?^M^s3H>&)95r0Y81rcJ#Nfc zFw<7_xKsYbQ99#LJRej)AJV)YuYEmccs*ACa-?CVM?c=Fo3QJ~x-?^*6|YAOZ^!GP z@cSF`P}SVuAm}y-`Wr?4CUJLzaI~ZByNRa9{f7Ql)o^?Hba&NIOKH17*wrX$GYE_t zR->9(Q=Hje#cipem-E7nvgD4^EJJQMI^Qu_8vb8FA#(ok|4#m$SHQ3Q6Z$7N-G_q~ zFg$+65wqal!;kkb=pSMn^n(*Yw~u;U`R4xlz3~56j&DEp%_hwM*|u@%rrF7FaQ;s5 z|1a~u5&`e8{Nvyd@c+vHXY+rD7MQ=q|Me&z^4mG`7%d%B4 z`)h?`mXe9)lIiB+=_VSk(Oe6 zEx)Ih-&e=)tLKk4i+daLN312^4;$Z%S*Q9Mx-F#>j_T>28t~s*!?WmE7A@1PqSh6q zw^T6NHS|(ukiHCfb5#)e>>X!lHctEmyU1P_T{1_i|5auJ8$vaRU1|l z{9h;dhyFi(@1|4#pL4x=!R-7eL{$KFrlQdW{dNYZeUDp(jKnARxD(6XYT ze*k}dULy1l{6DfiP!u*rR!;>5I^QggMdqiim;fFI@q@Gv4h4P-i6BJ57Ew&IIM!B@ zI;f-fs41<*X^sk-LqqRSW_6ZP`t*5y`T|Eanr91o^@xb`hU)W%8Ui6}oG2?hypXlt3*nszpiGupfbcB@qJqmrw%mb0j~J z{2|t%{-S^W%l~5+7-0$S(EqdfpR2LnXTt6uMf0QgrCsh9cHcX@^Y$rXHT=1QUtt6& z{Qpjbz_ar|ox?xZt|0P1fFGa#WAU@(HyG9}WaD;Lj-pMTQ-V-+Hnqa^y>P4Rz_ z5#XL3zbg95xl z0=+^)yn;h~LqhQujAf6p@z6h*_?AJ+%w&*B90rZgWC)mW{xo3@MU+F9a!4{xRuLx? 
zJJ6N9%*uRn1vf=Ok7?asQ3M;R6Aaho4%hK}^*OzeOx5iEN;*pE@Xzd4q;@M(I*StRMM*Yk ze7i8Fi5ps#?O8+e#*l~Wx> z5I-X=(vg-z%r+Uc%AWM7$J&cWTk#XI-N5eDGi_Cj?pp4vKJ}1U)TU$io2A38@}ZWJ zu};<7sis%MHIDjRdo8!y!0Xm?hZ}{%CaK+2*lSgcbgC!2t0wK06P;>HCC?zIG?t{- z7be$;Vs!ZtU6o9OD6xbbT*-|!6=o`E!6lSH7$JKI_zC`z|3Uu?rt%{HN%TKr5N?Vm zf?vFU&CUXK!7iBWFf81;9^-u;OR;^gA8^I|w{v@Lp4@Ta(B|WN(fPH0-&V~3T(xz> ziY*ucy=KuDs}_8=eC}E-0$cKjRX7F6VJ`V()nbGL`26@E+RDWrFQ2^vh>HOJpV0p( zfkqd=hfe*oWyL2u*38-a*}VO~W41gp+bR&hujMokKTHZQ8eG{;GwaF8IU3rSsQ*xo+pteMfH~ z|9_p(|1Q^0-MxA89{9g~!Oi6w`1kd=@9*sv?1zqc&yXO`;6OYCpfBuEWK>LSe0)5% zUS!Y+KAMin7cy+zUw!LPW&LHhCuWe``^4vpB9>k=`v&ZLIVqIkBpFSizVvZ?rLY zq#lpOs2ZXVYN$hM@}MfaSBZYc6uUgxAx~*3h^$9BWu|*sqKhumQ4G!nES+1@QiT#@@KuOm;KsTLsj37=x4gs(>+>@w|d;C!Bms+E(QABdrf(0U7xhc z>~-wUDry%|z)6|33MX46ZIyIeP0o{U)kwRttwA=`tA8_X9c?e`)aTl&a(WH?zJ|Qc zI=&sxUJd#ERtTr^CxZ=RT{T7(zgCi3Crzv`Ow{s2tMbCTE7_=eQji}uNHf&DXfR() z4KAVt&BBk!H$Xxn`hPJ69VX27#|XsPLC{3n6XM6Y4m&mEMso0t#K3FOz86CtobkSK z^!|l!t{>lVX8)H*_iX%T`}#d!e})wR+rC)2<+Ej*RxkQu#Vr3z=Yao}OTqsiRxidH zi%(W9`efxI@ca*K`Qne4Eg=kFX8Hek#lnwQF8Byf0Kesb&u8<#LD~Pt`LF|DEP!2) z_r0Cu=aS-oFO6{i9&r8y`TxbglYD&QAQ1e+|C54TGXm};cwRzy;34{Te!cFFw3 zbC<1JvSHiCy{8TyzjYD$f8>9T-?@g~@4x0Bvg!p1wRv$yaWcwya0QqO;?3gt4tbibB&}7PV9JfG=RT_AK5CIBc9)|f zG`&NC7va>l608cMbd`~ti&8sEDRwoZOU>-mvU_UtMv(%t7EU%xrZj5qL+I~uRe8LwdtsVNRcnjIHGDHY~-2>#XGrQ}gP zf6^p++FtU)q58JJ^7W|U)u`d8r_Db;vHbYh@)BhXJqT~hCb}@;s%)$sqfVfLihCP# zhs^oot>Vrqy1j~KtE9GTC_^T}jIE?i$84`+O?8z+-gnkXdrZY|C!4<=uD2V6wkm#~ z5rUuF*&wjh7g(!u?F}+}W6_koX3|#GP)yR~#?%P0aU-gX6=W(*ZBdX*NI@!gxIvN% z@?kgvf5E?V5eB;8kpjXIfbv1*GmZenPbRSl=0=>~2MYxt|MgIpucwz|E7p zE*#zp{`YRf7QD4vH?4&8|7z`$4cG#+982Nnti~?XC7-NV^2th^g2kV}RxJ8>#iEb? zp)Fhd(b8F4hCdAbgTJ-#qZRYv|3CQK{No7Bzm?#BCmEwN10Q4n{tz#!)8~gcJ--uv zKV}#VKk=f5c#wnKNdaz|0WOJNS0CQRc>nE3b}d`K9L|5`+T~yE+PweFkrTHsoxBV3 zuN}K}<=E|OC+=RyL)b-kR6)4i@$Pj27V;WuFS^0&{7Dc zlm)@lARPsOL@AaYt+Ll0`-Zb)SaXub~ZSX~^x{6&Y>PM00+0vpAtsku|L6KW$Sy zx2s^01;YG!*Sy`L+4Ej~Ph(Mgjqq`g{>_ASpjp*n z5VzvBSD$CA6I!&~Mitwp7vs;2SXECtbqz%%H4iK7Bg;8qrOXhUnxV;wP%<9Yic^4n z2`%Ke{{O$`KV&*v;4uXkC7|9o0^lF@4^IB!{K0>O7gp_@4Y+g4NNVBts07kmi* zSI)x-L?Zv=)IY!V|NB3m2fK~UD5=mh!ERZApYZUrD?BR4M}y><22-K*dqwLnh(@A`Ve?!kP# z!9U93g2Tfj=t7K;q$PZzKm`4Imcnv?$$0aGDh$aAUKgjFTWsg|7HNtN>m&-(+Do(TY8uE#_QO%h#5->rS^%^;2zC4M_|+sG#xoNh z!pu_eKL+N(KTd%~I^HB2YZQz(3Z|O!r%Zy02Hvoq-CxD((=i9Db4Cq(*kCo=UQX^T zBO$GXtkL6^l9xT&Z-D7f2@>!{_8T4keFDPrL+J-4Hp4d>ropZl_3GuR|)*K&uNN}hM?`T{LOUdSZ8%hwZK}HXVLOH>%?u<`E|wAc3qymR@|n|o9WQh$+A^BQA*~cQeE#^GWkj2py%YVfs`pz8?% zS7UuI!T*DQzgs6ft{!$dx9|MnufhNBEo-)ITuDs+U%lvaz`p{;55x*Q@DK6{88fS7 zmVy7pAN`sA?c?yT@qafZ@SXhoF&{bIK0ciX!FeP2&_g^af$mxU zcay-s%W2nhJC5&MwP_`$0Kxx5|Li!rZ~vu}hi{xea`O`ShyOo*m#Brf>~RnLL;oWM zgestWzC;xeBoHQIp%*wd5hee`xU0;JEbz|+|4cgkKlG1?OLy|G$Y&Sx=|vo}DxY3k zlB?uo$eGE-^aLd$q7gG z8q(}mDH1*zjRlF;B8&l~8YQWnK)r%tK~Mnrb*v6Gqg6%k(6Ep^=`rN>8zloKDU#|# zCh=HvA&$UklX%i9n{Fu@M^S-Egd;Fv7R|IsW~|~#=o3@^XoFx7FKD&=;ReBoF@K~n zf3!(BW)|UZJ?m1v?A5**t^a{=^_ExT=5ME4o{gEN`fFc~H;vh~y_Vu$OVOB3Io+*( zHmH9&T0hmT9c)$fHy0uEIoc+7)aTn9^2gfc0~VsjyAU z>#mc|bX5&lR81N|yP@d&$#y&(wAK_fRdCy@^INq%Lt%D}B)vtQ(^D_4FQ!hmR@4`g z<@9g`Jxopw!GC^rL86=<3g*%3j!&l$dawV%zaRtHdxC$={{j5Sf1(N+yHL|Yt|tau ziTA$}<$WRe{z<>vN4>5calLfF1tSBG5dI(hfBnUBti6Ep!OlCtPk8=WTfX#@rL#)j zsptRV(+T;1`|13XPW?}apIQBbC$M#B0bca`&C5UDzGjZo{~w$U0U&o%Ux(fDz(F3v|!+yPM>B?a`f+m}-4&`~N6N@hH9K0zjn{GG%(AQoVTzU=i7Gf9Gt9t?M#({YQ6`#S47qV=Z?Tq;P|t?MKFq2h zx0O@70KbObR>A1dBF5y}^|?KD1-*^pexnpNVksQ8$VSbwiRQuyYau=-EwU+V;Y?fc zR11EHLm{5RDQGL4hG;Q~hmFFKCh@36I@wx0)2W<6Cf{EEyboo}4d3G3YRdd(s_FSq z9cmRIcp&(vx;0Pxt42FY9VQ8?)X=N)WT^JVgb7ov#vIzAj#f?nCKcOK 
z&Tc5qs*|Nv=Ea#+jJ8U?T97i`t~He~i^yTJ?2tN1M!hT@*hBd%7!Pp;;0x{o;tHSy zNGOPWu>!yg{3H0q{6mlr{vm--0)dUF$wAkk|D(Lmhdemxck9Ug^ZRa|-g$#q0lD?~ z-c9?qeYSnmDr`RbeD#7g%Lv&|sGX&R=Lh_Y7k{+me`q)n{~Z5w_kA%J{KF1>G4Gr8 zbKSEeJV+5<6iiYJ_hrBcsqfEv^d5eEab5(!+yabHFIs?GHlEDft~|VX{O-wZN4Kus zylVN{#Tb5z{QvqLUv56OZ|_B10T++oxr+SH=?AyYySrTRdT`U%{k{K({`VlVpuu~}ipOorCmqTsHpOIXAvF28RW#8enr<(eZc~iS9+q)L4Q(Y89g3;0(wXjxX9HD_ zd$n(;8lMg5hTG&*_Hx27!@RBNX?NK$R83p)xJ@}|Egb16MgC`^zj|h@@#%zVde|^E zR5NU^m>#SevXu_B%6lz^y%yP6hoZZoz+RWzS(D?axI)@6##BZ2mEuA+}Q~j0Zk-75dtFxL`D~UIF!Xr9D#>!r~L06 zb2+mcGas=3`ox|Mhj(EGFcJS^`4=|dVb$-dWuHRufMH<2Xwip%$8a?MTf<-cC;0z# z_qsXzKA(Hw3&P<1!M`Ut(wiFTOONtnMEWsd5l-?0*b(pT&u#%)C|YAMDdl$D{YxR& zkGY)KdSu)Buh*?wzkKz&l^B8b`Svfj9ND|)!imE-FCp?h^We_8*(&I(UamKM+;91M z-170bGlr$?2u;xsvEWg^uH(<&Gk^%<^VMQTt2El6rD%C<<- zG1bJtkF^wKw<&2Rh@WznP0O*M&R<3A)NyTq8q*+RK!nj3k zRq{|)!zRQUzRNX_`NOtP5zURK)=yHtLOdO`bEE+)jyx^SvQy9 ze*ynIlleajxV-tK=J*!?2#QoQ0Y5ULQH`P$czdH5 zp>bRdKeo9zt65HNlx0BPL*$rCDb{j&M-``|ircDXx9d1v@cgxT4uhb#LD*X_#3#YO zrDPC~M{P(&swQj|s4jclt;4evoc2?D>EjOhR7=s6MK*0Id~7M2hFiunfkiylgipzo zRW{Zn8aIok+lrqLXvS&i z g&bhSL4Xn8u-F*V-yc*-_0YMB@|P4t_djkb<;8|)@IYJs{<(oREOXDtE&c88wT ztYx6=f3ic3D8O9KvTE6FHN0L+iMdkHq!o?Y>ISXaCMDY-Ows2?)^I~K?4VLsu(^y{ zk(XG?P3W&HX)J_i5S8;&gjvB-N)W33@dSn^U_uDdLkj7k5=xMW6qujs&rL@2mp3QT zn-S}t8R3%p@Mco*bt3ucbw1JWa!TO!D9;N)cTb}D?b6XL2e+@?v2o?spRd^R#i}nq zU$x=0WuL8Hv~ua3r3*h^JpaS@wq)VQ?``pdkKWt+AL7f>MW4L4<%@_+DaeO>T7vLl zF`@rg&gOrX&;LK`7XJHJ%Rbt%dd}W;1orzk61MM)d3!&f3;5yu{h2X;&A$_Vr}HP^ zM<9&nFyu%x1Kd)*?!rcdndp zy>-^(-evduS3F#=dAVQr_PF8W3A;JV|6O12`~JSz2oRB&fQ64y*y|pbfYN^|Ih#cz zbD6Y!F8n_c{7N|#wJcAO&k-?5#XLqKm!^^M%k!DwABmt6R$@g?LK!QzGB?FsOw;ib zRp@ZfiP5m5DszbVw^dH5%TKCBFK}+0r6{XaL8+IdTg&L3mE7i1T5}n#U7OQclMC`Y zs(JWguN4ro2ko%=Z%_>x|fZ&6^=AZhMR;vwVc6*+0cTykTiu1b?=X*Ka^-Q?y*+|cefp<^1T{&>) z{JygXx9r)x=Bsr}H>_Fm`I@Ec*DPMUa>0rvpDtVU$$R*TFTY{_x8MI&{(m>i|EGJQ zf7Z|W2F`zjlmB_M$_LK>FZq8D|8Ij|Z)%7)DabR!&o#yKc8m-B|EUM3_MP3kdC%t0 zzgoXx^B0@8Z`!hN`_AJB_Fp`O{O3v6TjwzE-t)l~FSlzx9*wPZ z6c7{@8JUzAm6#ZT2VfLHq@`0xSvd?Umq|t0zko%_V`P;}b1NkL0w$@5%P7s~Xk>!2 ze0B+kR3+n<<&jHyX%IiPqHI$!y*wvg!->Zc(B{Ne@#3L#tR>_I5z5-*^?3;04r*j5snFA=rsb%(5 z(fVsy{dK&NrUHE8EcDmnP~iC(C8qpAwA&zFMvB#3G~SNdHpO_CX0ls1*r^@use3Wg z3I4~&+GnQiW5duwElOajx9#8lIs8BPU$<)Enw9fcFQ50n&;RegTK4gd zRde>Nf#xAXV1ob87wla>e^&Vb{=eq`&yqjL3n4X%_b+c!pjW1^Yl_G1m^)WOuAFu~ zz5o0-#8~jJH(>bH=G|X!Ke%i6iG%wuo;-T%^2z%*&${1r@_!Zl`x5+Ld(XeW4>lq_ zjE{x>!9)m@K%ny-oiB7MnMp+!D2qkOK-*WTB)3e=D_~?6=d#Mhxn%|15W^ z{^h)^GJblUjA|@qX!yx$Zek@LLkknM9E`Q6TT3W4c?s3I@pZz~I$=tyl4g`-mGb?g_?_hA7Qinj zJ5^I%Wz#V5Z<9a99xkjUs?Qy$We?Re`}EYl8cJ_9dB8v)sHNE}GrM)<-fG6UNzhxx z=*6Ik8q8hc5b*1HgQ!@><3dZxNULJnp__2%P^I={sBwDG_+$(-{Cj_RJMr{M-|JT+ zPoH#8j<(K>wY{8iyqxZQIn^;>(>crvRQ2>);eGKgG)zI=}d-zT6~VTJ(KHzA1s%!XBK6aKrxD zQz&?TaQ>V77xv$~a_G*b0~d~MKXZ8N;XNQ_?WT3h)~{T+X8F8T%jP=yU$WpMr@{X( z`g8kRSHOD}1pTvoAzljpTK{~o98;hde(=?@Pqq{M&)d76kUcm8@A?1r2t)<2AcTGi zf#3UoqvA>p@)_4t2QTj5d1%{~U7PXvwPoM7od|wU9X@dJ z)R7xR{Cn2T<-Gg7OJ1&*z1?6}eLY~;{5)^^d%Fe&_=G(Sh>SojC`urpe=zeWBO{AU zLI|9iou160XO~LziwknZIb>x)j!K*>&t(;IXhl4F2_MRbUz$rP%_ZrJSdH?WGEP!u zUYeSh1Qn#^CpXI(ElOrhesUFRK_wZr!c=n!1#Z7Vn%Q2?Zc(w#N=BQ8+pdG!5q9be zZ2E#uRK?eeVfF?|w^0Vc-!m)w0Y79vXrHoc;q&q7wAmw2F=0oiN!b)~o1M4dN=d#-Q#y}XP!RpeLC%UJkg0$@a@x{ z*H61=Mw+{=6{Gg*F?$u>d0TWj9o1Z`mSs{=aSCvSSt@cGFJBLF1(cxRlljP8LX~8O$mt0q7L87rS&;5ujExDB)CO6WoEDa! 
[GIT binary patch data omitted]
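As a point of reference for the test_tile_mlp.py diff below: the refactored test validates the tile-based kernels against a plain NumPy forward pass of the form z = relu(W @ z + b). The following is a minimal NumPy sketch of that reference computation; the layer sizes, batch shape, and helper names here are illustrative assumptions, not code from the patch.

import numpy as np

rng = np.random.default_rng(45)

def layer(dim_in, dim_out):
    # same uniform initialization scheme as create_layer() in the test
    scale = 1.0 / np.sqrt(dim_in)
    w = rng.uniform(-scale, scale, (dim_out, dim_in))
    b = rng.uniform(-scale, scale, (dim_out, 1))
    return w, b

def forward(x, layers):
    # apply z = relu(W @ z + b) layer by layer
    z = x
    for w, b in layers:
        z = np.maximum(w @ z + b, 0.0)
    return z

layers = [layer(32, 32), layer(32, 32), layer(32, 3)]  # hidden sizes chosen for illustration
x = rng.uniform(-1.0, 1.0, (32, 1024))                 # (features, batch) layout, matching the test
out = forward(x, layers)                               # shape (3, 1024)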
diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 34f5ff60..ac661549 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -3,36 +3,15 @@ import warp.examples import warp.optim -import torch as tc +from warp.tests.unittest_utils import * import math import os -from PIL import Image - -#wp.config.mode = "debug" -#wp.config.verify_fp = True -#wp.config.verify_cuda = True - -wp.set_device("cuda:0") -wp.set_module_options({"fast_math": False}) - -#wp.clear_kernel_cache() - -rng = np.random.default_rng(45) - -def assert_equal(result: np.ndarray, expect: np.ndarray, tol=1.e-2): - if tol != 0.0: - # TODO: Get all tests working without the .flatten() - np.testing.assert_allclose(result.flatten(), expect.flatten(), rtol=tol, atol=1.e-2, equal_nan=True) - else: - # TODO: Get all tests working with strict=True - np.testing.assert_array_equal(result, expect) - - return True - + +# needs to be constant for the whole module +NUM_THREADS = 32 -def create_layer(dim_in, dim_hid, dtype=float): +def create_layer(rng, dim_in, dim_hid, dtype=float): w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1)) @@ -42,7 +21,7 @@ def create_layer(dim_in, dim_hid, dtype=float): return (weights, bias) -def create_array(dim_in, dim_hid, dtype=float): +def create_array(rng, dim_in, dim_hid, dtype=float): s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) a = wp.array(s, dtype=dtype, requires_grad=True) @@ -50,22 +29,22 @@ def create_array(dim_in, dim_hid, dtype=float): return a -NUM_FREQ = wp.constant(8) +def test_multi_layer_nn(test, device): -DIM_IN = wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy -DIM_HID = 32 -DIM_OUT = 3 + import torch as tc -NUM_THREADS = 32 + NUM_FREQ = wp.constant(8) -IMG_WIDTH = NUM_THREADS*16 -IMG_HEIGHT = NUM_THREADS*16 + DIM_IN = wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy + DIM_HID = 32 + DIM_OUT = 3 -BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) + IMG_WIDTH = NUM_THREADS*8 + IMG_HEIGHT = NUM_THREADS*8 -dtype = wp.float16 + BATCH_SIZE = min(512, int((IMG_WIDTH*IMG_HEIGHT)/8)) -def test_multi_layer_nn(): + dtype = wp.float16 @wp.func def relu(x: dtype): @@ -90,9 +69,6 @@ def compute(batches: wp.array(dtype=int), loss: wp.array1d(dtype=float), out: wp.array2d(dtype=float)): - # row, col = wp.tid() - # linear = row*IMG_WIDTH + col - linear = batches[wp.tid()] row = linear/IMG_WIDTH col = linear%IMG_WIDTH @@ -116,7 +92,7 @@ def compute(batches: wp.array(dtype=int), local[s*4 + 2] = dtype(wp.sin(y * scale)) local[s*4 + 3] = dtype(wp.cos(y * scale)) - # # write input back to array so that torch can use it + # write input back to array so
that torch can use it input[s*4 + 0, linear] = local[s*4 + 0] input[s*4 + 1, linear] = local[s*4 + 1] input[s*4 + 2, linear] = local[s*4 + 2] @@ -148,6 +124,7 @@ def compute(batches: wp.array(dtype=int), # untile back to SIMT output = wp.untile(o) + # compute error error = wp.vec3(float(output[0]) - reference[0,linear], float(output[1]) - reference[1,linear], @@ -162,20 +139,26 @@ def compute(batches: wp.array(dtype=int), out[i, linear] = float(output[i]) + rng = np.random.default_rng(45) - weights_0, bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) - weights_1, bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) - weights_2, bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype) - weights_3, bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype) + weights_0, bias_0 = create_layer(rng, DIM_IN, DIM_HID, dtype=dtype) + weights_1, bias_1 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) + weights_2, bias_2 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) + weights_3, bias_3 = create_layer(rng, DIM_HID, DIM_OUT, dtype=dtype) - input = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) - output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) + input = create_array(rng, IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) + output = create_array(rng, IMG_WIDTH*IMG_HEIGHT, DIM_OUT) - # # reference + # generate reference image + from PIL import Image reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") with Image.open(reference_path) as im: - reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0 - reference = wp.array(reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T, dtype=float) + reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) + reference_np = reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T + np.save(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), reference_np, allow_pickle=True) + + reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True)/255.0 + reference = wp.array(reference_np, dtype=float) loss = wp.zeros(1, dtype=float, requires_grad=True) @@ -186,20 +169,19 @@ def compute(batches: wp.array(dtype=int), optimizer_grads = [p.grad.flatten() for p in params] optimizer_inputs = [p.flatten() for p in params] - optimizer = warp.optim.Adam(optimizer_inputs, lr=0.001) + optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) - max_iters = 5000 - max_epochs = int(max_iters/num_batches) + max_epochs = 30 # create randomized batch indices batches = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) rng.shuffle(batches) batches = wp.array(batches) - with wp.ScopedTimer("Training"): + with wp.ScopedTimer("Training", active=False): - for i in range(max_epochs): + for epoch in range(max_epochs): for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): @@ -222,8 +204,10 @@ def compute(batches: wp.array(dtype=int), tape.backward(loss) - verify = False - if verify: + # check outputs + grads on the first few epoch only + # since this is a relatively slow operation + verify = True + if verify and epoch < 3: indices = batches[b:b+BATCH_SIZE].numpy() @@ -233,7 +217,7 @@ def compute(batches: wp.array(dtype=int), z_np = np.maximum(weights_3.numpy()@z_np + bias_3.numpy(), 0.0) # test numpy foward - assert_equal(output.numpy()[:,indices], z_np) + assert_np_equal(output.numpy()[:,indices], z_np, tol=1.e-2) # torch input_tc = tc.from_numpy(input.numpy()[:, indices]).requires_grad_(True) @@ -261,47 +245,42 @@ def 
compute(batches: wp.array(dtype=int), l_tc.backward() # test torch - assert_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices]) - assert_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy()) - assert_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy()) - assert_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy()) - assert_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy()) - assert_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy()) - assert_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy()) - assert_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy()) - assert_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy()) - - # cosine weighted decay - optimizer.lr = 0.5*0.01*(1.0 + math.cos(float(i)/float(max_iters)*math.pi)) - optimizer.step(optimizer_grads) + assert_np_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices], tol=1.e-2) + assert_np_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy(), tol=1.e-2) + optimizer.step(optimizer_grads) tape.zero() - print(f"Epoch: {i} Loss: {loss.numpy()}") - - - - predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) - predicted_image = (predicted_image * 255).astype(np.uint8) + #print(f"Epoch: {epoch} Loss: {loss.numpy()}") - predicted_image_pil = Image.fromarray(predicted_image) - predicted_image_pil.save("test_tile_mlp_wp.jpg") + # predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) + # predicted_image = (predicted_image * 255).astype(np.uint8) - return + # predicted_image_pil = Image.fromarray(predicted_image) + # predicted_image_pil.save("test_tile_mlp_wp.jpg") + # initial loss is ~0.061 + assert loss.numpy()[0] < 0.002 - # print(input) - # print(output) - # numpy - +def test_single_layer_nn(test, device): + import torch as tc + DIM_IN = 8 + DIM_HID = 32 + DIM_OUT = 16 - -def test_single_layer_nn(): + NUM_BLOCKS = 56 @wp.func def relu(x: float): @@ -325,40 +304,72 @@ def compute(input: wp.array2d(dtype=float), wp.tile_store(out, 0, i, o) - weights, bias = create_layer(DIM_IN, DIM_OUT, dtype=float) + with wp.ScopedDevice(device): + + rng = np.random.default_rng(45) + + # single layer weights, bias + weights, bias = create_layer(rng, DIM_IN, DIM_OUT, dtype=float) + + input = create_array(rng, NUM_THREADS*NUM_BLOCKS, DIM_IN) + output = create_array(rng, NUM_THREADS*NUM_BLOCKS, DIM_OUT) + + with wp.Tape() as tape: + wp.launch_tiled(compute, dim=[NUM_BLOCKS], inputs=[input, weights, bias, output], block_dim=NUM_THREADS) + + output.grad = wp.ones_like(output) + tape.backward() + + # numpy + output_np = np.maximum(weights.numpy()@input.numpy() + bias.numpy(), 0.0) + + # test numpy foward + assert_np_equal(output.numpy(), output_np, tol=1.e-2) + - input = create_array(NUM_THREADS*NUM_BLOCKS, DIM_IN) - output = 
create_array(NUM_THREADS*NUM_BLOCKS, DIM_OUT) + # torch + weights_tc = tc.from_numpy(weights.numpy()).requires_grad_(True) # use .numpy() to avoid any memory aliasing + input_tc = tc.from_numpy(input.numpy()).requires_grad_(True) + bias_tc = tc.from_numpy(bias.numpy()).requires_grad_(True) - with wp.Tape() as tape: - wp.launch_tiled(compute, dim=[NUM_BLOCKS], inputs=[input, weights, bias, output], block_dim=NUM_THREADS) + output_tc = tc.clamp(weights_tc@input_tc + bias_tc, min=0.0) + output_tc.backward(tc.ones_like(output_tc)) - output.grad = wp.ones_like(output) - tape.backward() + # test torch + assert_np_equal(output_tc.detach().numpy(), output.numpy(), tol=1.e-2) + assert_np_equal(input.grad.numpy(), input_tc.grad.detach().numpy(), tol=1.e-2) - # print(input) - # print(output) +class TestTileMLP(unittest.TestCase): + pass - # numpy - output_np = np.maximum(weights.numpy()@input.numpy() + bias.numpy(), 0.0) +test_devices = get_test_devices() - # test numpy foward - print(np.allclose(output.numpy(), output_np)) +try: + import torch + # check which Warp devices work with Torch + # CUDA devices may fail if Torch was not compiled with CUDA support + torch_compatible_devices = [] + torch_compatible_cuda_devices = [] - # torch - weights_tc = tc.from_numpy(weights.numpy()).requires_grad_(True) # use .numpy() to avoid any memory aliasing - input_tc = tc.from_numpy(input.numpy()).requires_grad_(True) - bias_tc = tc.from_numpy(bias.numpy()).requires_grad_(True) + for d in test_devices: + try: + t = torch.arange(10, device=wp.device_to_torch(d)) + t += 1 + torch_compatible_devices.append(d) + if d.is_cuda: + torch_compatible_cuda_devices.append(d) + except Exception as e: + print(f"Skipping Torch tests on device '{d}' due to exception: {e}") - output_tc = tc.clamp(weights_tc@input_tc + bias_tc, min=0.0) - output_tc.backward(tc.ones_like(output_tc)) + add_function_test(TestTileMLP, "test_single_layer_nn", test_single_layer_nn, check_output=False, devices=torch_compatible_cuda_devices) + add_function_test(TestTileMLP, "test_multi_layer_nn", test_multi_layer_nn, check_output=False, devices=torch_compatible_cuda_devices) - # test torch - print(np.allclose(output_tc.detach().numpy(), output.numpy())) - print(np.allclose(input.grad.numpy(), input_tc.grad.detach().numpy())) +except Exception as e: + print(f"Skipping Torch tests due to exception: {e}") -#test_single_layer_nn() -test_multi_layer_nn() \ No newline at end of file +if __name__ == "__main__": +# wp.clear_kernel_cache() + unittest.main(verbosity=2, failfast=True) From a3a5c63e688ed191f2356102b85f5f6ceacd99e7 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Tue, 15 Oct 2024 08:10:15 +0000 Subject: [PATCH 069/102] Disable reference image loading in MLP unit test --- warp/tests/test_tile_mlp.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index ac661549..693dffe3 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -149,13 +149,13 @@ def compute(batches: wp.array(dtype=int), input = create_array(rng, IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) output = create_array(rng, IMG_WIDTH*IMG_HEIGHT, DIM_OUT) - # generate reference image - from PIL import Image - reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") - with Image.open(reference_path) as im: - reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) - reference_np = reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T - 
np.save(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), reference_np, allow_pickle=True) + # # generate reference image + # from PIL import Image + # reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") + # with Image.open(reference_path) as im: + # reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) + # reference_np = reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T + # np.save(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), reference_np, allow_pickle=True) reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True)/255.0 reference = wp.array(reference_np, dtype=float) From fcc95c072c3e48d7f6df72d460c1909dbc4cf8c6 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Tue, 15 Oct 2024 13:00:44 -0700 Subject: [PATCH 070/102] Tile/Mathdx: simplifying matmul implementation using arrangement --- .gitlab/ci/mathdx-support.yml | 4 +-- warp/builtins.py | 66 +++++++++++++++-------------------- warp/context.py | 9 +++-- warp/native/mathdx.cpp | 5 +-- warp/native/tile.h | 11 +++--- warp/native/warp.cu | 6 ++-- warp/native/warp.h | 2 +- 7 files changed, 48 insertions(+), 55 deletions(-) diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index d7879267..3b78b4d5 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -36,7 +36,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/54/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/68/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -59,7 +59,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/54/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/68/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps diff --git a/warp/builtins.py b/warp/builtins.py index 1c9d5ecc..b34533f0 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5624,6 +5624,7 @@ def tile_matmul_generic_lto_dispatch_func( out.type.storage = "shared" template_args = [accumulate] + # Maps Python/Warp types to C++ types and enums def cublasdx_type_map(dtype): if dtype == float16: return ("wp::float16", 3, 0) @@ -5638,7 +5639,13 @@ def cublasdx_type_map(dtype): if dtype == vec2d: return ("wp::vec2d", 6, 1) raise RuntimeError("Unsupported input type in tile_matmul") - + + def cublasdx_arrangement_map(layout): + if layout == "colmajor": + return 0 # CUBLASDX_ARRANGEMENT_COL_MAJOR + if layout == "rowmajor": + return 1 # CUBLASDX_ARRANGEMENT_ROW_MAJOR + raise RuntimeError("Unsupported layout in tile_matmul") # generate the LTO M, K = a.type.M, a.type.N @@ -5646,28 +5653,20 @@ def cublasdx_type_map(dtype): num_threads = options["block_dim"] arch = options["output_arch"] - def make_function(M, N, K, 
adtype, bdtype, cdtype, tA, tB): + def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout): (a_dtype, a_prec, a_type) = cublasdx_type_map(adtype) (b_dtype, b_prec, b_type) = cublasdx_type_map(bdtype) (c_dtype, c_prec, c_type) = cublasdx_type_map(cdtype) + a_arrangement = cublasdx_arrangement_map(alayout) + b_arrangement = cublasdx_arrangement_map(blayout) + c_arrangement = cublasdx_arrangement_map(clayout) if (a_type != b_type or a_type != c_type): raise RuntimeError("time_matmul(A, B, C) requires all inputs to be real or complex") - element_type = a_type - # Warp follows Numpy: matrices are row-major - # But cuBLASDx follows BLAS: matrices are col-major - # So we have to flip M <-> N and A <-> B - def make_transpose(t): - if t == "N": - return 0 # CUBLASDX_TRANSPOSE_MODE_NON_TRANSPOSED - elif t == "T": - return 1 # CUBLASDX_TRANSPOSE_MODE_TRANSPOSED - raise RuntimeError("Invalid transpose mode") - - lto_symbol = f"dot_{M}_{N}_{K}_{tA}_{tB}_{a_prec}_{b_prec}_{c_prec}_{element_type}" + lto_symbol = f"dot_{M}_{N}_{K}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}" # early out if LTO for this combination already exists for this module if lto_symbol in builder.ltoirs: @@ -5683,15 +5682,16 @@ def make_transpose(t): include_dirs, get_mathdx_include_dirs(), arch, - N, M, + N, K, - b_prec, a_prec, + b_prec, c_prec, element_type, - make_transpose(tB), - make_transpose(tA), + a_arrangement, + b_arrangement, + c_arrangement, num_threads, ) if not result: @@ -5701,35 +5701,25 @@ def make_transpose(t): lto_code = f.read() builder.ltoirs[lto_symbol] = lto_code - builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({c_dtype}, {b_dtype}*, {a_dtype}*, {c_dtype}, {c_dtype}*);" + builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({c_dtype}, {a_dtype}*, {b_dtype}*, {c_dtype}, {c_dtype}*);" return lto_symbol, lto_code - def tile_layout_mode(tile): - if tile.layout == "rowmajor": - return "N" - if tile.layout == "colmajor": - return "T" - def tile_flip_layout(layout): - if layout == "N": - return "T" - elif layout == "T": - return "N" - - a_layout = tile_layout_mode(a.type) - b_layout = tile_layout_mode(b.type) - c_layout = tile_layout_mode(out.type) + if layout == "rowmajor": + return "colmajor" + elif layout == "colmajor": + return "rowmajor" # C += A * B - (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a_layout, b_layout) - # adjA += adjC * B^T + (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a.type.layout, b.type.layout, out.type.layout) + # adjA += adjC * B^T - Tranpose ~= flipped layout (fun_backward_A, lto_backward_A) = make_function( - M, K, N, out.type.dtype, b.type.dtype, a.type.dtype, c_layout, tile_flip_layout(b_layout) + M, K, N, out.type.dtype, b.type.dtype, a.type.dtype, out.type.layout, tile_flip_layout(b.type.layout), a.type.layout ) - # adjB += A^T * adjC + # adjB += A^T * adjC - Tranpose ~= flipped layout (fun_backward_B, lto_backward_B) = make_function( - K, N, M, a.type.dtype, out.type.dtype, b.type.dtype, tile_flip_layout(a_layout), c_layout + K, N, M, a.type.dtype, out.type.dtype, b.type.dtype, tile_flip_layout(a.type.layout), out.type.layout, b.type.layout ) return ( diff --git a/warp/context.py b/warp/context.py index 281a6009..65ddeebe 100644 --- a/warp/context.py +++ b/warp/context.py @@ -3398,10 +3398,13 @@ def __init__(self): ctypes.c_int, # M ctypes.c_int, # N ctypes.c_int, # K - ctypes.c_int, # precision + 
ctypes.c_int, # a_precision + ctypes.c_int, # b_precision + ctypes.c_int, # c_precision ctypes.c_int, # type - ctypes.c_int, # tA - ctypes.c_int, # tB + ctypes.c_int, # a_arrangement + ctypes.c_int, # b_arrangement + ctypes.c_int, # c_arrangement ctypes.c_int, # num threads ] self.core.cuda_compile_dot.restype = ctypes.c_bool diff --git a/warp/native/mathdx.cpp b/warp/native/mathdx.cpp index 75a83e3d..c540c873 100644 --- a/warp/native/mathdx.cpp +++ b/warp/native/mathdx.cpp @@ -45,8 +45,9 @@ WP_API bool cuda_compile_dot( int precision_B, int precision_C, int type, - int tA, - int tB, + int a_arrangement, + int b_arrangement, + int c_arrangement, int num_threads) { printf("CUDA is disabled and/or Warp was not compiled with MathDx support.\n"); diff --git a/warp/native/tile.h b/warp/native/tile.h index 8df8e202..a8c3534d 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -1296,14 +1296,13 @@ void adj_tile_extract(Tile& t, int i, int j, AdjTile& adj_t, int adj_i, int adj_ adj_t.adj_extract(i, j, adj_ret); } -// cuBLASDx follows the BLAS convention: matrices are col-major, so we swap A & B in the code below template TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, TileA& A, TileB& B, TileC& C) { using T = typename TileA::Type; WP_TILE_SYNC(); - fun_forward(T(1.0), B.data, A.data, T(Add), C.data); + fun_forward(T(1.0), A.data, B.data, T(Add), C.data); WP_TILE_SYNC(); return C; @@ -1317,8 +1316,8 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, using T = typename TileA::Type; WP_TILE_SYNC(); - fun_backward_A(T(1.0), B.data, adj_C.data, T(1.0), adj_A.data); - fun_backward_B(T(1.0), adj_C.data, A.data, T(1.0), adj_B.data); + fun_backward_A(T(1.0), adj_C.data, B.data, T(1.0), adj_A.data); + fun_backward_B(T(1.0), A.data, adj_C.data, T(1.0), adj_B.data); WP_TILE_SYNC(); } @@ -1330,8 +1329,8 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, using T = typename TileA::Type; WP_TILE_SYNC(); - fun_backward_A(T(1.0), B.data, adj_C.data, T(1.0), adj_A.data); - fun_backward_B(T(1.0), adj_C.data, A.data, T(1.0), adj_B.data); + fun_backward_A(T(1.0), adj_C.data, B.data, T(1.0), adj_A.data); + fun_backward_B(T(1.0), A.data, adj_C.data, T(1.0), adj_B.data); WP_TILE_SYNC(); } diff --git a/warp/native/warp.cu b/warp/native/warp.cu index bb6bb8e7..b043aeba 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -2926,7 +2926,7 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ return res; } - bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int tA, int tB, int num_threads) + bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads) { CHECK_ANY(ltoir_output_path != nullptr); @@ -2949,8 +2949,8 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_BLOCK_DIM, block_dim.size(), block_dim.data())); std::array size = {M, N, K}; CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, 
cublasDxOperatorType::CUBLASDX_OPERATOR_SIZE, size.size(), size.data())); - std::array transpose_mode = {(cublasDxTransposeMode_t)tA, (cublasDxTransposeMode_t)tB}; - CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_TRANSPOSE_MODE, transpose_mode.size(), transpose_mode.data())); + std::array arrangement = {arrangement_A, arrangement_B, arrangement_C}; + CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_ARRANGEMENT, arrangement.size(), arrangement.data())); CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); for(int dir = 0; dir < num_include_dirs; dir++) diff --git a/warp/native/warp.h b/warp/native/warp.h index f913c006..33c878d2 100644 --- a/warp/native/warp.h +++ b/warp/native/warp.h @@ -319,7 +319,7 @@ extern "C" WP_API size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_dir, int num_cuda_include_dirs, const char** cuda_include_dirs, bool debug, bool verbose, bool verify_fp, bool fast_math, const char* output_path, size_t num_ltoirs, char** ltoirs, size_t* ltoir_sizes); WP_API bool cuda_compile_fft(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int size, int elements_per_thread, int direction, int precision, int* shared_memory_size); - WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int tA, int tB, int num_threads); + WP_API bool cuda_compile_dot(const char* ltoir_output_path, const char* symbol_name, int num_include_dirs, const char** include_dirs, const char* mathdx_include_dir, int arch, int M, int N, int K, int precision_A, int precision_B, int precision_C, int type, int arrangement_A, int arrangement_B, int arrangement_C, int num_threads); WP_API void* cuda_load_module(void* context, const char* ptx); WP_API void cuda_unload_module(void* context, void* module); From 90bc5353a4ea4572c060a722ff4ba56baaf85df0 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Tue, 15 Oct 2024 14:16:22 -0700 Subject: [PATCH 071/102] Update libmathdx artifactory paths + typo --- .gitlab/ci/mathdx-support.yml | 4 ++-- warp/builtins.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index 3b78b4d5..4b85d124 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -36,7 +36,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/68/libmathdx_build_x86_64_ubuntu20.04_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/69/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -59,7 +59,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - 
$ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/68/libmathdx_build_aarch64_ubuntu20.04_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/69/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps diff --git a/warp/builtins.py b/warp/builtins.py index b34533f0..e733a7c3 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5713,11 +5713,11 @@ def tile_flip_layout(layout): # C += A * B (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a.type.layout, b.type.layout, out.type.layout) - # adjA += adjC * B^T - Tranpose ~= flipped layout + # adjA += adjC * B^T - Transpose ~= flipped layout (fun_backward_A, lto_backward_A) = make_function( M, K, N, out.type.dtype, b.type.dtype, a.type.dtype, out.type.layout, tile_flip_layout(b.type.layout), a.type.layout ) - # adjB += A^T * adjC - Tranpose ~= flipped layout + # adjB += A^T * adjC - Transpose ~= flipped layout (fun_backward_B, lto_backward_B) = make_function( K, N, M, a.type.dtype, out.type.dtype, b.type.dtype, tile_flip_layout(a.type.layout), out.type.layout, b.type.layout ) From a54851c19033dea8d723afa918f86f0922222a0b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 17 Oct 2024 03:02:03 +0000 Subject: [PATCH 072/102] Skip MLP unit tests on non-math DX platforms --- warp/tests/test_tile_mlp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 693dffe3..d79ce897 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -8,6 +8,8 @@ import math import os +wp.init() + # needs to be constant for the whole module NUM_THREADS = 32 @@ -28,7 +30,7 @@ def create_array(rng, dim_in, dim_hid, dtype=float): return a - +@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") def test_multi_layer_nn(test, device): import torch as tc @@ -271,7 +273,7 @@ def compute(batches: wp.array(dtype=int), - +@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") def test_single_layer_nn(test, device): import torch as tc From e577c7d98c3b825bf7edce4db6d6a2de638b872d Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Thu, 17 Oct 2024 08:59:25 +0000 Subject: [PATCH 073/102] Change to make all shared tile write operations synchronize. --- warp/native/tile.h | 54 +++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index 8df8e202..f4d8871c 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -102,15 +102,25 @@ */ -// wp.tile_load(A, offset, shape) -// wp.tile_load(A, (x, y), (16, 16)) -// wp.tile_load(A, (x, y, z), (3, 3, 3)) - -// wp.tile_load(A, index, shape) -// wp.tile_load(A, x, m) -// wp.tile_load(A, x, y, m, n) -// wp.tile_load(A, x, y, z, m, n, o) -// wp.tile_load(A, x, y, z, m, n, o, p) +// Notes on shared memory synchronization +// ====================================== +// +// Currently operations that wite to shared memory tiles (e.g.: tile_load()) +// must synchronize before they return through WP_TILE_SYNC(), this +// ensures subsequent read operations from the tile do not cause a race condition. 
+// +// For tile_shared_t adjoints, the gradient accumulation is done through shared +// memory atomics, i.e.: atomic_add(), so explicit synchronization is not +// required, with the exception of some operations like GEMMs, which use +// standard shared memory loads and stores to compute and accumulate gradients. +// +// The current synchronization strategy is conservative, can lead to more +// synchronization than necessary. A more sophisticated strategy would be +// to track the 'dirty' state of shared tiles, and synchronize only when +// necessary. In addition, custom synchronization for e.g.: tile_load() +// operations could be added through a SyncProvider template parameter on +// the tile_shared_t type, for example to support barrier synchronization +// for asynchronous global to shared loads. namespace wp { @@ -458,6 +468,8 @@ struct tile_shared_t else copy_from_global(t.data, t.x, t.y); // 2d load + // synchronization happens in copy functions + return *this; } @@ -468,6 +480,7 @@ struct tile_shared_t for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) data[i] = x; + WP_TILE_SYNC(); return *this; } @@ -522,6 +535,8 @@ struct tile_shared_t // todo: make this subtile (stride aware) for (int i=threadIdx.x; i < M*N; i+= WP_TILE_BLOCK_DIM) data[i] = T(0); + + WP_TILE_SYNC(); } // extract a single tile element to a native type @@ -553,6 +568,8 @@ struct tile_shared_t (*this)(linear) = tile.data[i]; } + + WP_TILE_SYNC(); } inline CUDA_CALLABLE void add(const tile_register_t& tile) @@ -576,8 +593,6 @@ struct tile_shared_t inline CUDA_CALLABLE void print() { - WP_TILE_SYNC(); - if (threadIdx.x == 0) { printf("tile(m=%d, n=%d, storage=shared) = [", M, N); @@ -663,6 +678,8 @@ struct tile_shared_t { (*this)(i) = wp::index(src, tile_i + i); } + + WP_TILE_SYNC(); } inline CUDA_CALLABLE void copy_from_global(const array_t& src, int x, int y) @@ -688,6 +705,8 @@ struct tile_shared_t coord_t c = coord(i); (*this)(c.i, c.j) = ptr[c.i*stride_i + c.j*stride_j]; } + + WP_TILE_SYNC(); } }; @@ -766,6 +785,8 @@ inline CUDA_CALLABLE auto tile_alloc_zeros() for (int i=threadIdx.x; i < Len; i+= WP_TILE_BLOCK_DIM) data[i] = T(0); + WP_TILE_SYNC(); + return tile_shared_t(data); } @@ -1302,7 +1323,6 @@ TileC& tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, Ti { using T = typename TileA::Type; - WP_TILE_SYNC(); fun_forward(T(1.0), B.data, A.data, T(Add), C.data); WP_TILE_SYNC(); @@ -1316,6 +1336,8 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, { using T = typename TileA::Type; + // need to sync here because previous operations + // may still be performing atomic adds onto adj_A, adj_B, adjC WP_TILE_SYNC(); fun_backward_A(T(1.0), B.data, adj_C.data, T(1.0), adj_A.data); fun_backward_B(T(1.0), adj_C.data, A.data, T(1.0), adj_B.data); @@ -1329,6 +1351,8 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, { using T = typename TileA::Type; + // need to sync here because previous operations + // may still be performing atomic adds onto adj_A, adj_B, adjC WP_TILE_SYNC(); fun_backward_A(T(1.0), B.data, adj_C.data, T(1.0), adj_A.data); fun_backward_B(T(1.0), adj_C.data, A.data, T(1.0), adj_B.data); @@ -1340,7 +1364,6 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, do { \ void function_name(dtype*, dtype*); \ WP_TILE_SHARED __align__(16) char buffer[shared_memory_size]; \ - WP_TILE_SYNC(); \ for(int b = 0; b < (int)batch_size; b++) { \ function_name(Xinout.data + (int)b * (int)ept, 
(dtype*)buffer); \ WP_TILE_SYNC(); \ @@ -1397,13 +1420,14 @@ inline CUDA_CALLABLE void adj_tile_broadcast(Tile& t, Tile& adj_t, AdjTile& adj_ static_assert(LenTile == LenAdjTile); - // since the incoming adjoint will have the same physical storage + // since the incoming adjoint will have the same sized physical storage // as the original tile (just with different strides and expanded dimensions), // we can simply update the gradient element by element for (int i=threadIdx.x; i < LenTile; i+=WP_TILE_BLOCK_DIM) { - adj_t.data[i] += adj_ret.data[i]; + atomic_add(&adj_t.data[i], adj_ret.data[i]); } } + } // namespace wp From 9ee677568f96190ea0cbd265804273b9c6c3d6a4 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Thu, 17 Oct 2024 08:23:10 -0700 Subject: [PATCH 074/102] Fix Ruff errors --- warp/builtins.py | 16 +-- warp/optim/adam.py | 4 +- warp/tests/test_tile_mlp.py | 242 +++++++++++++++++++----------------- 3 files changed, 136 insertions(+), 126 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 1c9d5ecc..b50705a7 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5601,10 +5601,10 @@ def tile_matmul_generic_lto_dispatch_func( b = arg_values["b"] if len(return_values) > 0: - accumulate = 0 # for c = tile_matmul(a,b) case we want to overwrite c value + accumulate = 0 # for c = tile_matmul(a,b) case we want to overwrite c value out = return_values[0] else: - accumulate = 1 # for tile_matmul(a,b,c) case we want to add to c value + accumulate = 1 # for tile_matmul(a,b,c) case we want to add to c value out = arg_values["out"] if any(not is_tile(arg.type) for arg in [a, b, out]): @@ -5639,7 +5639,6 @@ def cublasdx_type_map(dtype): return ("wp::vec2d", 6, 1) raise RuntimeError("Unsupported input type in tile_matmul") - # generate the LTO M, K = a.type.M, a.type.N _, N = b.type.M, b.type.N @@ -5647,12 +5646,11 @@ def cublasdx_type_map(dtype): arch = options["output_arch"] def make_function(M, N, K, adtype, bdtype, cdtype, tA, tB): - (a_dtype, a_prec, a_type) = cublasdx_type_map(adtype) (b_dtype, b_prec, b_type) = cublasdx_type_map(bdtype) (c_dtype, c_prec, c_type) = cublasdx_type_map(cdtype) - if (a_type != b_type or a_type != c_type): + if a_type != b_type or a_type != c_type: raise RuntimeError("time_matmul(A, B, C) requires all inputs to be real or complex") element_type = a_type @@ -5701,7 +5699,9 @@ def make_transpose(t): lto_code = f.read() builder.ltoirs[lto_symbol] = lto_code - builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({c_dtype}, {b_dtype}*, {a_dtype}*, {c_dtype}, {c_dtype}*);" + builder.ltoirs_decl[lto_symbol] = ( + f"void {lto_symbol}({c_dtype}, {b_dtype}*, {a_dtype}*, {c_dtype}, {c_dtype}*);" + ) return lto_symbol, lto_code @@ -5722,7 +5722,7 @@ def tile_flip_layout(layout): c_layout = tile_layout_mode(out.type) # C += A * B - (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a_layout, b_layout) + (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a_layout, b_layout) # adjA += adjC * B^T (fun_backward_A, lto_backward_A) = make_function( M, K, N, out.type.dtype, b.type.dtype, a.type.dtype, c_layout, tile_flip_layout(b_layout) @@ -5730,7 +5730,7 @@ def tile_flip_layout(layout): # adjB += A^T * adjC (fun_backward_B, lto_backward_B) = make_function( K, N, M, a.type.dtype, out.type.dtype, b.type.dtype, tile_flip_layout(a_layout), c_layout - ) + ) return ( ( diff --git a/warp/optim/adam.py b/warp/optim/adam.py index fb2d0064..a235432a 100644 --- a/warp/optim/adam.py +++ 
b/warp/optim/adam.py @@ -101,7 +101,7 @@ def set_params(self, params): elif param.dtype == wp.float32: dtype = wp.float32 elif param.dtype == wp.float16: - dtype = wp.float32 # we always use fp32 for moments, even if params are fp16 + dtype = wp.float32 # we always use fp32 for moments, even if params are fp16 else: raise RuntimeError(f"Unsupported dtype for Warp Adam optimizer: {param.dtype}") @@ -143,7 +143,7 @@ def step_detail(g, m, v, lr, beta1, beta2, t, eps, params): dim=len(params), inputs=kernel_inputs, device=params.device, - ) + ) elif params.dtype == wp.types.vec3: wp.launch( kernel=adam_step_kernel_vec3, diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index d79ce897..36915535 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -1,20 +1,19 @@ +import os + import numpy as np + import warp as wp import warp.examples import warp.optim - from warp.tests.unittest_utils import * -import math -import os - wp.init() # needs to be constant for the whole module NUM_THREADS = 32 -def create_layer(rng, dim_in, dim_hid, dtype=float): +def create_layer(rng, dim_in, dim_hid, dtype=float): w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1)) @@ -23,28 +22,28 @@ def create_layer(rng, dim_in, dim_hid, dtype=float): return (weights, bias) -def create_array(rng, dim_in, dim_hid, dtype=float): +def create_array(rng, dim_in, dim_hid, dtype=float): s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) a = wp.array(s, dtype=dtype, requires_grad=True) return a + @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") def test_multi_layer_nn(test, device): - import torch as tc NUM_FREQ = wp.constant(8) - DIM_IN = wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy + DIM_IN = wp.constant(4 * NUM_FREQ) # sin,cos for both x,y at each frequenecy DIM_HID = 32 DIM_OUT = 3 - IMG_WIDTH = NUM_THREADS*8 - IMG_HEIGHT = NUM_THREADS*8 + IMG_WIDTH = NUM_THREADS * 8 + IMG_HEIGHT = NUM_THREADS * 8 - BATCH_SIZE = min(512, int((IMG_WIDTH*IMG_HEIGHT)/8)) + BATCH_SIZE = min(512, int((IMG_WIDTH * IMG_HEIGHT) / 8)) dtype = wp.float16 @@ -61,49 +60,52 @@ def zero(loss: wp.array(dtype=float)): loss[0] = 0.0 @wp.kernel - def compute(batches: wp.array(dtype=int), - input: wp.array2d(dtype=dtype), - weights_0: wp.array2d(dtype=dtype), bias_0: wp.array2d(dtype=dtype), - weights_1: wp.array2d(dtype=dtype), bias_1: wp.array2d(dtype=dtype), - weights_2: wp.array2d(dtype=dtype), bias_2: wp.array2d(dtype=dtype), - weights_3: wp.array2d(dtype=dtype), bias_3: wp.array2d(dtype=dtype), - reference: wp.array2d(dtype=float), - loss: wp.array1d(dtype=float), - out: wp.array2d(dtype=float)): - + def compute( + batches: wp.array(dtype=int), + input: wp.array2d(dtype=dtype), + weights_0: wp.array2d(dtype=dtype), + bias_0: wp.array2d(dtype=dtype), + weights_1: wp.array2d(dtype=dtype), + bias_1: wp.array2d(dtype=dtype), + weights_2: wp.array2d(dtype=dtype), + bias_2: wp.array2d(dtype=dtype), + weights_3: wp.array2d(dtype=dtype), + bias_3: wp.array2d(dtype=dtype), + reference: wp.array2d(dtype=float), + loss: wp.array1d(dtype=float), + out: wp.array2d(dtype=float), + ): linear = batches[wp.tid()] - row = linear/IMG_WIDTH - col = linear%IMG_WIDTH + row = linear / IMG_WIDTH + col = linear % IMG_WIDTH # normalize input coordinates to [-1, 1] - x = (float(row)/float(IMG_WIDTH) - 0.5)*2.0 - y = 
(float(col)/float(IMG_HEIGHT) - 0.5)*2.0 + x = (float(row) / float(IMG_WIDTH) - 0.5) * 2.0 + y = (float(col) / float(IMG_HEIGHT) - 0.5) * 2.0 local = wp.vector(dtype=dtype, length=DIM_IN) # construct positional encoding for s in range(NUM_FREQ): - - scale = wp.pow(2.0, float(s))*wp.pi + scale = wp.pow(2.0, float(s)) * wp.pi # x-coord - local[s*4 + 0] = dtype(wp.sin(x * scale)) - local[s*4 + 1] = dtype(wp.cos(x * scale)) + local[s * 4 + 0] = dtype(wp.sin(x * scale)) + local[s * 4 + 1] = dtype(wp.cos(x * scale)) # y-coord - local[s*4 + 2] = dtype(wp.sin(y * scale)) - local[s*4 + 3] = dtype(wp.cos(y * scale)) + local[s * 4 + 2] = dtype(wp.sin(y * scale)) + local[s * 4 + 3] = dtype(wp.cos(y * scale)) # write input back to array so that torch can use it - input[s*4 + 0, linear] = local[s*4 + 0] - input[s*4 + 1, linear] = local[s*4 + 1] - input[s*4 + 2, linear] = local[s*4 + 2] - input[s*4 + 3, linear] = local[s*4 + 3] - + input[s * 4 + 0, linear] = local[s * 4 + 0] + input[s * 4 + 1, linear] = local[s * 4 + 1] + input[s * 4 + 2, linear] = local[s * 4 + 2] + input[s * 4 + 3, linear] = local[s * 4 + 3] # tile feature vectors across the block, returns [dim(f), NUM_THREADS] f = wp.tile(local) - + # input layer w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN) b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1) @@ -126,20 +128,19 @@ def compute(batches: wp.array(dtype=int), # untile back to SIMT output = wp.untile(o) - # compute error - error = wp.vec3(float(output[0]) - reference[0,linear], - float(output[1]) - reference[1,linear], - float(output[2]) - reference[2,linear]) + error = wp.vec3( + float(output[0]) - reference[0, linear], + float(output[1]) - reference[1, linear], + float(output[2]) - reference[2, linear], + ) # write MSE loss - wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*BATCH_SIZE)) - + wp.atomic_add(loss, 0, wp.length_sq(error) / float(3 * BATCH_SIZE)) # image output for i in range(DIM_OUT): out[i, linear] = float(output[i]) - rng = np.random.default_rng(45) @@ -148,8 +149,8 @@ def compute(batches: wp.array(dtype=int), weights_2, bias_2 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) weights_3, bias_3 = create_layer(rng, DIM_HID, DIM_OUT, dtype=dtype) - input = create_array(rng, IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) - output = create_array(rng, IMG_WIDTH*IMG_HEIGHT, DIM_OUT) + input = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_IN, dtype=dtype) + output = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_OUT) # # generate reference image # from PIL import Image @@ -159,50 +160,51 @@ def compute(batches: wp.array(dtype=int), # reference_np = reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T # np.save(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), reference_np, allow_pickle=True) - reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True)/255.0 + reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True) / 255.0 reference = wp.array(reference_np, dtype=float) loss = wp.zeros(1, dtype=float, requires_grad=True) - params = [weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - weights_3, bias_3] + params = [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3] optimizer_grads = [p.grad.flatten() for p in params] optimizer_inputs = [p.flatten() for p in params] optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) - num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) + num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE) max_epochs = 
30 # create randomized batch indices - batches = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) + batches = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32) rng.shuffle(batches) batches = wp.array(batches) - - with wp.ScopedTimer("Training", active=False): + with wp.ScopedTimer("Training", active=False): for epoch in range(max_epochs): - - for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): - + for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE): loss.zero_() with wp.Tape() as tape: wp.launch( - compute, + compute, dim=[BATCH_SIZE], - inputs=[batches[b:b+BATCH_SIZE], - input, - weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - weights_3, bias_3, - reference, - loss, - output], - block_dim=NUM_THREADS) + inputs=[ + batches[b : b + BATCH_SIZE], + input, + weights_0, + bias_0, + weights_1, + bias_1, + weights_2, + bias_2, + weights_3, + bias_3, + reference, + loss, + output, + ], + block_dim=NUM_THREADS, + ) tape.backward(loss) @@ -210,16 +212,15 @@ def compute(batches: wp.array(dtype=int), # since this is a relatively slow operation verify = True if verify and epoch < 3: + indices = batches[b : b + BATCH_SIZE].numpy() - indices = batches[b:b+BATCH_SIZE].numpy() - - z_np = np.maximum(weights_0.numpy()@input.numpy()[:,indices] + bias_0.numpy(), 0.0) - z_np = np.maximum(weights_1.numpy()@z_np + bias_1.numpy(), 0.0) - z_np = np.maximum(weights_2.numpy()@z_np + bias_2.numpy(), 0.0) - z_np = np.maximum(weights_3.numpy()@z_np + bias_3.numpy(), 0.0) + z_np = np.maximum(weights_0.numpy() @ input.numpy()[:, indices] + bias_0.numpy(), 0.0) + z_np = np.maximum(weights_1.numpy() @ z_np + bias_1.numpy(), 0.0) + z_np = np.maximum(weights_2.numpy() @ z_np + bias_2.numpy(), 0.0) + z_np = np.maximum(weights_3.numpy() @ z_np + bias_3.numpy(), 0.0) # test numpy foward - assert_np_equal(output.numpy()[:,indices], z_np, tol=1.e-2) + assert_np_equal(output.numpy()[:, indices], z_np, tol=1.0e-2) # torch input_tc = tc.from_numpy(input.numpy()[:, indices]).requires_grad_(True) @@ -234,33 +235,33 @@ def compute(batches: wp.array(dtype=int), bias_2_tc = tc.from_numpy(bias_2.numpy()).requires_grad_(True) weights_3_tc = tc.from_numpy(weights_3.numpy()).requires_grad_(True) - bias_3_tc = tc.from_numpy(bias_3.numpy()).requires_grad_(True) + bias_3_tc = tc.from_numpy(bias_3.numpy()).requires_grad_(True) + + z_tc = tc.clamp(weights_0_tc @ input_tc + bias_0_tc, min=0.0) + z_tc = tc.clamp(weights_1_tc @ z_tc + bias_1_tc, min=0.0) + z_tc = tc.clamp(weights_2_tc @ z_tc + bias_2_tc, min=0.0) + z_tc = tc.clamp(weights_3_tc @ z_tc + bias_3_tc, min=0.0) - z_tc = tc.clamp(weights_0_tc@input_tc + bias_0_tc, min=0.0) - z_tc = tc.clamp(weights_1_tc@z_tc + bias_1_tc, min=0.0) - z_tc = tc.clamp(weights_2_tc@z_tc + bias_2_tc, min=0.0) - z_tc = tc.clamp(weights_3_tc@z_tc + bias_3_tc, min=0.0) - ref_tc = tc.from_numpy(reference.numpy()[:, indices]).requires_grad_(True) - - l_tc = tc.mean((z_tc - ref_tc)**2) + + l_tc = tc.mean((z_tc - ref_tc) ** 2) l_tc.backward() # test torch - assert_np_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices], tol=1.e-2) - assert_np_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy(), tol=1.e-2) - assert_np_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy(), tol=1.e-2) - assert_np_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy(), tol=1.e-2) - assert_np_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy(), tol=1.e-2) - assert_np_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy(), tol=1.e-2) - 
assert_np_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy(), tol=1.e-2) - assert_np_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy(), tol=1.e-2) - assert_np_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy(), tol=1.e-2) + assert_np_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices], tol=1.0e-2) + assert_np_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2) optimizer.step(optimizer_grads) tape.zero() - #print(f"Epoch: {epoch} Loss: {loss.numpy()}") + # print(f"Epoch: {epoch} Loss: {loss.numpy()}") # predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) # predicted_image = (predicted_image * 255).astype(np.uint8) @@ -272,10 +273,8 @@ def compute(batches: wp.array(dtype=int), assert loss.numpy()[0] < 0.002 - @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") def test_single_layer_nn(test, device): - import torch as tc DIM_IN = 8 @@ -289,11 +288,12 @@ def relu(x: float): return wp.max(x, 0.0) @wp.kernel - def compute(input: wp.array2d(dtype=float), - weights: wp.array2d(dtype=float), - bias: wp.array2d(dtype=float), - out: wp.array2d(dtype=float)): - + def compute( + input: wp.array2d(dtype=float), + weights: wp.array2d(dtype=float), + bias: wp.array2d(dtype=float), + out: wp.array2d(dtype=float), + ): i = wp.tid() f = wp.tile_load(input, 0, i, m=DIM_IN, n=NUM_THREADS) @@ -305,46 +305,44 @@ def compute(input: wp.array2d(dtype=float), wp.tile_store(out, 0, i, o) - with wp.ScopedDevice(device): - rng = np.random.default_rng(45) # single layer weights, bias weights, bias = create_layer(rng, DIM_IN, DIM_OUT, dtype=float) - input = create_array(rng, NUM_THREADS*NUM_BLOCKS, DIM_IN) - output = create_array(rng, NUM_THREADS*NUM_BLOCKS, DIM_OUT) + input = create_array(rng, NUM_THREADS * NUM_BLOCKS, DIM_IN) + output = create_array(rng, NUM_THREADS * NUM_BLOCKS, DIM_OUT) with wp.Tape() as tape: wp.launch_tiled(compute, dim=[NUM_BLOCKS], inputs=[input, weights, bias, output], block_dim=NUM_THREADS) output.grad = wp.ones_like(output) - tape.backward() + tape.backward() # numpy - output_np = np.maximum(weights.numpy()@input.numpy() + bias.numpy(), 0.0) + output_np = np.maximum(weights.numpy() @ input.numpy() + bias.numpy(), 0.0) # test numpy foward - assert_np_equal(output.numpy(), output_np, tol=1.e-2) - + assert_np_equal(output.numpy(), output_np, tol=1.0e-2) # torch - weights_tc = tc.from_numpy(weights.numpy()).requires_grad_(True) # use .numpy() to avoid any memory aliasing + weights_tc = tc.from_numpy(weights.numpy()).requires_grad_(True) # use .numpy() to avoid any memory aliasing input_tc = tc.from_numpy(input.numpy()).requires_grad_(True) bias_tc = tc.from_numpy(bias.numpy()).requires_grad_(True) - output_tc = tc.clamp(weights_tc@input_tc + bias_tc, min=0.0) + output_tc = tc.clamp(weights_tc @ 
input_tc + bias_tc, min=0.0) output_tc.backward(tc.ones_like(output_tc)) # test torch - assert_np_equal(output_tc.detach().numpy(), output.numpy(), tol=1.e-2) - assert_np_equal(input.grad.numpy(), input_tc.grad.detach().numpy(), tol=1.e-2) + assert_np_equal(output_tc.detach().numpy(), output.numpy(), tol=1.0e-2) + assert_np_equal(input.grad.numpy(), input_tc.grad.detach().numpy(), tol=1.0e-2) class TestTileMLP(unittest.TestCase): pass + test_devices = get_test_devices() try: @@ -365,13 +363,25 @@ class TestTileMLP(unittest.TestCase): except Exception as e: print(f"Skipping Torch tests on device '{d}' due to exception: {e}") - add_function_test(TestTileMLP, "test_single_layer_nn", test_single_layer_nn, check_output=False, devices=torch_compatible_cuda_devices) - add_function_test(TestTileMLP, "test_multi_layer_nn", test_multi_layer_nn, check_output=False, devices=torch_compatible_cuda_devices) + add_function_test( + TestTileMLP, + "test_single_layer_nn", + test_single_layer_nn, + check_output=False, + devices=torch_compatible_cuda_devices, + ) + add_function_test( + TestTileMLP, + "test_multi_layer_nn", + test_multi_layer_nn, + check_output=False, + devices=torch_compatible_cuda_devices, + ) except Exception as e: print(f"Skipping Torch tests due to exception: {e}") if __name__ == "__main__": -# wp.clear_kernel_cache() + # wp.clear_kernel_cache() unittest.main(verbosity=2, failfast=True) From f275782d825b9e6e3cb6aa9cb4196b983409c9ec Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Thu, 17 Oct 2024 08:35:19 -0700 Subject: [PATCH 075/102] Fix Ruff issues --- warp/builtins.py | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/warp/builtins.py b/warp/builtins.py index 168a78f7..87ad6815 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -5639,12 +5639,12 @@ def cublasdx_type_map(dtype): if dtype == vec2d: return ("wp::vec2d", 6, 1) raise RuntimeError("Unsupported input type in tile_matmul") - + def cublasdx_arrangement_map(layout): if layout == "colmajor": - return 0 # CUBLASDX_ARRANGEMENT_COL_MAJOR + return 0 # CUBLASDX_ARRANGEMENT_COL_MAJOR if layout == "rowmajor": - return 1 # CUBLASDX_ARRANGEMENT_ROW_MAJOR + return 1 # CUBLASDX_ARRANGEMENT_ROW_MAJOR raise RuntimeError("Unsupported layout in tile_matmul") # generate the LTO @@ -5654,7 +5654,6 @@ def cublasdx_arrangement_map(layout): arch = options["output_arch"] def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout): - (a_dtype, a_prec, a_type) = cublasdx_type_map(adtype) (b_dtype, b_prec, b_type) = cublasdx_type_map(bdtype) (c_dtype, c_prec, c_type) = cublasdx_type_map(cdtype) @@ -5666,7 +5665,9 @@ def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout): raise RuntimeError("time_matmul(A, B, C) requires all inputs to be real or complex") element_type = a_type - lto_symbol = f"dot_{M}_{N}_{K}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}" + lto_symbol = ( + f"dot_{M}_{N}_{K}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}" + ) # early out if LTO for this combination already exists for this module if lto_symbol in builder.ltoirs: @@ -5701,7 +5702,9 @@ def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout): lto_code = f.read() builder.ltoirs[lto_symbol] = lto_code - builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({c_dtype}, {a_dtype}*, {b_dtype}*, {c_dtype}, {c_dtype}*);" + builder.ltoirs_decl[lto_symbol] = ( + f"void 
{lto_symbol}({c_dtype}, {a_dtype}*, {b_dtype}*, {c_dtype}, {c_dtype}*);" + ) return lto_symbol, lto_code @@ -5712,15 +5715,33 @@ def tile_flip_layout(layout): return "rowmajor" # C += A * B - (fun_forward, lto_forward) = make_function(M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a.type.layout, b.type.layout, out.type.layout) + (fun_forward, lto_forward) = make_function( + M, N, K, a.type.dtype, b.type.dtype, out.type.dtype, a.type.layout, b.type.layout, out.type.layout + ) # adjA += adjC * B^T - Transpose ~= flipped layout (fun_backward_A, lto_backward_A) = make_function( - M, K, N, out.type.dtype, b.type.dtype, a.type.dtype, out.type.layout, tile_flip_layout(b.type.layout), a.type.layout + M, + K, + N, + out.type.dtype, + b.type.dtype, + a.type.dtype, + out.type.layout, + tile_flip_layout(b.type.layout), + a.type.layout, ) # adjB += A^T * adjC - Transpose ~= flipped layout (fun_backward_B, lto_backward_B) = make_function( - K, N, M, a.type.dtype, out.type.dtype, b.type.dtype, tile_flip_layout(a.type.layout), out.type.layout, b.type.layout - ) + K, + N, + M, + a.type.dtype, + out.type.dtype, + b.type.dtype, + tile_flip_layout(a.type.layout), + out.type.layout, + b.type.layout, + ) return ( ( From 9c2c8e3b7739bab550e1579cdb7bc3e1fb4e6887 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Thu, 17 Oct 2024 08:46:02 -0700 Subject: [PATCH 076/102] Add missing wp.ScopedDevice to test_tile_mlp --- warp/tests/test_tile_mlp.py | 266 +++++++++++++++++++----------------- 1 file changed, 137 insertions(+), 129 deletions(-) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 36915535..89fcf052 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -1,3 +1,10 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ import os import numpy as np @@ -142,135 +149,136 @@ def compute( for i in range(DIM_OUT): out[i, linear] = float(output[i]) - rng = np.random.default_rng(45) - - weights_0, bias_0 = create_layer(rng, DIM_IN, DIM_HID, dtype=dtype) - weights_1, bias_1 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) - weights_2, bias_2 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) - weights_3, bias_3 = create_layer(rng, DIM_HID, DIM_OUT, dtype=dtype) - - input = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_IN, dtype=dtype) - output = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_OUT) - - # # generate reference image - # from PIL import Image - # reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") - # with Image.open(reference_path) as im: - # reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) - # reference_np = reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T - # np.save(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), reference_np, allow_pickle=True) - - reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True) / 255.0 - reference = wp.array(reference_np, dtype=float) - - loss = wp.zeros(1, dtype=float, requires_grad=True) - - params = [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3] - - optimizer_grads = [p.grad.flatten() for p in params] - optimizer_inputs = [p.flatten() for p in params] - optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) - - num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE) - max_epochs = 30 - - # create randomized batch indices - batches = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32) - rng.shuffle(batches) - batches = wp.array(batches) - - with wp.ScopedTimer("Training", active=False): - for epoch in range(max_epochs): - for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE): - loss.zero_() - - with wp.Tape() as tape: - wp.launch( - compute, - dim=[BATCH_SIZE], - inputs=[ - batches[b : b + BATCH_SIZE], - input, - weights_0, - bias_0, - weights_1, - bias_1, - weights_2, - bias_2, - weights_3, - bias_3, - reference, - loss, - output, - ], - block_dim=NUM_THREADS, - ) - - tape.backward(loss) - - # check outputs + grads on the first few epoch only - # since this is a relatively slow operation - verify = True - if verify and epoch < 3: - indices = batches[b : b + BATCH_SIZE].numpy() - - z_np = np.maximum(weights_0.numpy() @ input.numpy()[:, indices] + bias_0.numpy(), 0.0) - z_np = np.maximum(weights_1.numpy() @ z_np + bias_1.numpy(), 0.0) - z_np = np.maximum(weights_2.numpy() @ z_np + bias_2.numpy(), 0.0) - z_np = np.maximum(weights_3.numpy() @ z_np + bias_3.numpy(), 0.0) - - # test numpy foward - assert_np_equal(output.numpy()[:, indices], z_np, tol=1.0e-2) - - # torch - input_tc = tc.from_numpy(input.numpy()[:, indices]).requires_grad_(True) - - weights_0_tc = tc.from_numpy(weights_0.numpy()).requires_grad_(True) - bias_0_tc = tc.from_numpy(bias_0.numpy()).requires_grad_(True) - - weights_1_tc = tc.from_numpy(weights_1.numpy()).requires_grad_(True) - bias_1_tc = tc.from_numpy(bias_1.numpy()).requires_grad_(True) - - weights_2_tc = tc.from_numpy(weights_2.numpy()).requires_grad_(True) - bias_2_tc = tc.from_numpy(bias_2.numpy()).requires_grad_(True) - - weights_3_tc = tc.from_numpy(weights_3.numpy()).requires_grad_(True) - bias_3_tc = tc.from_numpy(bias_3.numpy()).requires_grad_(True) - - z_tc = tc.clamp(weights_0_tc @ input_tc + bias_0_tc, min=0.0) - z_tc = tc.clamp(weights_1_tc @ z_tc + bias_1_tc, min=0.0) - z_tc = 
tc.clamp(weights_2_tc @ z_tc + bias_2_tc, min=0.0) - z_tc = tc.clamp(weights_3_tc @ z_tc + bias_3_tc, min=0.0) - - ref_tc = tc.from_numpy(reference.numpy()[:, indices]).requires_grad_(True) - - l_tc = tc.mean((z_tc - ref_tc) ** 2) - l_tc.backward() - - # test torch - assert_np_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices], tol=1.0e-2) - assert_np_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - assert_np_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2) - - optimizer.step(optimizer_grads) - tape.zero() - - # print(f"Epoch: {epoch} Loss: {loss.numpy()}") - - # predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) - # predicted_image = (predicted_image * 255).astype(np.uint8) - - # predicted_image_pil = Image.fromarray(predicted_image) - # predicted_image_pil.save("test_tile_mlp_wp.jpg") - - # initial loss is ~0.061 - assert loss.numpy()[0] < 0.002 + with wp.ScopedDevice(device): + rng = np.random.default_rng(45) + + weights_0, bias_0 = create_layer(rng, DIM_IN, DIM_HID, dtype=dtype) + weights_1, bias_1 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) + weights_2, bias_2 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype) + weights_3, bias_3 = create_layer(rng, DIM_HID, DIM_OUT, dtype=dtype) + + input = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_IN, dtype=dtype) + output = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_OUT) + + # # generate reference image + # from PIL import Image + # reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") + # with Image.open(reference_path) as im: + # reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) + # reference_np = reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T + # np.save(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), reference_np, allow_pickle=True) + + reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True) / 255.0 + reference = wp.array(reference_np, dtype=float) + + loss = wp.zeros(1, dtype=float, requires_grad=True) + + params = [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3] + + optimizer_grads = [p.grad.flatten() for p in params] + optimizer_inputs = [p.flatten() for p in params] + optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) + + num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE) + max_epochs = 30 + + # create randomized batch indices + batches = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32) + rng.shuffle(batches) + batches = wp.array(batches) + + with wp.ScopedTimer("Training", active=False): + for epoch in range(max_epochs): + for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE): + loss.zero_() + + with wp.Tape() as tape: + wp.launch( + compute, + dim=[BATCH_SIZE], + inputs=[ + batches[b : b + BATCH_SIZE], + input, + weights_0, + bias_0, + weights_1, + bias_1, + weights_2, + bias_2, + weights_3, + bias_3, + reference, + loss, + 
output, + ], + block_dim=NUM_THREADS, + ) + + tape.backward(loss) + + # check outputs + grads on the first few epoch only + # since this is a relatively slow operation + verify = True + if verify and epoch < 3: + indices = batches[b : b + BATCH_SIZE].numpy() + + z_np = np.maximum(weights_0.numpy() @ input.numpy()[:, indices] + bias_0.numpy(), 0.0) + z_np = np.maximum(weights_1.numpy() @ z_np + bias_1.numpy(), 0.0) + z_np = np.maximum(weights_2.numpy() @ z_np + bias_2.numpy(), 0.0) + z_np = np.maximum(weights_3.numpy() @ z_np + bias_3.numpy(), 0.0) + + # test numpy foward + assert_np_equal(output.numpy()[:, indices], z_np, tol=1.0e-2) + + # torch + input_tc = tc.from_numpy(input.numpy()[:, indices]).requires_grad_(True) + + weights_0_tc = tc.from_numpy(weights_0.numpy()).requires_grad_(True) + bias_0_tc = tc.from_numpy(bias_0.numpy()).requires_grad_(True) + + weights_1_tc = tc.from_numpy(weights_1.numpy()).requires_grad_(True) + bias_1_tc = tc.from_numpy(bias_1.numpy()).requires_grad_(True) + + weights_2_tc = tc.from_numpy(weights_2.numpy()).requires_grad_(True) + bias_2_tc = tc.from_numpy(bias_2.numpy()).requires_grad_(True) + + weights_3_tc = tc.from_numpy(weights_3.numpy()).requires_grad_(True) + bias_3_tc = tc.from_numpy(bias_3.numpy()).requires_grad_(True) + + z_tc = tc.clamp(weights_0_tc @ input_tc + bias_0_tc, min=0.0) + z_tc = tc.clamp(weights_1_tc @ z_tc + bias_1_tc, min=0.0) + z_tc = tc.clamp(weights_2_tc @ z_tc + bias_2_tc, min=0.0) + z_tc = tc.clamp(weights_3_tc @ z_tc + bias_3_tc, min=0.0) + + ref_tc = tc.from_numpy(reference.numpy()[:, indices]).requires_grad_(True) + + l_tc = tc.mean((z_tc - ref_tc) ** 2) + l_tc.backward() + + # test torch + assert_np_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices], tol=1.0e-2) + assert_np_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + assert_np_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2) + + optimizer.step(optimizer_grads) + tape.zero() + + # print(f"Epoch: {epoch} Loss: {loss.numpy()}") + + # predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) + # predicted_image = (predicted_image * 255).astype(np.uint8) + + # predicted_image_pil = Image.fromarray(predicted_image) + # predicted_image_pil.save("test_tile_mlp_wp.jpg") + + # initial loss is ~0.061 + test.assertLess(loss.numpy()[0], 0.002) @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support") From e095a66dcbfff2a2930ca31b8b9b08cc6358b98c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 18 Oct 2024 02:55:05 +0000 Subject: [PATCH 077/102] Fixes for backward smem synchronization. 
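For context, a minimal differentiable tile kernel of the kind whose backward pass runs through the synchronization points touched by this patch is sketched below. The wp.tile_*, wp.Tape() and wp.launch_tiled() usage mirrors the tests elsewhere in this series; the shapes, names and launch configuration are illustrative assumptions rather than code from this commit, and wp.tile_matmul() requires a MathDx-enabled build.

import numpy as np
import warp as wp

wp.init()

TILE_M, TILE_K, TILE_N = 16, 8, 16
BLOCK_DIM = 32

@wp.kernel
def tile_gemm(A: wp.array2d(dtype=float),
              B: wp.array2d(dtype=float),
              C: wp.array2d(dtype=float)):
    i = wp.tid()
    # tile_load() writes to a shared-memory tile, so it synchronizes before returning
    a = wp.tile_load(A, i, 0, m=TILE_M, n=TILE_K)
    b = wp.tile_load(B, 0, 0, m=TILE_K, n=TILE_N)
    # forward GEMM; its adjoint accumulates shared-memory gradients that must be
    # visible before adj_tile_load() writes them back to global memory
    c = wp.tile_matmul(a, b)
    wp.tile_store(C, i, 0, c)

A = wp.array(np.ones((TILE_M, TILE_K), dtype=np.float32), requires_grad=True)
B = wp.array(np.ones((TILE_K, TILE_N), dtype=np.float32), requires_grad=True)
C = wp.zeros((TILE_M, TILE_N), dtype=float, requires_grad=True)

with wp.Tape() as tape:
    wp.launch_tiled(tile_gemm, dim=[1], inputs=[A, B, C], block_dim=BLOCK_DIM)

C.grad = wp.ones_like(C)
tape.backward()  # runs adj_tile_store -> adj_tile_matmul -> adj_tile_load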
--- warp/native/tile.h | 31 +++++++++++++------------------ warp/native/tile_reduce.h | 2 ++ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index 3f995221..bb911a17 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -110,11 +110,12 @@ // ensures subsequent read operations from the tile do not cause a race condition. // // For tile_shared_t adjoints, the gradient accumulation is done through shared -// memory atomics, i.e.: atomic_add(), so explicit synchronization is not -// required, with the exception of some operations like GEMMs, which use -// standard shared memory loads and stores to compute and accumulate gradients. +// memory atomics, i.e.: atomic_add() since for broadcast tiles multiple threads +// may map to the same location. Synchronization is still required after these +// updates, since subsequent operations e.g.: adj_tile_load() will store the +// gradients to memory, and all updates must be visible at that point. // -// The current synchronization strategy is conservative, can lead to more +// The current synchronization strategy is conservative, and can lead to more // synchronization than necessary. A more sophisticated strategy would be // to track the 'dirty' state of shared tiles, and synchronize only when // necessary. In addition, custom synchronization for e.g.: tile_load() @@ -550,6 +551,8 @@ struct tile_shared_t { if (threadIdx.x == 0) (*this)(i, j) += adj_ret; + + WP_TILE_SYNC(); } @@ -585,10 +588,12 @@ struct tile_shared_t break; // use shared memory atomics to accumulate gradients - // since for broadcast tiles multiple incoming values + // since for broadcast tiles multiple incoming threads // may map to a single location in shared memory atomic_add(&(*this)(linear), tile.data[i]); } + + WP_TILE_SYNC(); } inline CUDA_CALLABLE void print() @@ -1063,9 +1068,6 @@ inline CUDA_CALLABLE void adj_tile_load(array_t& src, int x, int y, template inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, Tile& t, array_t& adj_dest, int adj_x, AdjTile& adj_t) { - // if (!dest.grad) - // return; - // convert to register if necessary tile_register_t adj_reg; @@ -1092,10 +1094,7 @@ inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, Tile& t, array template inline CUDA_CALLABLE void adj_tile_store(array_t& dest, int x, int y, Tile& t, array_t& adj_dest, int adj_x, int adj_y, AdjTile& adj_t) { - // if (!dest.grad) - // return; - - // convert to register if necessary + // allocate register tile to load grads into tile_register_t adj_reg; const int tile_i = x*adj_reg.M; @@ -1335,9 +1334,6 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, { using T = typename TileA::Type; - // need to sync here because previous operations - // may still be performing atomic adds onto adj_A, adj_B, adjC - WP_TILE_SYNC(); fun_backward_A(T(1.0), adj_C.data, B.data, T(1.0), adj_A.data); fun_backward_B(T(1.0), A.data, adj_C.data, T(1.0), adj_B.data); WP_TILE_SYNC(); @@ -1350,9 +1346,6 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, { using T = typename TileA::Type; - // need to sync here because previous operations - // may still be performing atomic adds onto adj_A, adj_B, adjC - WP_TILE_SYNC(); fun_backward_A(T(1.0), adj_C.data, B.data, T(1.0), adj_A.data); fun_backward_B(T(1.0), A.data, adj_C.data, T(1.0), adj_B.data); WP_TILE_SYNC(); @@ -1426,6 +1419,8 @@ inline CUDA_CALLABLE void adj_tile_broadcast(Tile& t, Tile& adj_t, AdjTile& adj_ { 
atomic_add(&adj_t.data[i], adj_ret.data[i]); } + + WP_TILE_SYNC(); } diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index 35107f35..3b5da6d9 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -168,7 +168,9 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret) WP_TILE_SYNC(); + // convert the destination adjoint to a register auto adj_t_reg = adj_t.copy_to_register(); + // broadcast scalar across input dimensions (note zero strides) auto adj_ret_reg = tile_shared_t(&scratch).copy_to_register(); adj_t.assign(tile_add(adj_t_reg, adj_ret_reg)); From 4eeec1642e4eaf8dce65eb83975dc381bf76d67c Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Fri, 18 Oct 2024 03:00:30 +0000 Subject: [PATCH 078/102] Update docs on adjoint synchronization. --- warp/native/tile.h | 60 ++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index bb911a17..6d164d7f 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -102,26 +102,46 @@ */ -// Notes on shared memory synchronization -// ====================================== -// -// Currently operations that wite to shared memory tiles (e.g.: tile_load()) -// must synchronize before they return through WP_TILE_SYNC(), this -// ensures subsequent read operations from the tile do not cause a race condition. -// -// For tile_shared_t adjoints, the gradient accumulation is done through shared -// memory atomics, i.e.: atomic_add() since for broadcast tiles multiple threads -// may map to the same location. Synchronization is still required after these -// updates, since subsequent operations e.g.: adj_tile_load() will store the -// gradients to memory, and all updates must be visible at that point. -// -// The current synchronization strategy is conservative, and can lead to more -// synchronization than necessary. A more sophisticated strategy would be -// to track the 'dirty' state of shared tiles, and synchronize only when -// necessary. In addition, custom synchronization for e.g.: tile_load() -// operations could be added through a SyncProvider template parameter on -// the tile_shared_t type, for example to support barrier synchronization -// for asynchronous global to shared loads. +/* +Notes on shared memory synchronization +====================================== + +Currently operations that wite to shared memory tiles (e.g.: tile_load()) +must synchronize before they return through WP_TILE_SYNC(), this +ensures subsequent read operations from the tile do not cause a race condition. + +For tile_shared_t adjoints, the gradient accumulation is done through shared +memory atomics, i.e.: atomic_add(), since for broadcast tiles multiple threads +may map to the same location. Synchronization is still required after these +updates, since subsequent operations e.g.: adj_tile_load() will store the +gradients to memory, and all updates must be visible at that point, e.g.: + + a = wp.tile_load(...) + b = wp.tile_load(...) 
+ c = wp.tile_matmul(a, b) + wp.tile_store(c) + + // loads incoming adjoints from global -> shared + wp.adj_tile_store(c, adj_c) + // consumes adj_c, requires synchronization + wp.adj_tile_matmul(a, b, adj_a, adj_b, adj_c) + // consumes adj_b, requires synchronization + wp.adj_tile_load(..., adj_b) + // consumes adj_b, requires synchronization + wp.adj_tile_load(..., adj_a) + +Generally synchronization to adjoint tiles will happen through the +tile_shared_t::add() and tile_shared_t::assign() function automatically, +but in some cases e.g.: tile_matmul() it is done manually. + +The current synchronization strategy is conservative, and can lead to more +synchronization than necessary. A more sophisticated strategy would be +to track the 'dirty' state of shared tiles, and synchronize only when +necessary. In addition, custom synchronization for e.g.: tile_load() +operations could be added through a SyncProvider template parameter on +the tile_shared_t type, for example to support barrier synchronization +for asynchronous global to shared loads. +*/ namespace wp { From 3abf3a81dc1a4e37d11e304ef2955f67e4c26c57 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Mon, 21 Oct 2024 08:44:09 +0000 Subject: [PATCH 079/102] Clean up example_tile_mlp.py --- warp/examples/tile/example_tile_mlp.py | 254 +++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 warp/examples/tile/example_tile_mlp.py diff --git a/warp/examples/tile/example_tile_mlp.py b/warp/examples/tile/example_tile_mlp.py new file mode 100644 index 00000000..893b344c --- /dev/null +++ b/warp/examples/tile/example_tile_mlp.py @@ -0,0 +1,254 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +########################################################################### +# Example Image Multilayer Perceptron (MLP) +# +# Shows how to train a coordinate-based MLP on an image to predict the RGB +# color at a given input position. By default, a positional encoding is +# applied to the input coordinates to improve the ability of the MLP to +# represent higher-frequency content. This can be disabled by passing the +# '--no_encoding' option. +# +# References: +# Ben Mildenhall et al. 2021. NeRF: representing scenes +# as neural radiance fields for view synthesis. Commun. ACM 65, 1 +# (January 2022), 99–106. 
https://doi.org/10.1145/3503250 +# +########################################################################### + +import numpy as np +import warp as wp +import warp.examples +import warp.optim + +import math +import os + +from PIL import Image + +rng = np.random.default_rng(45) + +def create_layer(dim_in, dim_hid, dtype=float): + + w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) + b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1)) + + weights = wp.array(w, dtype=dtype, requires_grad=True) + bias = wp.array(b, dtype=dtype, requires_grad=True) + + return (weights, bias) + +def create_array(dim_in, dim_hid, dtype=float): + + s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) + a = wp.array(s, dtype=dtype, requires_grad=True) + + return a + + +# number of frequencies for the positional encoding +NUM_FREQ = wp.constant(8) + +DIM_IN = wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy +DIM_HID = 32 +DIM_OUT = 3 + +# threads per-block +NUM_THREADS = 32 + +IMG_WIDTH = NUM_THREADS*16 +IMG_HEIGHT = NUM_THREADS*16 + +BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) + +# dtype for our weights and bias matrices +dtype = wp.float16 + +@wp.func +def relu(x: dtype): + return wp.max(x, dtype(0.0)) + +@wp.kernel +def compute(indices: wp.array(dtype=int), + weights_0: wp.array2d(dtype=dtype), bias_0: wp.array2d(dtype=dtype), + weights_1: wp.array2d(dtype=dtype), bias_1: wp.array2d(dtype=dtype), + weights_2: wp.array2d(dtype=dtype), bias_2: wp.array2d(dtype=dtype), + weights_3: wp.array2d(dtype=dtype), bias_3: wp.array2d(dtype=dtype), + reference: wp.array2d(dtype=float), + loss: wp.array1d(dtype=float), + out: wp.array2d(dtype=float)): + + if indices: + # use batch indices if provided + linear = indices[wp.tid()] + else: + linear = wp.tid() + + row = linear/IMG_WIDTH + col = linear%IMG_WIDTH + + # normalize input coordinates to [-1, 1] + x = (float(row)/float(IMG_WIDTH) - 0.5)*2.0 + y = (float(col)/float(IMG_HEIGHT) - 0.5)*2.0 + + local = wp.vector(dtype=dtype, length=DIM_IN) + + # construct positional encoding + for s in range(NUM_FREQ): + + scale = wp.pow(2.0, float(s))*wp.pi + + # x-coord + local[s*4 + 0] = dtype(wp.sin(x * scale)) + local[s*4 + 1] = dtype(wp.cos(x * scale)) + + # y-coord + local[s*4 + 2] = dtype(wp.sin(y * scale)) + local[s*4 + 3] = dtype(wp.cos(y * scale)) + + + # tile feature vectors across the block, returns [dim(f), NUM_THREADS] + f = wp.tile(local) + + # input layer + w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN) + b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1) + z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, m=DIM_HID, n=NUM_THREADS)) + + # hidden layer + w1 = wp.tile_load(weights_1, 0, 0, m=DIM_HID, n=DIM_HID) + b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1) + z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS)) + + w2 = wp.tile_load(weights_2, 0, 0, m=DIM_HID, n=DIM_HID) + b2 = wp.tile_load(bias_2, 0, 0, m=DIM_HID, n=1) + z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_HID, n=NUM_THREADS)) + + # output layer + w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID) + b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1) + o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS)) + + # untile back to SIMT + output = wp.untile(o) + + # compute error + error = wp.vec3(float(output[0]) - reference[0,linear], + float(output[1]) - 
reference[1,linear], + float(output[2]) - reference[2,linear]) + + # write MSE loss + if loss: + wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*BATCH_SIZE)) + + # write image output + if out: + for i in range(DIM_OUT): + out[i, linear] = float(output[i]) + + +class Example: + + def __init__(self): + pass + + def train(self): + + weights_0, bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) + weights_1, bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) + weights_2, bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype) + weights_3, bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype) + + input = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) + output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) + + # reference + reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") + with Image.open(reference_path) as im: + reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0 + reference = wp.array(reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T, dtype=float) + + loss = wp.zeros(1, dtype=float, requires_grad=True) + + params = [weights_0, bias_0, + weights_1, bias_1, + weights_2, bias_2, + weights_3, bias_3] + + optimizer_grads = [p.grad.flatten() for p in params] + optimizer_inputs = [p.flatten() for p in params] + optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) + + num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) + max_iters = 20000 + max_epochs = int(max_iters/num_batches) + + # create randomized batch indices + indices = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) + rng.shuffle(indices) + indices = wp.array(indices) + + with wp.ScopedTimer("Training"): + + for i in range(max_epochs): + + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): + + loss.zero_() + + with wp.Tape() as tape: + wp.launch( + compute, + dim=[BATCH_SIZE], + inputs=[indices[b:b+BATCH_SIZE], + weights_0, bias_0, + weights_1, bias_1, + weights_2, bias_2, + weights_3, bias_3, + reference, + loss, + None], + block_dim=NUM_THREADS) + + tape.backward(loss) + + optimizer.step(optimizer_grads) + + tape.zero() + + print(f"Epoch: {i} Loss: {loss.numpy()}") + + + # evaluate full image + wp.launch( + compute, + dim=[IMG_WIDTH*IMG_HEIGHT], + inputs=[None, + weights_0, bias_0, + weights_1, bias_1, + weights_2, bias_2, + weights_3, bias_3, + reference, + loss, + output], + block_dim=NUM_THREADS) + + predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) + predicted_image = (predicted_image * 255).astype(np.uint8) + + predicted_image_pil = Image.fromarray(predicted_image) + predicted_image_pil.save("example_tile_mlp.jpg") + + + +if __name__ == "__main__": + + with wp.ScopedDevice("cuda:0"): + + example = Example() + example.train() From b155a70a7393cc66c1529db39c45fdf569c99b72 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 23 Oct 2024 02:18:49 +0000 Subject: [PATCH 080/102] Add Torch impl. 
to MLP example + CUDA graph support --- warp/examples/tile/example_tile_mlp.py | 240 +++++++++++++++++++------ 1 file changed, 185 insertions(+), 55 deletions(-) diff --git a/warp/examples/tile/example_tile_mlp.py b/warp/examples/tile/example_tile_mlp.py index 893b344c..abac75d4 100644 --- a/warp/examples/tile/example_tile_mlp.py +++ b/warp/examples/tile/example_tile_mlp.py @@ -66,6 +66,8 @@ def create_array(dim_in, dim_hid, dtype=float): BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) +wp.set_module_options({"fast_math": True}) + # dtype for our weights and bias matrices dtype = wp.float16 @@ -75,6 +77,7 @@ def relu(x: dtype): @wp.kernel def compute(indices: wp.array(dtype=int), + encoding: wp.array2d(dtype=dtype), weights_0: wp.array2d(dtype=dtype), bias_0: wp.array2d(dtype=dtype), weights_1: wp.array2d(dtype=dtype), bias_1: wp.array2d(dtype=dtype), weights_2: wp.array2d(dtype=dtype), bias_2: wp.array2d(dtype=dtype), @@ -106,11 +109,17 @@ def compute(indices: wp.array(dtype=int), # x-coord local[s*4 + 0] = dtype(wp.sin(x * scale)) local[s*4 + 1] = dtype(wp.cos(x * scale)) - # y-coord local[s*4 + 2] = dtype(wp.sin(y * scale)) local[s*4 + 3] = dtype(wp.cos(y * scale)) + # if requested then write the encoding back to device memory + if encoding: + encoding[s*4 + 0, linear] = local[s*4 + 0] + encoding[s*4 + 1, linear] = local[s*4 + 1] + encoding[s*4 + 2, linear] = local[s*4 + 2] + encoding[s*4 + 3, linear] = local[s*4 + 3] + # tile feature vectors across the block, returns [dim(f), NUM_THREADS] f = wp.tile(local) @@ -155,73 +164,77 @@ def compute(indices: wp.array(dtype=int), class Example: def __init__(self): - pass - - def train(self): - weights_0, bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) - weights_1, bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) - weights_2, bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype) - weights_3, bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype) - - input = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_IN, dtype=dtype) - output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) + self.weights_0, self.bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) + self.weights_1, self.bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) + self.weights_2, self.bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype) + self.weights_3, self.bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype) # reference reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") with Image.open(reference_path) as im: reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0 - reference = wp.array(reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T, dtype=float) + self.reference = wp.array(reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T, dtype=float) - loss = wp.zeros(1, dtype=float, requires_grad=True) + # create randomized batch indices + indices = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) + rng.shuffle(indices) + self.indices = wp.array(indices) + + self.num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) + self.max_iters = 20000 + self.max_epochs = int(self.max_iters/self.num_batches) + + def train_warp(self): - params = [weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - weights_3, bias_3] + params = [self.weights_0, self.bias_0, + self.weights_1, self.bias_1, + self.weights_2, self.bias_2, + self.weights_3, self.bias_3] optimizer_grads = [p.grad.flatten() for p in params] optimizer_inputs = [p.flatten() for p in params] optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) + + loss = wp.zeros(1, 
dtype=float, requires_grad=True) + output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) - num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) - max_iters = 20000 - max_epochs = int(max_iters/num_batches) - - # create randomized batch indices - indices = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) - rng.shuffle(indices) - indices = wp.array(indices) - - with wp.ScopedTimer("Training"): + # capture graph for whole epoch + wp.capture_begin() + + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): - for i in range(max_epochs): + loss.zero_() - for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): + with wp.Tape() as tape: + wp.launch( + compute, + dim=[BATCH_SIZE], + inputs=[self.indices[b:b+BATCH_SIZE], + None, + self.weights_0, self.bias_0, + self.weights_1, self.bias_1, + self.weights_2, self.bias_2, + self.weights_3, self.bias_3, + self.reference, + loss, + None], + block_dim=NUM_THREADS) - loss.zero_() + tape.backward(loss) + optimizer.step(optimizer_grads) + tape.zero() - with wp.Tape() as tape: - wp.launch( - compute, - dim=[BATCH_SIZE], - inputs=[indices[b:b+BATCH_SIZE], - weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - weights_3, bias_3, - reference, - loss, - None], - block_dim=NUM_THREADS) + graph = wp.capture_end() - tape.backward(loss) - optimizer.step(optimizer_grads) + with wp.ScopedTimer("Training"): - tape.zero() + for i in range(self.max_epochs): - print(f"Epoch: {i} Loss: {loss.numpy()}") + with wp.ScopedTimer("Epoch"): + wp.capture_launch(graph) + print(f"Epoch: {i} Loss: {loss.numpy()}") # evaluate full image @@ -229,16 +242,131 @@ def train(self): compute, dim=[IMG_WIDTH*IMG_HEIGHT], inputs=[None, - weights_0, bias_0, - weights_1, bias_1, - weights_2, bias_2, - weights_3, bias_3, - reference, + None, + self.weights_0, self.bias_0, + self.weights_1, self.bias_1, + self.weights_2, self.bias_2, + self.weights_3, self.bias_3, + self.reference, loss, output], - block_dim=NUM_THREADS) + block_dim=NUM_THREADS) + + + self.save_image(output.numpy()) + + + + def train_torch(self): + + import torch as tc + + weights_0 = tc.nn.Parameter(wp.to_torch(self.weights_0)) + weights_1 = tc.nn.Parameter(wp.to_torch(self.weights_1)) + weights_2 = tc.nn.Parameter(wp.to_torch(self.weights_2)) + weights_3 = tc.nn.Parameter(wp.to_torch(self.weights_3)) + + bias_0 = tc.nn.Parameter(wp.to_torch(self.bias_0)) + bias_1 = tc.nn.Parameter(wp.to_torch(self.bias_1)) + bias_2 = tc.nn.Parameter(wp.to_torch(self.bias_2)) + bias_3 = tc.nn.Parameter(wp.to_torch(self.bias_3)) + + indices = wp.to_torch(self.indices) + reference = wp.to_torch(self.reference) + + optimizer = tc.optim.Adam([weights_0, + bias_0, + weights_1, + bias_1, + weights_2, + bias_2, + weights_3, + bias_3], capturable=True, lr=0.0001, betas=(0.9, 0.95), eps=1.e-6) + + + # generate frequency space encoding of pixels + # based on their linear index in the image + def encode(linear): + + row = (linear // IMG_WIDTH).float() + col = (linear % IMG_WIDTH).float() + + x = (row / float(IMG_WIDTH) - 0.5) * 2.0 + y = (col / float(IMG_HEIGHT) - 0.5) * 2.0 + + encoding = tc.zeros((NUM_FREQ * 4, len(linear)), dtype=tc.float16, device="cuda") + + for s in range(NUM_FREQ): + scale = math.pow(2.0, float(s)) * math.pi + + # Directly write the computed values into the encoding tensor + encoding[s * 4 + 0, :] = tc.sin(scale * x) + encoding[s * 4 + 1, :] = tc.cos(scale * x) + encoding[s * 4 + 2, :] = tc.sin(scale * y) + encoding[s * 4 + 3, :] = tc.cos(scale * y) + + return encoding + + + stream = tc.cuda.Stream() + graph = 
tc.cuda.CUDAGraph() + + # warm-up + with tc.cuda.stream(stream): + f = tc.rand((NUM_FREQ*4, BATCH_SIZE), dtype=tc.float16, device="cuda") + z = tc.relu(weights_0 @ f + bias_0) + z = tc.relu(weights_1 @ z + bias_1) + z = tc.relu(weights_2 @ z + bias_2) + z = tc.relu(weights_3 @ z + bias_3) + ref = tc.rand((3, BATCH_SIZE), dtype=tc.float16, device="cuda") + loss = tc.mean((z - ref) ** 2) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + with tc.cuda.graph(graph): + + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): + + linear = indices[b:b+BATCH_SIZE] + + f = encode(linear) + + z = tc.relu(weights_0 @ f + bias_0) + z = tc.relu(weights_1 @ z + bias_1) + z = tc.relu(weights_2 @ z + bias_2) + z = tc.relu(weights_3 @ z + bias_3) + + ref = reference[:, linear] + loss = tc.mean((z - ref) ** 2) - predicted_image = output.numpy().T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + + with wp.ScopedTimer("Training (Torch)"): + + for i in range(self.max_epochs): + + with wp.ScopedTimer("Epoch"): + graph.replay() + + print(loss) + + + f = encode(tc.arange(0, IMG_WIDTH*IMG_HEIGHT)) + z = tc.relu(weights_0 @ f + bias_0) + z = tc.relu(weights_1 @ z + bias_1) + z = tc.relu(weights_2 @ z + bias_2) + z = tc.relu(weights_3 @ z + bias_3) + + self.save_image(z.detach().cpu().numpy()) + + + def save_image(self, output): + + predicted_image = output.T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) predicted_image = (predicted_image * 255).astype(np.uint8) predicted_image_pil = Image.fromarray(predicted_image) @@ -246,9 +374,11 @@ def train(self): + if __name__ == "__main__": with wp.ScopedDevice("cuda:0"): example = Example() - example.train() + #example.train_warp() + example.train_torch() From 89256ad6c086820372962be59ea71034673e8ce6 Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 23 Oct 2024 02:54:06 +0000 Subject: [PATCH 081/102] Add support for specifying max iterations (useful for profiling) --- warp/examples/tile/example_tile_mlp.py | 40 +++++++++++--------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/warp/examples/tile/example_tile_mlp.py b/warp/examples/tile/example_tile_mlp.py index abac75d4..18614bfe 100644 --- a/warp/examples/tile/example_tile_mlp.py +++ b/warp/examples/tile/example_tile_mlp.py @@ -77,7 +77,6 @@ def relu(x: dtype): @wp.kernel def compute(indices: wp.array(dtype=int), - encoding: wp.array2d(dtype=dtype), weights_0: wp.array2d(dtype=dtype), bias_0: wp.array2d(dtype=dtype), weights_1: wp.array2d(dtype=dtype), bias_1: wp.array2d(dtype=dtype), weights_2: wp.array2d(dtype=dtype), bias_2: wp.array2d(dtype=dtype), @@ -86,11 +85,8 @@ def compute(indices: wp.array(dtype=int), loss: wp.array1d(dtype=float), out: wp.array2d(dtype=float)): - if indices: - # use batch indices if provided - linear = indices[wp.tid()] - else: - linear = wp.tid() + # batch indices + linear = indices[wp.tid()] row = linear/IMG_WIDTH col = linear%IMG_WIDTH @@ -113,13 +109,6 @@ def compute(indices: wp.array(dtype=int), local[s*4 + 2] = dtype(wp.sin(y * scale)) local[s*4 + 3] = dtype(wp.cos(y * scale)) - # if requested then write the encoding back to device memory - if encoding: - encoding[s*4 + 0, linear] = local[s*4 + 0] - encoding[s*4 + 1, linear] = local[s*4 + 1] - encoding[s*4 + 2, linear] = local[s*4 + 2] - encoding[s*4 + 3, linear] = local[s*4 + 3] - # tile feature vectors across the block, returns [dim(f), NUM_THREADS] f = wp.tile(local) @@ -163,7 +152,7 @@ def compute(indices: wp.array(dtype=int), class Example: - def 
__init__(self): + def __init__(self, train_iters): self.weights_0, self.bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) self.weights_1, self.bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) @@ -182,8 +171,8 @@ def __init__(self): self.indices = wp.array(indices) self.num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) - self.max_iters = 20000 - self.max_epochs = int(self.max_iters/self.num_batches) + self.max_iters = train_iters + self.max_epochs = max(1, int(self.max_iters/self.num_batches)) def train_warp(self): @@ -202,7 +191,7 @@ def train_warp(self): # capture graph for whole epoch wp.capture_begin() - for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): + for b in range(0, min(self.max_iters, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE)): loss.zero_() @@ -211,7 +200,6 @@ def train_warp(self): compute, dim=[BATCH_SIZE], inputs=[self.indices[b:b+BATCH_SIZE], - None, self.weights_0, self.bias_0, self.weights_1, self.bias_1, self.weights_2, self.bias_2, @@ -241,8 +229,7 @@ def train_warp(self): wp.launch( compute, dim=[IMG_WIDTH*IMG_HEIGHT], - inputs=[None, - None, + inputs=[self.indices, self.weights_0, self.bias_0, self.weights_1, self.bias_1, self.weights_2, self.bias_2, @@ -377,8 +364,15 @@ def save_image(self, output): if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--train_iters", type=int, default=20000, help="Total number of training iterations.") + + args = parser.parse_known_args()[0] + with wp.ScopedDevice("cuda:0"): - example = Example() - #example.train_warp() - example.train_torch() + example = Example(args.train_iters) + example.train_warp() + #example.train_torch() From a19a67939a81b715fd9103cce565304bc3c7494b Mon Sep 17 00:00:00 2001 From: Miles Macklin Date: Wed, 23 Oct 2024 05:07:03 +0000 Subject: [PATCH 082/102] Fix for typo batch iteration --- warp/examples/tile/example_tile_mlp.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/warp/examples/tile/example_tile_mlp.py b/warp/examples/tile/example_tile_mlp.py index 18614bfe..ef0f49e4 100644 --- a/warp/examples/tile/example_tile_mlp.py +++ b/warp/examples/tile/example_tile_mlp.py @@ -66,8 +66,6 @@ def create_array(dim_in, dim_hid, dtype=float): BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) -wp.set_module_options({"fast_math": True}) - # dtype for our weights and bias matrices dtype = wp.float16 @@ -191,7 +189,7 @@ def train_warp(self): # capture graph for whole epoch wp.capture_begin() - for b in range(0, min(self.max_iters, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE)): + for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): loss.zero_() @@ -238,9 +236,8 @@ def train_warp(self): loss, output], block_dim=NUM_THREADS) - - self.save_image(output.numpy()) + self.save_image(f"example_tile_mlp.jpg", output.numpy()) @@ -348,16 +345,16 @@ def encode(linear): z = tc.relu(weights_2 @ z + bias_2) z = tc.relu(weights_3 @ z + bias_3) - self.save_image(z.detach().cpu().numpy()) + self.save_image("example_tile_mlp_torch.jpg", z.detach().cpu().numpy()) - def save_image(self, output): + def save_image(self, name, output): predicted_image = output.T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) predicted_image = (predicted_image * 255).astype(np.uint8) predicted_image_pil = Image.fromarray(predicted_image) - predicted_image_pil.save("example_tile_mlp.jpg") + predicted_image_pil.save(name) From 1521890a5794105205d7e024b6aa3c4e3a6a85c8 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Wed, 23 Oct 2024 17:10:56 
-0700 Subject: [PATCH 083/102] Use libmathdx with embedded headers --- .gitlab/ci/mathdx-support.yml | 8 +++--- warp/builtins.py | 16 +++++------- warp/mathdx.py | 47 ----------------------------------- warp/native/warp.cu | 27 +++++++------------- 4 files changed, 19 insertions(+), 79 deletions(-) diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index 4b85d124..b5ef9e4a 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -36,7 +36,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/69/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/88/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -59,7 +59,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/69/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/88/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -101,7 +101,7 @@ linux-x86_64 test: - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -U "jax[cuda12]" - - python -m pip install --upgrade nvidia-mathdx==24.8.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 + - python -m pip install --upgrade nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - python -m pip install -e . 
- echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents @@ -118,7 +118,7 @@ linux-aarch64 test jetson: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - !reference [.snippets, install-python+warp-aarch64] - python -m pip install -U "jax[cuda12]" - - python -m pip install --upgrade nvidia-mathdx==24.8.0 nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 + - python -m pip install --upgrade nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast diff --git a/warp/builtins.py b/warp/builtins.py index 87ad6815..0d2d51b2 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -10,7 +10,6 @@ from typing import Any, Callable, Mapping, Sequence from warp.codegen import Reference, Var, strip_reference -from warp.mathdx import get_cuda_include_dirs, get_mathdx_include_dirs from warp.types import * from .context import add_builtin @@ -5675,13 +5674,12 @@ def make_function(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout): # otherwise compile LTO lto_code = tempfile.NamedTemporaryFile() - include_dirs = get_cuda_include_dirs() result = warp.context.runtime.core.cuda_compile_dot( lto_code.name.encode("utf-8"), lto_symbol.encode("utf-8"), - len(include_dirs), - include_dirs, - get_mathdx_include_dirs(), + 0, + None, + None, arch, M, N, @@ -5877,14 +5875,12 @@ def tile_fft_generic_lto_dispatch_func( lto_code = tempfile.NamedTemporaryFile() shared_memory_size = ctypes.c_int(0) - include_dirs = get_cuda_include_dirs() - result = warp.context.runtime.core.cuda_compile_fft( lto_code.name.encode("utf-8"), lto_symbol.encode("utf-8"), - len(include_dirs), - include_dirs, - get_mathdx_include_dirs(), + 0, + None, + None, arch, size, ept, diff --git a/warp/mathdx.py b/warp/mathdx.py index dab9fbc8..e71faf06 100644 --- a/warp/mathdx.py +++ b/warp/mathdx.py @@ -14,8 +14,6 @@ from importlib.metadata import PackageNotFoundError, files CUDA_HOME = None -MATHDX_HOME = None -CUTLASS_HOME = None PLATFORM_LINUX = sys.platform.startswith("linux") @@ -96,51 +94,6 @@ def _check_cuda_home(): CUDA_HOME = (CUDA_HOME,) -def _check_mathdx_home(): - # Find mathDx headers - global MATHDX_HOME - - # Try wheel - try: - MATHDX_HOME = files("nvidia-mathdx") - except PackageNotFoundError: - pass - else: - # use cufftdx.hpp as a proxy - MATHDX_HOME = [f for f in MATHDX_HOME if "cufftdx.hpp" in str(f)][0] - MATHDX_HOME = os.path.join(os.path.dirname(MATHDX_HOME.locate()), "..") - return - - # Try conda - if "CONDA_PREFIX" in os.environ: - if PLATFORM_LINUX: - conda_include = os.path.join(os.environ["CONDA_PREFIX"], "include") - elif PLATFORM_WIN: - conda_include = os.path.join(os.environ["CONDA_PREFIX"], "Library", "include") - if os.path.isfile(os.path.join(conda_include, "cufftdx.hpp")): - MATHDX_HOME = os.path.join(conda_include, "..") - return - - # Try local - if "MATHDX_HOME" not in os.environ: - raise RuntimeError( - "mathDx headers not found. 
Depending on how you install nvmath-python and other CUDA packages, " - "you may need to perform one of the steps below:\n" - " - pip install nvidia-mathdx\n" - " - conda install -c conda-forge mathdx\n" - " - export MATHDX_HOME=/path/to/mathdx" - ) - else: - MATHDX_HOME = os.environ["MATHDX_HOME"] - - -def get_mathdx_include_dirs(): - _check_mathdx_home() - - global MATHDX_HOME - return (MATHDX_HOME + "/include").encode("utf-8") - - def get_cuda_include_dirs(): _check_cuda_home() diff --git a/warp/native/warp.cu b/warp/native/warp.cu index b043aeba..c930a913 100644 --- a/warp/native/warp.cu +++ b/warp/native/warp.cu @@ -17,7 +17,7 @@ #include #include #if WP_ENABLE_MATHDX - #include + #include #endif #include @@ -2881,9 +2881,11 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_ANY(ltoir_output_path != nullptr); CHECK_ANY(symbol_name != nullptr); - CHECK_ANY(mathdx_include_dir != nullptr); CHECK_ANY(shared_memory_size != nullptr); - CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); + // Includes currently unused + CHECK_ANY(include_dirs == nullptr); + CHECK_ANY(mathdx_include_dir == nullptr); + CHECK_ANY(num_include_dirs == 0); bool res = true; cufftdxHandle h; @@ -2900,12 +2902,6 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_CUFFTDX(cufftDxSetOperatorInt64(h, cufftDxOperatorType::CUFFTDX_OPERATOR_FFTS_PER_BLOCK, 1)); CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); - for(int dir = 0; dir < num_include_dirs; dir++) - { - CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); - } - CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); - CHECK_CUFFTDX(cufftDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/../external/cutlass/include").c_str())); size_t lto_size = 0; CHECK_CUFFTDX(cufftDxGetLTOIRSize(h, <o_size)); @@ -2931,8 +2927,10 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_ANY(ltoir_output_path != nullptr); CHECK_ANY(symbol_name != nullptr); - CHECK_ANY(mathdx_include_dir != nullptr); - CHECK_ANY(num_include_dirs == 0 || include_dirs != nullptr); + // Includes currently unused + CHECK_ANY(include_dirs == nullptr); + CHECK_ANY(mathdx_include_dir == nullptr); + CHECK_ANY(num_include_dirs == 0); bool res = true; cublasdxHandle h; @@ -2953,13 +2951,6 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_ CHECK_CUBLASDX(cublasDxSetOperatorInt64Array(h, cublasDxOperatorType::CUBLASDX_OPERATOR_ARRANGEMENT, arrangement.size(), arrangement.data())); CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_SYMBOL_NAME, symbol_name)); - for(int dir = 0; dir < num_include_dirs; dir++) - { - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, include_dirs[dir])); - } - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, mathdx_include_dir)); - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/cublasdx/include").c_str())); - CHECK_CUBLASDX(cublasDxSetOptionStr(h, commonDxOption::COMMONDX_OPTION_INCLUDE, (std::string(mathdx_include_dir) + "/../external/cutlass/include").c_str())); size_t lto_size = 0; CHECK_CUBLASDX(cublasDxGetLTOIRSize(h, <o_size)); From bac57c4f9af4c510b911bd6be019c5507aef34e1 Mon Sep 17 00:00:00 2001 
From: Leopold Cambier Date: Mon, 28 Oct 2024 15:12:03 -0700 Subject: [PATCH 084/102] Removing previous CCCL/CUDA runtime wheels + updating to new libmathdx_static --- .gitlab/ci/mathdx-support.yml | 7 +-- warp/build.py | 7 +-- warp/mathdx.py | 104 ---------------------------------- 3 files changed, 4 insertions(+), 114 deletions(-) delete mode 100644 warp/mathdx.py diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index b5ef9e4a..d13873e9 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -36,7 +36,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/88/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/92/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -59,7 +59,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/88/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/92/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -101,7 +101,6 @@ linux-x86_64 test: - python -m pip install --upgrade usd-core - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -U "jax[cuda12]" - - python -m pip install --upgrade nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - python -m pip install -e . 
- echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" # HACK: disable P2P tests due to misbehaving agents @@ -118,7 +117,6 @@ linux-aarch64 test jetson: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - !reference [.snippets, install-python+warp-aarch64] - python -m pip install -U "jax[cuda12]" - - python -m pip install --upgrade nvidia-cuda-cccl-cu12 nvidia-cuda-runtime-cu12 - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast @@ -141,7 +139,6 @@ create pypi wheels: - python3 -m pip install --upgrade pip - python3 -m pip install build script: - - sed -i 's/dependencies = \["numpy"\]/dependencies = \["numpy", "nvidia-mathdx==24.4.0", "nvidia-cuda-cccl-cu12", "nvidia-cuda-runtime-cu12"\]/' pyproject.toml - sed -i "s/^\(.*\)$/\1+tile/" VERSION.md # Modify VERSION.md with +tile - python3 -m build --wheel -C--build-option=-Plinux-x86_64 - python3 -m build --wheel -C--build-option=-Plinux-aarch64 diff --git a/warp/build.py b/warp/build.py index 8655201c..d5193ad0 100644 --- a/warp/build.py +++ b/warp/build.py @@ -9,7 +9,6 @@ import os import warp.config -from warp.mathdx import get_cuda_include_dirs from warp.thirdparty import appdirs @@ -25,8 +24,6 @@ def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fa warp.context.runtime.llvm.compile_cuda(src, cu_path, inc_path, output_path, False) else: - cuda_include_dirs = get_cuda_include_dirs() - if ltoirs is None: ltoirs = [] @@ -37,8 +34,8 @@ def build_cuda(cu_path, arch, output_path, config="release", verify_fp=False, fa src, arch, inc_path, - len(cuda_include_dirs), - cuda_include_dirs, + 0, + None, config == "debug", warp.config.verbose, verify_fp, diff --git a/warp/mathdx.py b/warp/mathdx.py deleted file mode 100644 index e71faf06..00000000 --- a/warp/mathdx.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. - -import ctypes -import os -import platform -import re -import sys -import warnings -from importlib.metadata import PackageNotFoundError, files - -CUDA_HOME = None - - -PLATFORM_LINUX = sys.platform.startswith("linux") -PLATFORM_WIN = sys.platform.startswith("win32") - - -def _conda_get_target_name(): - if PLATFORM_LINUX: - plat = platform.processor() - if plat == "aarch64": - return "sbsa-linux" - else: - return f"{plat}-linux" - elif PLATFORM_WIN: - return "x64" - else: - raise AssertionError - - -def _check_cuda_home(): - # We need some CUDA headers for compiling mathDx headers. - # We assume users properly managing their local envs (ex: no mix-n-match). 
- global CUDA_HOME - - # Try wheel - try: - # We need CUDA 12+ for device API support - cudart = files("nvidia-cuda-runtime-cu12") - cccl = files("nvidia-cuda-cccl-cu12") - # use cuda_fp16.h (which we need) as a proxy - cudart = [f for f in cudart if "cuda_fp16.h" in str(f)][0] - cudart = os.path.join(os.path.dirname(cudart.locate()), "..") - # use cuda/std/type_traits as a proxy - cccl = min([f for f in cccl if re.match(".*cuda\\/std\\/type_traits.*", str(f))], key=lambda x: len(str(x))) - cccl = os.path.join(os.path.dirname(cccl.locate()), "../../..") - except PackageNotFoundError: - pass - except ValueError: - # cccl wheel is buggy (headers missing), skip using wheels - pass - else: - CUDA_HOME = (cudart, cccl) - return - - # Try conda - if "CONDA_PREFIX" in os.environ: - if PLATFORM_LINUX: - conda_include = os.path.join( - os.environ["CONDA_PREFIX"], "targets", f"{_conda_get_target_name()}", "include" - ) - elif PLATFORM_WIN: - conda_include = os.path.join(os.environ["CONDA_PREFIX"], "Library", "include") - else: - assert AssertionError - if os.path.isfile(os.path.join(conda_include, "cuda_fp16.h")) and os.path.isfile( - os.path.join(conda_include, "cuda/std/type_traits") - ): - CUDA_HOME = (os.path.join(conda_include, ".."),) - return - - # Try local - CUDA_PATH = os.environ.get("CUDA_PATH", None) - CUDA_HOME = os.environ.get("CUDA_HOME", None) - if CUDA_PATH is None and CUDA_HOME is None: - raise RuntimeError( - "cudart headers not found. Depending on how you install nvmath-python and other CUDA packages,\n" - "you may need to perform one of the steps below:\n" - " - conda install -c conda-forge cuda-cudart-dev cuda-cccl cuda-version=12\n" - " - export CUDA_HOME=/path/to/CUDA/Toolkit" - ) - elif CUDA_PATH is not None and CUDA_HOME is None: - CUDA_HOME = CUDA_PATH - elif CUDA_PATH is not None and CUDA_HOME is not None: - if CUDA_HOME != CUDA_PATH: - warnings.warn( - "Both CUDA_HOME and CUDA_PATH are set but not consistent. 
" "Ignoring CUDA_PATH...", stacklevel=2 - ) - CUDA_HOME = (CUDA_HOME,) - - -def get_cuda_include_dirs(): - _check_cuda_home() - - global CUDA_HOME - include_dirs = [(f"{h}" + "/include").encode("utf-8") for h in CUDA_HOME] - arr_include_dirs = (ctypes.c_char_p * len(include_dirs))() - arr_include_dirs[:] = include_dirs - return arr_include_dirs From 6195e930666cff0135de21db3aa1312710531910 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Mon, 28 Oct 2024 16:35:00 -0700 Subject: [PATCH 085/102] Bump libmathdx build --- .gitlab/ci/mathdx-support.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml index d13873e9..bc711297 100644 --- a/.gitlab/ci/mathdx-support.yml +++ b/.gitlab/ci/mathdx-support.yml @@ -36,7 +36,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/92/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/93/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -59,7 +59,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/92/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/93/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps From 5fa11a75540834c5a12749a49d399c37d21e907f Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 29 Oct 2024 10:29:20 -0700 Subject: [PATCH 086/102] Fix Ruff errors --- warp/examples/tile/example_tile_mlp.py | 230 ++++++++++++------------- 1 file changed, 115 insertions(+), 115 deletions(-) diff --git a/warp/examples/tile/example_tile_mlp.py b/warp/examples/tile/example_tile_mlp.py index ef0f49e4..b5e4f82e 100644 --- a/warp/examples/tile/example_tile_mlp.py +++ b/warp/examples/tile/example_tile_mlp.py @@ -21,20 +21,20 @@ # ########################################################################### -import numpy as np -import warp as wp -import warp.examples -import warp.optim - import math import os +import numpy as np from PIL import Image +import warp as wp +import warp.examples +import warp.optim + rng = np.random.default_rng(45) -def create_layer(dim_in, dim_hid, dtype=float): +def create_layer(dim_in, dim_hid, dtype=float): w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1)) @@ -43,8 +43,8 @@ def create_layer(dim_in, dim_hid, dtype=float): return (weights, bias) -def create_array(dim_in, dim_hid, dtype=float): +def create_array(dim_in, dim_hid, dtype=float): s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in)) a = wp.array(s, dtype=dtype, requires_grad=True) @@ -54,63 +54,68 @@ def create_array(dim_in, dim_hid, dtype=float): # number of frequencies for the positional encoding NUM_FREQ = wp.constant(8) -DIM_IN = 
wp.constant(4*NUM_FREQ) # sin,cos for both x,y at each frequenecy +DIM_IN = wp.constant(4 * NUM_FREQ) # sin,cos for both x,y at each frequenecy DIM_HID = 32 DIM_OUT = 3 # threads per-block NUM_THREADS = 32 -IMG_WIDTH = NUM_THREADS*16 -IMG_HEIGHT = NUM_THREADS*16 +IMG_WIDTH = NUM_THREADS * 16 +IMG_HEIGHT = NUM_THREADS * 16 -BATCH_SIZE = min(1024, int((IMG_WIDTH*IMG_HEIGHT)/8)) +BATCH_SIZE = min(1024, int((IMG_WIDTH * IMG_HEIGHT) / 8)) # dtype for our weights and bias matrices dtype = wp.float16 + @wp.func def relu(x: dtype): return wp.max(x, dtype(0.0)) -@wp.kernel -def compute(indices: wp.array(dtype=int), - weights_0: wp.array2d(dtype=dtype), bias_0: wp.array2d(dtype=dtype), - weights_1: wp.array2d(dtype=dtype), bias_1: wp.array2d(dtype=dtype), - weights_2: wp.array2d(dtype=dtype), bias_2: wp.array2d(dtype=dtype), - weights_3: wp.array2d(dtype=dtype), bias_3: wp.array2d(dtype=dtype), - reference: wp.array2d(dtype=float), - loss: wp.array1d(dtype=float), - out: wp.array2d(dtype=float)): +@wp.kernel +def compute( + indices: wp.array(dtype=int), + weights_0: wp.array2d(dtype=dtype), + bias_0: wp.array2d(dtype=dtype), + weights_1: wp.array2d(dtype=dtype), + bias_1: wp.array2d(dtype=dtype), + weights_2: wp.array2d(dtype=dtype), + bias_2: wp.array2d(dtype=dtype), + weights_3: wp.array2d(dtype=dtype), + bias_3: wp.array2d(dtype=dtype), + reference: wp.array2d(dtype=float), + loss: wp.array1d(dtype=float), + out: wp.array2d(dtype=float), +): # batch indices linear = indices[wp.tid()] - row = linear/IMG_WIDTH - col = linear%IMG_WIDTH + row = linear / IMG_WIDTH + col = linear % IMG_WIDTH # normalize input coordinates to [-1, 1] - x = (float(row)/float(IMG_WIDTH) - 0.5)*2.0 - y = (float(col)/float(IMG_HEIGHT) - 0.5)*2.0 + x = (float(row) / float(IMG_WIDTH) - 0.5) * 2.0 + y = (float(col) / float(IMG_HEIGHT) - 0.5) * 2.0 local = wp.vector(dtype=dtype, length=DIM_IN) # construct positional encoding for s in range(NUM_FREQ): - - scale = wp.pow(2.0, float(s))*wp.pi + scale = wp.pow(2.0, float(s)) * wp.pi # x-coord - local[s*4 + 0] = dtype(wp.sin(x * scale)) - local[s*4 + 1] = dtype(wp.cos(x * scale)) + local[s * 4 + 0] = dtype(wp.sin(x * scale)) + local[s * 4 + 1] = dtype(wp.cos(x * scale)) # y-coord - local[s*4 + 2] = dtype(wp.sin(y * scale)) - local[s*4 + 3] = dtype(wp.cos(y * scale)) - + local[s * 4 + 2] = dtype(wp.sin(y * scale)) + local[s * 4 + 3] = dtype(wp.cos(y * scale)) # tile feature vectors across the block, returns [dim(f), NUM_THREADS] f = wp.tile(local) - + # input layer w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN) b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1) @@ -134,78 +139,89 @@ def compute(indices: wp.array(dtype=int), output = wp.untile(o) # compute error - error = wp.vec3(float(output[0]) - reference[0,linear], - float(output[1]) - reference[1,linear], - float(output[2]) - reference[2,linear]) + error = wp.vec3( + float(output[0]) - reference[0, linear], + float(output[1]) - reference[1, linear], + float(output[2]) - reference[2, linear], + ) # write MSE loss if loss: - wp.atomic_add(loss, 0, wp.length_sq(error)/float(3*BATCH_SIZE)) + wp.atomic_add(loss, 0, wp.length_sq(error) / float(3 * BATCH_SIZE)) # write image output if out: for i in range(DIM_OUT): out[i, linear] = float(output[i]) - -class Example: +class Example: def __init__(self, train_iters): - self.weights_0, self.bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype) self.weights_1, self.bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype) self.weights_2, self.bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype) 
self.weights_3, self.bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype) - # reference + # reference reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg") with Image.open(reference_path) as im: - reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0 - self.reference = wp.array(reference_image.reshape(IMG_WIDTH*IMG_HEIGHT, 3).T, dtype=float) + reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0 + self.reference = wp.array(reference_image.reshape(IMG_WIDTH * IMG_HEIGHT, 3).T, dtype=float) # create randomized batch indices - indices = np.arange(0, IMG_WIDTH*IMG_HEIGHT, dtype=np.int32) + indices = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32) rng.shuffle(indices) self.indices = wp.array(indices) - self.num_batches = int((IMG_WIDTH*IMG_HEIGHT)/BATCH_SIZE) + self.num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE) self.max_iters = train_iters - self.max_epochs = max(1, int(self.max_iters/self.num_batches)) + self.max_epochs = max(1, int(self.max_iters / self.num_batches)) def train_warp(self): - - params = [self.weights_0, self.bias_0, - self.weights_1, self.bias_1, - self.weights_2, self.bias_2, - self.weights_3, self.bias_3] + params = [ + self.weights_0, + self.bias_0, + self.weights_1, + self.bias_1, + self.weights_2, + self.bias_2, + self.weights_3, + self.bias_3, + ] optimizer_grads = [p.grad.flatten() for p in params] optimizer_inputs = [p.flatten() for p in params] optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01) - + loss = wp.zeros(1, dtype=float, requires_grad=True) - output = create_array(IMG_WIDTH*IMG_HEIGHT, DIM_OUT) + output = create_array(IMG_WIDTH * IMG_HEIGHT, DIM_OUT) # capture graph for whole epoch wp.capture_begin() - - for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): + for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE): loss.zero_() with wp.Tape() as tape: wp.launch( - compute, + compute, dim=[BATCH_SIZE], - inputs=[self.indices[b:b+BATCH_SIZE], - self.weights_0, self.bias_0, - self.weights_1, self.bias_1, - self.weights_2, self.bias_2, - self.weights_3, self.bias_3, - self.reference, - loss, - None], - block_dim=NUM_THREADS) + inputs=[ + self.indices[b : b + BATCH_SIZE], + self.weights_0, + self.bias_0, + self.weights_1, + self.bias_1, + self.weights_2, + self.bias_2, + self.weights_3, + self.bias_3, + self.reference, + loss, + None, + ], + block_dim=NUM_THREADS, + ) tape.backward(loss) optimizer.step(optimizer_grads) @@ -213,36 +229,36 @@ def train_warp(self): graph = wp.capture_end() - with wp.ScopedTimer("Training"): - for i in range(self.max_epochs): - with wp.ScopedTimer("Epoch"): wp.capture_launch(graph) print(f"Epoch: {i} Loss: {loss.numpy()}") - # evaluate full image wp.launch( - compute, - dim=[IMG_WIDTH*IMG_HEIGHT], - inputs=[self.indices, - self.weights_0, self.bias_0, - self.weights_1, self.bias_1, - self.weights_2, self.bias_2, - self.weights_3, self.bias_3, - self.reference, - loss, - output], - block_dim=NUM_THREADS) - - self.save_image(f"example_tile_mlp.jpg", output.numpy()) - - + compute, + dim=[IMG_WIDTH * IMG_HEIGHT], + inputs=[ + self.indices, + self.weights_0, + self.bias_0, + self.weights_1, + self.bias_1, + self.weights_2, + self.bias_2, + self.weights_3, + self.bias_3, + self.reference, + loss, + output, + ], + block_dim=NUM_THREADS, + ) + + self.save_image("example_tile_mlp.jpg", output.numpy()) def train_torch(self): - import torch as tc weights_0 = tc.nn.Parameter(wp.to_torch(self.weights_0)) @@ -258,20 +274,17 @@ def 
train_torch(self): indices = wp.to_torch(self.indices) reference = wp.to_torch(self.reference) - optimizer = tc.optim.Adam([weights_0, - bias_0, - weights_1, - bias_1, - weights_2, - bias_2, - weights_3, - bias_3], capturable=True, lr=0.0001, betas=(0.9, 0.95), eps=1.e-6) - + optimizer = tc.optim.Adam( + [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3], + capturable=True, + lr=0.0001, + betas=(0.9, 0.95), + eps=1.0e-6, + ) # generate frequency space encoding of pixels # based on their linear index in the image def encode(linear): - row = (linear // IMG_WIDTH).float() col = (linear % IMG_WIDTH).float() @@ -287,17 +300,16 @@ def encode(linear): encoding[s * 4 + 0, :] = tc.sin(scale * x) encoding[s * 4 + 1, :] = tc.cos(scale * x) encoding[s * 4 + 2, :] = tc.sin(scale * y) - encoding[s * 4 + 3, :] = tc.cos(scale * y) + encoding[s * 4 + 3, :] = tc.cos(scale * y) return encoding - stream = tc.cuda.Stream() graph = tc.cuda.CUDAGraph() # warm-up with tc.cuda.stream(stream): - f = tc.rand((NUM_FREQ*4, BATCH_SIZE), dtype=tc.float16, device="cuda") + f = tc.rand((NUM_FREQ * 4, BATCH_SIZE), dtype=tc.float16, device="cuda") z = tc.relu(weights_0 @ f + bias_0) z = tc.relu(weights_1 @ z + bias_1) z = tc.relu(weights_2 @ z + bias_2) @@ -309,10 +321,8 @@ def encode(linear): optimizer.step() with tc.cuda.graph(graph): - - for b in range(0, IMG_WIDTH*IMG_HEIGHT, BATCH_SIZE): - - linear = indices[b:b+BATCH_SIZE] + for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE): + linear = indices[b : b + BATCH_SIZE] f = encode(linear) @@ -323,23 +333,19 @@ def encode(linear): ref = reference[:, linear] loss = tc.mean((z - ref) ** 2) - + optimizer.zero_grad() loss.backward() optimizer.step() - with wp.ScopedTimer("Training (Torch)"): - - for i in range(self.max_epochs): - + for _i in range(self.max_epochs): with wp.ScopedTimer("Epoch"): graph.replay() print(loss) - - f = encode(tc.arange(0, IMG_WIDTH*IMG_HEIGHT)) + f = encode(tc.arange(0, IMG_WIDTH * IMG_HEIGHT)) z = tc.relu(weights_0 @ f + bias_0) z = tc.relu(weights_1 @ z + bias_1) z = tc.relu(weights_2 @ z + bias_2) @@ -347,9 +353,7 @@ def encode(linear): self.save_image("example_tile_mlp_torch.jpg", z.detach().cpu().numpy()) - def save_image(self, name, output): - predicted_image = output.T.reshape(IMG_WIDTH, IMG_HEIGHT, 3) predicted_image = (predicted_image * 255).astype(np.uint8) @@ -357,10 +361,7 @@ def save_image(self, name, output): predicted_image_pil.save(name) - - if __name__ == "__main__": - import argparse parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -369,7 +370,6 @@ def save_image(self, name, output): args = parser.parse_known_args()[0] with wp.ScopedDevice("cuda:0"): - example = Example(args.train_iters) example.train_warp() - #example.train_torch() + # example.train_torch() From a14bac183b863e2ff1518158176c6bb88c84ed45 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 29 Oct 2024 11:41:40 -0700 Subject: [PATCH 087/102] Merge with main --- .gitlab-ci.yml | 24 ++- .gitlab/ci/additional-tests.yml | 6 +- .gitlab/ci/cuda-11-build-and-test.yml | 10 +- .gitlab/ci/debug-build-and-test.yml | 6 +- CHANGELOG.md | 144 +++++++++++------- README.md | 6 +- VERSION.md | 2 +- docs/changelog.md | 8 + docs/codegen.rst | 148 +++++++++++++++++- docs/conf.py | 6 + docs/index.rst | 1 + docs/installation.rst | 6 +- docs/modules/differentiability.rst | 8 +- docs/modules/functions.rst | 168 +++++++++++---------- docs/modules/sim.rst | 2 + docs/requirements.txt | 1 + 
exts/omni.warp.core/config/extension.toml | 2 +- exts/omni.warp.core/docs/CHANGELOG.md | 125 +++++++++------- exts/omni.warp/config/extension.toml | 4 +- exts/omni.warp/docs/CHANGELOG.md | 125 +++++++++------- warp/__init__.py | 3 + warp/builtins.py | 140 +++++++++-------- warp/codegen.py | 43 +++--- warp/config.py | 2 +- warp/context.py | 52 +++---- warp/examples/fem/utils.py | 3 +- warp/examples/optim/example_walker.py | 4 +- warp/fem/utils.py | 13 +- warp/native/array.h | 80 +++++----- warp/native/builtin.h | 78 +++++++--- warp/native/bvh.cu | 4 +- warp/native/bvh.h | 4 + warp/native/exports.h | 17 +++ warp/native/hashgrid.h | 4 + warp/native/mesh.cu | 4 +- warp/native/mesh.h | 4 + warp/native/range.h | 17 ++- warp/sim/integrator_xpbd.py | 8 +- warp/sim/model.py | 5 +- warp/sparse.py | 16 +- warp/stubs.py | 174 +++++++++++----------- warp/tests/test_array.py | 82 ++++++++++ warp/tests/test_codegen.py | 70 +++++++++ warp/tests/test_fabricarray.py | 33 ++++ warp/tests/test_fem.py | 18 ++- warp/tests/test_func.py | 36 ++++- warp/tests/test_generics.py | 52 +++++++ warp/tests/test_iter.py | 68 +++++++++ warp/tests/test_model.py | 13 ++ warp/tests/test_print.py | 135 +++++++++++++++++ warp/tests/test_static.py | 158 +++++++++++++++++++- warp/tests/unittest_suites.py | 4 + warp/types.py | 37 +++-- 53 files changed, 1603 insertions(+), 580 deletions(-) create mode 100644 docs/changelog.md create mode 100644 warp/tests/test_iter.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 554b9273..566a12bc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,7 +9,11 @@ # CI/CD Pipeline Configuration # ============================================================================== -include: /.gitlab/ci/common.yml +include: + - local: /.gitlab/ci/common.yml + - project: "omniverse/devplat/gitlab/templates/common/compliance" + file: "modules/omniverse-repo-compliance.gitlab-ci.yml" + ref: v1_latest workflow: rules: @@ -66,7 +70,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image + - ./tools/ci/building/build-linux-aarch64/build.sh --no-docker # We are already using the builder image - mkdir -p warp/bin/linux-aarch64 - mv warp/bin/warp.so warp/bin/linux-aarch64 - mv warp/bin/warp-clang.so warp/bin/linux-aarch64 @@ -139,6 +143,12 @@ ruff format: script: - ruff format --diff +osec:sonarqube: + variables: + # Disable C/C++ analyzer until project specific work is done to enable it. 
+ # See: https://confluence.nvidia.com/display/OMNIVERSE/SonarQube+Gitlab+CI+Integration#C+Project+Enablement+Additions + SONAR_EXTRA_ARGS: "-Dsonar.c.file.suffixes=- -Dsonar.cpp.file.suffixes=- -Dsonar.objc.file.suffixes=-" + # ============================================================================== # Main Unit Testing Jobs # @@ -340,11 +350,7 @@ windows-x86_64 test mgpu: - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv' - .\_venv\Scripts\Activate.ps1 - - python -m pip install --upgrade pip - - python -m pip install --upgrade usd-core - # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible - # https://github.com/pytorch/pytorch/issues/128860 - - python -m pip install "numpy<2" + - python -m pip install --upgrade pip usd-core numpy - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -e . - Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K" @@ -377,6 +383,7 @@ linux-x86_64 test warp-init: - build_llvm.py - when: manual # If not auto-triggered, allow any pipeline to run this job manually allow_failure: true + timeout: 10m before_script: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - df -h @@ -728,6 +735,7 @@ merge request docs: - public rules: - if: $CI_PIPELINE_SOURCE == 'merge_request_event' + timeout: 10m environment: name: review/$CI_MERGE_REQUEST_IID url: https://$CI_PROJECT_ROOT_NAMESPACE.$CI_PAGES_DOMAIN/-/$CI_PROJECT_NAME/-/jobs/$CI_JOB_ID/artifacts/public/index.html @@ -752,6 +760,7 @@ check generated files: - docs/modules/functions.rst rules: - if: $CI_PIPELINE_SOURCE == 'merge_request_event' + timeout: 10m extends: - .runner-utility-linux-x86_64 script: @@ -769,6 +778,7 @@ pages: - .build-docs-common rules: - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + timeout: 10m environment: name: GitLab Pages deployment_tier: staging diff --git a/.gitlab/ci/additional-tests.yml b/.gitlab/ci/additional-tests.yml index aba4a45d..7a59be88 100644 --- a/.gitlab/ci/additional-tests.yml +++ b/.gitlab/ci/additional-tests.yml @@ -67,11 +67,7 @@ windows-x86_64 test: - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv' - .\_venv\Scripts\Activate.ps1 - - python -m pip install --upgrade pip - - python -m pip install --upgrade usd-core - # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible - # https://github.com/pytorch/pytorch/issues/128860 - - python -m pip install "numpy<2" + - python -m pip install --upgrade pip usd-core numpy - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -e . 
- Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K" diff --git a/.gitlab/ci/cuda-11-build-and-test.yml b/.gitlab/ci/cuda-11-build-and-test.yml index 7282d9e8..3f5cd25d 100644 --- a/.gitlab/ci/cuda-11-build-and-test.yml +++ b/.gitlab/ci/cuda-11-build-and-test.yml @@ -25,6 +25,7 @@ include: - "templates/v3/windows/codesign.gitlab-ci.yml" - "templates/v3/linux/packman_s3.gitlab-ci.yml" - "templates/v3/windows/packman_s3.gitlab-ci.yml" + - "templates/v3/linux/nucleus/kit-extensions.ov.nvidia.com/kit-extension-svc.gitlab-ci.yml" ref: v1_latest @@ -52,7 +53,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - - ./tools/ci/building/build-linux-x86_64/build.sh --cuda 11 --no-docker # We are already using the builder image + - ./tools/ci/building/build-linux-aarch64/build.sh --cuda 11 --no-docker # We are already using the builder image - mkdir -p warp/bin/linux-aarch64 - mv warp/bin/warp.so warp/bin/linux-aarch64 - mv warp/bin/warp-clang.so warp/bin/linux-aarch64 @@ -144,11 +145,7 @@ windows-x86_64 test: - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv' - .\_venv\Scripts\Activate.ps1 - - python -m pip install --upgrade pip - - python -m pip install --upgrade usd-core - # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible - # https://github.com/pytorch/pytorch/issues/128860 - - python -m pip install "numpy<2" + - python -m pip install --upgrade pip usd-core numpy - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -e . - Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K" @@ -314,4 +311,5 @@ publish extensions to packman: script: - !reference [.osec:vault:v3:linux, codesign:perform_vault_requests] - !reference [.osec:vault:v3:linux, packman_s3:perform_vault_requests] + - !reference [.osec:vault:v3:linux, nucleus:kit-extensions.ov.nvidia.com:kit-extension-svc:perform_vault_requests] - tools/repo.sh publish_exts --publish-all diff --git a/.gitlab/ci/debug-build-and-test.yml b/.gitlab/ci/debug-build-and-test.yml index e041739a..d028af2e 100644 --- a/.gitlab/ci/debug-build-and-test.yml +++ b/.gitlab/ci/debug-build-and-test.yml @@ -136,11 +136,7 @@ windows-x86_64 test: - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv' - .\_venv\Scripts\Activate.ps1 - - python -m pip install --upgrade pip - - python -m pip install --upgrade usd-core - # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible - # https://github.com/pytorch/pytorch/issues/128860 - - python -m pip install "numpy<2" + - python -m pip install --upgrade pip usd-core numpy - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - python -m pip install -e . 
- Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K" diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ac1e54d..fae03b44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,35 @@ -# CHANGELOG +# Changelog + +## [1.4.0] - 2024-10-01 + +### Added + +- Expose a `reversed()` built-in for iterators to test ([GH-311](https://github.com/NVIDIA/warp/issues/311)). + +### Changed + +- Promote the `wp.Int`, `wp.Float`, and `wp.Scalar` generic annotation types to the public API. +- Make the output of `wp.print()` in backward kernels consistent for all supported data types. + +### Fixed + +- Fix to relax the integer types expected when indexing arrays (regression in 1.3.0). +- Fix printing vector and matrix adjoints in backward kernels. +- Fix kernel compile error when printing structs. +- Fix an incorrect user function being sometimes resolved when multiple overloads are available with array parameters with different `dtype` values. +- Fix error being raised when static and dynamic for-loops are written in sequence with the same iteration variable names ([GH-331](https://github.com/NVIDIA/warp/issues/331)). + +## [1.4.1] - 2024-10-15 + +### Fixed + +- Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)). +- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes. +- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`. +- Fix invalid code generation error messages when nesting dynamic and static for-loops. +- Fix caching of kernels with static expressions. +- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation. +- Re-introduced the `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` onto the Python scope to revert a breaking change. ## [1.4.0] - 2024-10-01 @@ -204,7 +235,7 @@ - Fix for handling of `bool` types in generic kernels - Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details -## [1.1.1] - 2024-05-24 +## 1.1.1 - 2024-05-24 - `wp.init()` is no longer required to be called explicitly and will be performed on first call to the API - Speed up `omni.warp.core`'s startup time @@ -239,7 +270,7 @@ - Support gradient propagation for indexing sliced multi-dimensional arrays, i.e. `a[i][j]` vs. 
`a[i, j]` - Provide an informative message if setting DLL C-types failed, instructing to try rebuilding the library -## [1.0.3] - 2024-04-17 +## 1.0.3 - 2024-04-17 - Add a `support_level` entry to the configuration file of the extensions @@ -317,7 +348,7 @@ - Added `wp.ones()` to efficiently create one-initialized arrays - Rename `wp.config.graph_capture_module_load_default` to `wp.config.enable_graph_capture_module_load_by_default` -## [0.14.0] - 2024-02-19 +## 0.14.0 - 2024-02-19 - Add support for CUDA pooled (stream-ordered) allocators - Support memory allocation during graph capture @@ -354,7 +385,7 @@ - Fixed a small CPU memory leak related to DLPack interop - Improved performance of creating arrays -## [0.13.1] - 2024-02-22 +## 0.13.1 - 2024-02-22 - Ensure that the results from the `Noise Deform` are deterministic across different Kit sessions @@ -367,7 +398,7 @@ - Add missing `.py` extension to `warp/tests/walkthrough_debug` - Allow `wp.bool` usage in vector and matrix types -## [0.12.0] - 2024-02-05 +## 0.12.0 - 2024-02-05 - Add a warning when the `enable_backward` setting is set to `False` upon calling `wp.Tape.backward()` - Fix kernels not being recompiled as expected when defined using a closure @@ -383,7 +414,7 @@ - Point releases (if any) go on the same minor release branch and only contain bug fixes, not new features. - The `public` branch, previously used to merge releases into and corresponding with the GitHub `main` branch, is retired. -## [1.0.0-beta.7] - 2024-01-23 +## 1.0.0-beta.7 - 2024-01-23 - Ensure captures are always enclosed in `try`/`finally` - Only include .py files from the warp subdirectory into wheel packages @@ -445,7 +476,7 @@ - Documentation update for `wp.BVH` - Documentation and simplified API for runtime kernel specialization `wp.Kernel` -## [1.0.0-beta.4] - 2023-11-01 +## 1.0.0-beta.4 - 2023-11-01 - Add `wp.cbrt()` for cube root calculation - Add `wp.mesh_furthest_point_no_sign()` to compute furthest point on a surface from a query point @@ -457,7 +488,7 @@ - Fix for `wp.utils.array_sum()` output initialization when used with vector types - Coverage and documentation updates -## [1.0.0-beta.3] - 2023-10-19 +## 1.0.0-beta.3 - 2023-10-19 - Add support for code coverage scans (test_coverage.py), coverage at 85% in `omni.warp.core` - Add support for named component access for vector types, e.g.: `a = v.x` @@ -479,13 +510,13 @@ - To support grid-stride kernels, `wp.tid()` can no longer be called inside `wp.func` functions. 
-## [1.0.0-beta.2] - 2023-09-01 +## 1.0.0-beta.2 - 2023-09-01 - Fix for passing bool into `wp.func` functions - Fix for deprecation warnings appearing on `stderr`, now redirected to `stdout` - Fix for using `for i in wp.hash_grid_query(..)` syntax -## [1.0.0-beta.1] - 2023-08-29 +## 1.0.0-beta.1 - 2023-08-29 - Fix for `wp.float16` being passed as kernel arguments - Fix for compile errors with kernels using structs in backward pass @@ -524,7 +555,7 @@ - Update margin used by for mesh queries when using `wp.sim.create_soft_body_contacts()` - Improvements to gradient handling with `wp.from_torch()`, `wp.to_torch()` plus documentation -## [0.10.0] - 2023-07-05 +## 0.10.0 - 2023-07-05 - Add support for macOS universal binaries (x86 + aarch64) for M1+ support - Add additional methods for SDF generation please see the following new methods: @@ -600,7 +631,7 @@ - Deprecate `wp.Model.soft_contact_distance` which is now replaced by `wp.Model.particle_radius` - Deprecate single scalar particle radius (should be a per-particle array) -## [0.8.2] - 2023-04-21 +## 0.8.2 - 2023-04-21 - Add `ModelBuilder.soft_contact_max` to control the maximum number of soft contacts that can be registered. Use `Model.allocate_soft_contacts(new_count)` to change count on existing `Model` objects. - Add support for `bool` parameters @@ -611,12 +642,12 @@ - Add sign determination using winding number of `wp.mesh_query_point()` as `wp.mesh_query_sign_winding_number()` - Add query point without sign determination `wp.mesh_query_no_sign()` -## [0.8.1] - 2023-04-13 +## 0.8.1 - 2023-04-13 - Fix for regression when passing flattened numeric lists as matrix arguments to kernels - Fix for regressions when passing `wp.struct` types with uninitialized (`None`) member attributes -## [0.8.0] - 2023-04-05 +## 0.8.0 - 2023-04-05 - Add `Texture Write` node for updating dynamic RTX textures from Warp kernels / nodes - Add multi-dimensional kernel support to Warp Kernel Node @@ -660,14 +691,14 @@ - `wp.sim.model.ground_plane` is now a `wp.array` to support gradient, users should call `builder.set_ground_plane()` to create the ground - `wp.sim` capsule, cones, and cylinders are now aligned with the default USD up-axis -## [0.7.2] - 2023-02-15 +## 0.7.2 - 2023-02-15 - Reduce test time for vec/math types - Clean-up CUDA disabled build pipeline - Remove extension.gen.toml to make Kit packages Python version independent - Handle additional cases for array indexing inside Python -## [0.7.1] - 2023-02-14 +## 0.7.1 - 2023-02-14 - Disabling some slow tests for Kit - Make unit tests run on first GPU only by default @@ -684,13 +715,13 @@ - Add security pop-up for Kernel Node - Improve error handling for kernel return values -## [0.6.3] - 2023-01-31 +## 0.6.3 - 2023-01-31 - Add DLPack utilities, see `wp.from_dlpack()`, `wp.to_dlpack()` - Add Jax utilities, see `wp.from_jax()`, `wp.to_jax()`, `wp.device_from_jax()`, `wp.device_to_jax()` - Fix for Linux Kit extensions OM-80132, OM-80133 -## [0.6.2] - 2023-01-19 +## 0.6.2 - 2023-01-19 - Updated `wp.from_torch()` to support more data types - Updated `wp.from_torch()` to automatically determine the target Warp data type if not specified @@ -705,14 +736,14 @@ - Replace Python `imp` package with `importlib` - Fix for quaternion slerp gradients (`wp.quat_slerp()`) -## [0.6.1] - 2022-12-05 +## 0.6.1 - 2022-12-05 - Fix for non-CUDA builds - Fix strides computation in array_t constructor, fixes a bug with accessing mesh indices through mesh.indices[] - Disable backward pass code generation for kernel node (4-6x 
faster compilation) - Switch to linbuild for universal Linux binaries (affects TeamCity builds only) -## [0.6.0] - 2022-11-28 +## 0.6.0 - 2022-11-28 - Add support for CUDA streams, see `wp.Stream`, `wp.get_stream()`, `wp.set_stream()`, `wp.synchronize_stream()`, `wp.ScopedStream` - Add support for CUDA events, see `wp.Event`, `wp.record_event()`, `wp.wait_event()`, `wp.wait_stream()`, `wp.Stream.record_event()`, `wp.Stream.wait_event()`, `wp.Stream.wait_stream()` @@ -737,7 +768,7 @@ - Fix various deployment issues by statically linking with all CUDA libs - Update warp.so/warp.dll to CUDA Toolkit 11.5 -## [0.5.1] - 2022-11-01 +## 0.5.1 - 2022-11-01 - Fix for unit tests in Kit @@ -774,14 +805,14 @@ - Fix for arrays > 2GB in length - Add support for per-vertex USD mesh colors with `wp.render` class -## [0.4.2] - 2022-09-07 +## 0.4.2 - 2022-09-07 - Register Warp samples to the sample browser in Kit - Add NDEBUG flag to release mode kernel builds - Fix for particle solver node when using a large number of particles - Fix for broken cameras in Warp sample scenes -## [0.4.1] - 2022-08-30 +## 0.4.1 - 2022-08-30 - Add geometry sampling methods, see `wp.sample_unit_cube()`, `wp.sample_unit_disk()`, etc - Add `wp.lower_bound()` for searching sorted arrays @@ -791,7 +822,7 @@ - Fix for debug flags not being set correctly on CUDA when `wp.config.mode == "debug"`, this enables bounds checking on CUDA kernels in debug mode - Fix for code gen of functions that do not return a value -## [0.4.0] - 2022-08-09 +## 0.4.0 - 2022-08-09 - Fix for FP16 conversions on GPUs without hardware support - Fix for `runtime = None` errors when reloading the Warp module @@ -808,7 +839,7 @@ - Removed `wp.runtime` reference from the top-level module, as it should be considered private -## [0.3.2] - 2022-07-19 +## 0.3.2 - 2022-07-19 - Remove Torch import from `__init__.py`, defer import to `wp.from_torch()`, `wp.to_torch()` @@ -830,7 +861,7 @@ - `wp.synchronize()` now synchronizes all devices; for finer-grained control, use `wp.synchronize_device()` - Device alias `"cuda"` now refers to the current CUDA context, rather than a specific device like `"cuda:0"` or `"cuda:1"` -## [0.3.0] - 2022-07-08 +## 0.3.0 - 2022-07-08 - Add support for FP16 storage type, see `wp.float16` - Add support for per-dimension byte strides, see `wp.array.strides` @@ -867,7 +898,7 @@ - Tape `capture` option has been removed, users can now capture tapes inside existing CUDA graphs (e.g.: inside Torch) - Scalar loss arrays should now explicitly set `requires_grad=True` at creation time -## [0.2.2] - 2022-05-30 +## 0.2.2 - 2022-05-30 - Fix for `from import *` inside Warp initialization - Fix for body space velocity when using deforming Mesh objects with scale @@ -891,7 +922,7 @@ - Local `@wp.func` functions should not be namespaced when called, e.g.: previously `wp.myfunc()` would work even if `myfunc()` was not a builtin - Removed `wp.rpy2quat()`, please use `wp.quat_rpy()` instead -## [0.2.1] - 2022-05-11 +## 0.2.1 - 2022-05-11 - Fix for unit tests in Kit @@ -940,7 +971,7 @@ - `wp.array.length` member has been removed, please use `wp.array.shape` to access array dimensions, or use `wp.array.size` to get total element count - Marking `dense_gemm()`, `dense_chol()`, etc methods as experimental until we revisit them -## [0.1.25] - 2022-03-20 +## 0.1.25 - 2022-03-20 - Add support for class methods to be Warp kernels - Add HashGrid reserve() so it can be used with CUDA graphs @@ -950,7 +981,7 @@ - Add support for floored division on integer types - Move 
tests into core library so they can be run in Kit environment -## [0.1.24] - 2022-03-03 +## 0.1.24 - 2022-03-03 ### Warp Core @@ -966,7 +997,7 @@ - Fix for ranged for loops with negative step sizes - Fix for 3d and 4d spherical gradient distributions -## [0.1.23] - 2022-02-17 +## 0.1.23 - 2022-02-17 ### Warp Core @@ -976,7 +1007,7 @@ - Add procedural noise primitives, see `wp.noise()`, `wp.pnoise()`, `wp.curlnoise()` - Move simulation helpers our of test into `wp.sim` module -## [0.1.22] - 2022-02-14 +## 0.1.22 - 2022-02-14 ### Warp Core @@ -990,7 +1021,7 @@ - Add support for universal and compound joint types -## [0.1.21] - 2022-01-19 +## 0.1.21 - 2022-01-19 ### Warp Core @@ -1010,19 +1041,19 @@ - New OgnParticleVolume node for sampling shapes -> particles - New OgnParticleSolver node for DEM style granular materials -## [0.1.20] - 2021-11-02 +## 0.1.20 - 2021-11-02 - Updates to the ripple solver for GTC (support for multiple colliders, buoyancy, etc) -## [0.1.19] - 2021-10-15 +## 0.1.19 - 2021-10-15 - Publish from 2021.3 to avoid omni.graph database incompatibilities -## [0.1.18] - 2021-10-08 +## 0.1.18 - 2021-10-08 - Enable Linux support (tested on 20.04) -## [0.1.17] - 2021-09-30 +## 0.1.17 - 2021-09-30 - Fix for 3x3 SVD adjoint - Fix for A6000 GPU (bump compute model to sm_52 minimum) @@ -1031,12 +1062,12 @@ - Rename spatial_transform -> transform - Documentation update -## [0.1.16] - 2021-09-06 +## 0.1.16 - 2021-09-06 - Fix for case where simple assignments (a = b) incorrectly generated reference rather than value copy - Handle passing zero-length (empty) arrays to kernels -## [0.1.15] - 2021-09-03 +## 0.1.15 - 2021-09-03 - Add additional math library functions (asin, etc) - Add builtin 3x3 SVD support @@ -1049,62 +1080,62 @@ - Removes the need to transfer array to CPU before numpy conversion (will be done implicitly) - Update the example OgnRipple wave equation solver to use bundles -## [0.1.14] - 2021-08-09 +## 0.1.14 - 2021-08-09 - Fix for out-of-bounds memory access in CUDA BVH - Better error checking after kernel launches (use `wp.config.verify_cuda=True`) - Fix for vec3 normalize adjoint code -## [0.1.13] - 2021-07-29 +## 0.1.13 - 2021-07-29 - Remove OgnShrinkWrap.py test node -## [0.1.12] - 2021-07-29 +## 0.1.12 - 2021-07-29 - Switch to Woop et al.'s watertight ray-tri intersection test - Disable --fast-math in CUDA compilation step for improved precision -## [0.1.11] - 2021-07-28 +## 0.1.11 - 2021-07-28 - Fix for `wp.mesh_query_ray()` returning incorrect t-value -## [0.1.10] - 2021-07-28 +## 0.1.10 - 2021-07-28 - Fix for OV extension fwatcher filters to avoid hot-reload loop due to OGN regeneration -## [0.1.9] - 2021-07-21 +## 0.1.9 - 2021-07-21 - Fix for loading sibling DLL paths - Better type checking for built-in function arguments - Added runtime docs, can now list all builtins using `wp.print_builtins()` -## [0.1.8] - 2021-07-14 +## 0.1.8 - 2021-07-14 - Fix for hot-reload of CUDA kernels - Add Tape object for replaying differentiable kernels - Add helpers for Torch interop (convert `torch.Tensor` to `wp.Array`) -## [0.1.7] - 2021-07-05 +## 0.1.7 - 2021-07-05 - Switch to NVRTC for CUDA runtime - Allow running without host compiler - Disable asserts in kernel release mode (small perf. 
improvement) -## [0.1.6] - 2021-06-14 +## 0.1.6 - 2021-06-14 - Look for CUDA toolchain in target-deps -## [0.1.5] - 2021-06-14 +## 0.1.5 - 2021-06-14 - Rename OgLang -> Warp - Improve CUDA environment error checking - Clean-up some logging, add verbose mode (`wp.config.verbose`) -## [0.1.4] - 2021-06-10 +## 0.1.4 - 2021-06-10 - Add support for mesh raycast -## [0.1.3] - 2021-06-09 +## 0.1.3 - 2021-06-09 - Add support for unary negation operator - Add support for mutating variables during dynamic loops (non-differentiable) @@ -1112,7 +1143,7 @@ - Improve kernel cache start up times (avoids adjointing before cache check) - Update README.md with requirements / examples -## [0.1.2] - 2021-06-03 +## 0.1.2 - 2021-06-03 - Add support for querying mesh velocities - Add CUDA graph support, see `wp.capture_begin()`, `wp.capture_end()`, `wp.capture_launch()` @@ -1122,15 +1153,16 @@ - Fix for Linux/macOS support -## [0.1.1] - 2021-05-18 +## 0.1.1 - 2021-05-18 - Fix bug with conflicting CUDA contexts -## [0.1.0] - 2021-05-17 +## 0.1.0 - 2021-05-17 - Initial publish for alpha testing -[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.4.0...HEAD +[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.4.1...HEAD +[1.4.1]: https://github.com/NVIDIA/warp/releases/tag/v1.4.1 [1.4.0]: https://github.com/NVIDIA/warp/releases/tag/v1.4.0 [1.3.3]: https://github.com/NVIDIA/warp/releases/tag/v1.3.3 [1.3.2]: https://github.com/NVIDIA/warp/releases/tag/v1.3.2 diff --git a/README.md b/README.md index 54c1bbfd..ac8a11dc 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,9 @@ the `pip install` command, e.g. | Platform | Install Command | | --------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| Linux aarch64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_aarch64.whl` | -| Linux x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_x86_64.whl` | -| Windows x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-win_amd64.whl` | +| Linux aarch64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_aarch64.whl` | +| Linux x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_x86_64.whl` | +| Windows x86-64 | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-win_amd64.whl` | The `--force-reinstall` option may need to be used to overwrite a previous installation. diff --git a/VERSION.md b/VERSION.md index 88c5fb89..347f5833 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1 +1 @@ -1.4.0 +1.4.1 diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 00000000..4e68f707 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,8 @@ +--- +tocdepth: 2 +--- + + + +```{include} ../CHANGELOG.md +``` diff --git a/docs/codegen.rst b/docs/codegen.rst index fe5ed81b..b4984781 100644 --- a/docs/codegen.rst +++ b/docs/codegen.rst @@ -446,6 +446,153 @@ The above program uses a static expression to select the right function given th [2. 0.] 
+Advanced Example: Branching Elimination with Static Loop Unrolling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In computational simulations, it's common to apply different operations or boundary conditions based on runtime variables. However, conditional branching using runtime variables often leads to performance issues due to register pressure, as the GPU may allocate resources for all branches even if some of them are never taken. To tackle this, we can utilize static loop unrolling via ``wp.static(...)``, which helps eliminate unnecessary branching at compile-time and improve parallel execution. + +**Scenario:** + +Suppose we have three different functions ``apply_func_a``, ``apply_func_b``, and ``apply_func_c`` that perform different mathematical operations. + +We are currently interested in applying only two of these functions (``apply_func_a`` and ``apply_func_b``) on a given dataset. Which function we apply to each data point is determined by a runtime variable ``func_id``, which is provided as an array to the kernel called ``func_field``. + +In practice, ``func_field`` represents a mapping of which operation should be applied to each data point, and is particularly useful when dealing with boundary conditions or different regions of a physical simulation. For example, in a fluid simulation, different regions of the fluid might require different updates based on pre-defined boundary conditions. + +**Naive Approach Implementation** + +To start, let us first consider a naive approach to implement this, which involves straightforward runtime branching based on the value of func_id. This approach will highlight why we need to optimize further. + +.. code:: python + + import warp as wp + import numpy as np + + # Define three functions that perform different operations + @wp.func + def apply_func_a(x: float) -> float: + return x + 10.0 + + @wp.func + def apply_func_b(x: float) -> float: + return x * 2.0 + + @wp.func + def apply_func_c(x: float) -> float: + return x - 5.0 + + # Assign static IDs to represent each function + func_id_a = 0 + func_id_b = 1 + func_id_c = 2 # Not used in this kernel + + # Kernel that applies the correct function to each element of the input array + @wp.kernel + def apply_func_conditions_naive(x: wp.array(dtype=wp.float32), func_field: wp.array(dtype=wp.int8)): + tid = wp.tid() + value = x[tid] + result = value + func_id = func_field[tid] # Get the function ID for this element + + # Apply the corresponding function based on func_id + if func_id == func_id_a: + result = apply_func_a(value) + elif func_id == func_id_b: + result = apply_func_b(value) + elif func_id == func_id_c: + result = apply_func_c(value) + + x[tid] = result + + # Example usage + data = wp.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=wp.float32) + + # Create an array that specifies which function to apply to each element + func_field = wp.array([func_id_a, func_id_b, func_id_b, func_id_a, func_id_b], dtype=wp.int8) + + # Launch the kernel + wp.launch(apply_func_conditions_naive, inputs=[data, func_field], dim=data.size) + + print(data.numpy()) + +**Output:** + +.. code:: python + + [11. 4. 6. 14. 10.] + +Since ``func_id`` is not static, the compiler cannot eliminate the unused function at compile time. Looking at the generated CUDA code, we can see the kernel includes an extra branching for the unused ``apply_func_c``: + +.. code:: cpp + + //... 
+ var_11 = wp::select(var_9, var_4, var_10); + if (!var_9) { + var_13 = (var_7 == var_12); + if (var_13) { + var_14 = apply_func_b_0(var_3); + } + var_15 = wp::select(var_13, var_11, var_14); + if (!var_13) { + var_17 = (var_7 == var_16); + if (var_17) { + var_18 = apply_func_c_0(var_3); + } + var_19 = wp::select(var_17, var_15, var_18); + } + var_20 = wp::select(var_13, var_19, var_15); + } + //... + +**Optimization** + +To avoid the extra branching, we can use the static loop unrolling via ``wp.static(...)`` to effectively "compile out" the unnecessary branches and only keep the operations that are relevant. + +**Implementation:** + +.. code:: python + + funcs = [apply_func_a, apply_func_b, apply_func_c] + + # Assign static IDs to represent each function + func_id_a = 0 + func_id_b = 1 + func_id_c = 2 # Not used in this kernel + + # Define which function IDs are actually used in this kernel + used_func_ids = (func_id_a, func_id_b) + + @wp.kernel + def apply_func_conditions(x: wp.array(dtype=wp.float32), func_field: wp.array(dtype=wp.int8)): + tid = wp.tid() + value = x[tid] + result = value + func_id = func_field[tid] # Get the function ID for this element + + # Unroll the loop over the used function IDs + for i in range(wp.static(len(used_func_ids))): + func_static_id = wp.static(used_func_ids[i]) + if func_id == func_static_id: + result = wp.static(funcs[i])(value) + + x[tid] = result + + +In the generated CUDA code, we can see that the optimized code does not branch for the unused function. + +.. code:: cpp + + //... + var_10 = (var_7 == var_9); + if (var_10) { + var_11 = apply_func_a_1(var_3); + } + var_12 = wp::select(var_10, var_4, var_11); + var_15 = (var_7 == var_14); + if (var_15) { + var_16 = apply_func_b_1(var_3); + } + //... + .. _dynamic_generation: Dynamic Kernel Creation @@ -566,7 +713,6 @@ Output: [ 1. 4. 9. 16. 25.] [ 1. 8. 27. 64. 125.] - Function Closures ~~~~~~~~~~~~~~~~~ diff --git a/docs/conf.py b/docs/conf.py index 400d0c77..f77e02ff 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,6 +34,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + "myst_parser", # Parse markdown files "sphinx.ext.autodoc", "sphinx.ext.napoleon", # Convert docstrings to reStructuredText "sphinx.ext.intersphinx", @@ -74,6 +75,11 @@ "github": ("https://github.com/NVIDIA/warp/blob/main/%s", "%s"), } +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + def linkcode_resolve(domain, info): """Tries to generate external links to code hosted on the Warp GitHub diff --git a/docs/index.rst b/docs/index.rst index 4338cb9f..135d0871 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -357,6 +357,7 @@ Full Table of Contents limitations modules/contribution_guide faq + changelog .. toctree:: :maxdepth: 2 diff --git a/docs/installation.rst b/docs/installation.rst index b432a326..3e2e6354 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -25,11 +25,11 @@ the ``pip install`` command, e.g. 
* - Platform - Install Command * - Linux aarch64 - - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_aarch64.whl`` + - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_aarch64.whl`` * - Linux x86-64 - - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_x86_64.whl`` + - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_x86_64.whl`` * - Windows x86-64 - - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-win_amd64.whl`` + - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-win_amd64.whl`` The ``--force-reinstall`` option may need to be used to overwrite a previous installation. diff --git a/docs/modules/differentiability.rst b/docs/modules/differentiability.rst index 3f1b8243..d3db5c53 100644 --- a/docs/modules/differentiability.rst +++ b/docs/modules/differentiability.rst @@ -778,9 +778,11 @@ In the example above we can see that the array ``c`` does not have its ``require Array Overwrite Tracking ^^^^^^^^^^^^^^^^^^^^^^^^^ -It is a common mistake to inadvertently overwrite an array that participates in the computation graph. For example:: +It is a common mistake to inadvertently overwrite an array that participates in the computation graph. For example: - with tape as wp.Tape(): +.. code-block:: python + + with wp.Tape() as tape: # step 1 wp.launch(compute_forces, dim=n, inputs=[pos0, vel0], outputs=[force]) @@ -791,7 +793,7 @@ It is a common mistake to inadvertently overwrite an array that participates in wp.launch(simulate, dim=n, inputs=[pos1, vel1, force], outputs=[pos2, vel2]) # compute loss - wp.launch(loss, dim=n, inputs=[pos2]) + wp.launch(compute_loss, dim=n, inputs=[pos2], outputs=[loss]) tape.backward(loss) diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index ca1bea38..7982952d 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -712,7 +712,8 @@ Transformations Apply the transform to a point ``point`` treating the homogeneous coordinate as w=1. The transformation is applied treating ``point`` as a column vector, e.g.: ``y = mat*point``. - Note this is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = point^T*mat^T``. + + This is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = point^T*mat^T``. If the transform is coming from a library that uses row-vectors, then users should transpose the transformation matrix before calling this method. @@ -728,8 +729,9 @@ Transformations Apply the transform to a vector ``vec`` treating the homogeneous coordinate as w=0. - The transformation is applied treating ``vec`` as a column vector, e.g.: ``y = mat*vec`` - note this is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = vec^T*mat^T``. + The transformation is applied treating ``vec`` as a column vector, e.g.: ``y = mat*vec``. + + This is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = vec^T*mat^T``. If the transform is coming from a library that uses row-vectors, then users should transpose the transformation matrix before calling this method. 
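A minimal sketch of the transpose advice above: the transform is assumed to arrive as a hypothetical row-major 4x4 NumPy matrix ``m_np`` (for example copied out of a USD ``Gf.Matrix4d``), and it is transposed once to match Warp's column-vector convention before ``wp.transform_point()`` is applied inside the kernel. The kernel name, matrix, and array sizes are illustrative only, not part of this patch.

.. code-block:: python

    import numpy as np
    import warp as wp

    @wp.kernel
    def transform_points(m_row_major: wp.mat44,
                         points: wp.array(dtype=wp.vec3),
                         out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        # The matrix follows a row-vector convention (e.g. USD), so transpose it
        # once to match Warp's column-vector convention before transforming.
        m = wp.transpose(m_row_major)
        out[tid] = wp.transform_point(m, points[tid])

    # m_np stands in for a row-major matrix exported from another library.
    m_np = np.eye(4, dtype=np.float32)
    pts = wp.array(np.random.rand(8, 3).astype(np.float32), dtype=wp.vec3)
    out = wp.empty_like(pts)
    wp.launch(transform_points, dim=pts.shape[0],
              inputs=[wp.mat44(*m_np.flatten()), pts, out])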
@@ -1291,6 +1293,11 @@ Utility All matrices are assumed to be stored in flattened row-major memory layout (NumPy default). +.. py:function:: reversed(range: range_t) -> range_t + + Returns the range in reversed order. + + .. py:function:: printf(fmt: str, *args: Any) -> None Allows printing formatted strings using C-style format specifiers. @@ -1417,380 +1424,380 @@ Utility Select between two arguments, if ``arr`` is null then return ``value_if_false``, otherwise return ``value_if_true`` -.. py:function:: atomic_add(arr: Array[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: Array[Any], i: Int, value: Any) -> Any Atomically add ``value`` onto ``arr[i]`` and return the old value. -.. py:function:: atomic_add(arr: Array[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: Array[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j]`` and return the old value. -.. py:function:: atomic_add(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value. -.. py:function:: atomic_add(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value. -.. py:function:: atomic_add(arr: FabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: FabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i]`` and return the old value. -.. py:function:: atomic_add(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j]`` and return the old value. -.. py:function:: atomic_add(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value. -.. py:function:: atomic_add(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value. -.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i]`` and return the old value. -.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j]`` and return the old value. -.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. 
py:function:: atomic_add(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value. -.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_add(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value. -.. py:function:: atomic_sub(arr: Array[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: Array[Any], i: Int, value: Any) -> Any Atomically subtract ``value`` onto ``arr[i]`` and return the old value. -.. py:function:: atomic_sub(arr: Array[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: Array[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value. -.. py:function:: atomic_sub(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value. -.. py:function:: atomic_sub(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value. -.. py:function:: atomic_sub(arr: FabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: FabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i]`` and return the old value. -.. py:function:: atomic_sub(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value. -.. py:function:: atomic_sub(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value. -.. py:function:: atomic_sub(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value. -.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i]`` and return the old value. -.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value. -.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. 
py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value. -.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_sub(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value. -.. py:function:: atomic_min(arr: Array[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: Array[Any], i: Int, value: Any) -> Any Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: Array[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: FabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: FabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. 
py:function:: atomic_min(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_min(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: Array[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: Array[Any], i: Int, value: Any) -> Any Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, value: Any) -> Any +.. 
py:function:: atomic_max(arr: Array[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: FabricArray[Any], i: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: FabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any +.. 
py:function:: atomic_max(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. -.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any +.. py:function:: atomic_max(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any :noindex: :nocontentsentry: Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. .. py:function:: lerp(a: Float, b: Float, t: Float) -> Float @@ -2693,13 +2700,12 @@ Code Generation Evaluates a static Python expression and replaces it with its result. - See the `codegen.html#static-expressions

`_ for more details. + See the :ref:`code generation guide ` for more details. - Note: - The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, - which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. - The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions - (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions + (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). .. rubric:: Footnotes diff --git a/docs/modules/sim.rst b/docs/modules/sim.rst index 973401ad..eebd37ec 100644 --- a/docs/modules/sim.rst +++ b/docs/modules/sim.rst @@ -1,3 +1,5 @@ +:tocdepth: 3 + warp.sim ======== diff --git a/docs/requirements.txt b/docs/requirements.txt index b8b6bd59..c8626adc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,3 +3,4 @@ sphinx==8.0.2 sphinx_copybutton==0.5.2 numpy==2.1.1 ruff==0.6.8 +myst-parser==4.0.0 diff --git a/exts/omni.warp.core/config/extension.toml b/exts/omni.warp.core/config/extension.toml index 841caf50..04df653c 100644 --- a/exts/omni.warp.core/config/extension.toml +++ b/exts/omni.warp.core/config/extension.toml @@ -1,6 +1,6 @@ [package] # Semantic Versioning is used: https://semver.org/ -version = "1.4.0" +version = "1.4.1" authors = ["NVIDIA"] title = "Warp Core" description="The core Warp Python module" diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md index 82fb2e73..94d42d6e 100644 --- a/exts/omni.warp.core/docs/CHANGELOG.md +++ b/exts/omni.warp.core/docs/CHANGELOG.md @@ -1,5 +1,17 @@ # CHANGELOG +## [1.4.1] - 2024-10-15 + +### Fixed + +- Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)). +- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes. +- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`. +- Fix invalid code generation error messages when nesting dynamic and static for-loops. +- Fix caching of kernels with static expressions. +- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation. +- Re-introduced the `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` onto the Python scope to revert a breaking change. + ## [1.4.0] - 2024-10-01 ### Added @@ -72,15 +84,14 @@ - Bug fixes - Fix an aliasing issue with zero-copy array initialization from NumPy introduced in Warp 1.3.0. - - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values. + - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values ([GH-312](https://github.com/NVIDIA/warp/pull/312)). 
## [1.3.2] - 2024-08-30 - Bug fixes - Fix accuracy of 3x3 SVD ``wp.svd3`` with fp64 numbers ([GH-281](https://github.com/NVIDIA/warp/issues/281)). - Fix module hashing when a kernel argument contained a struct array ([GH-287](https://github.com/NVIDIA/warp/issues/287)). - - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used - ([GH-288](https://github.com/NVIDIA/warp/issues/288)). + - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used ([GH-288](https://github.com/NVIDIA/warp/issues/288)). - Fix errors when launching a CUDA graph after a module is reloaded. Modules that were used during graph capture will no longer be unloaded before the graph is released. - Fix a bug in `wp.sim.collide.triangle_closest_point_barycentric()` where the returned barycentric coordinates may be @@ -205,7 +216,7 @@ - Fix for handling of `bool` types in generic kernels - Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details -## [1.1.1] - 2024-05-24 +## 1.1.1 - 2024-05-24 - `wp.init()` is no longer required to be called explicitly and will be performed on first call to the API - Speed up `omni.warp.core`'s startup time @@ -240,7 +251,7 @@ - Support gradient propagation for indexing sliced multi-dimensional arrays, i.e. `a[i][j]` vs. `a[i, j]` - Provide an informative message if setting DLL C-types failed, instructing to try rebuilding the library -## [1.0.3] - 2024-04-17 +## 1.0.3 - 2024-04-17 - Add a `support_level` entry to the configuration file of the extensions @@ -318,7 +329,7 @@ - Added `wp.ones()` to efficiently create one-initialized arrays - Rename `wp.config.graph_capture_module_load_default` to `wp.config.enable_graph_capture_module_load_by_default` -## [0.14.0] - 2024-02-19 +## 0.14.0 - 2024-02-19 - Add support for CUDA pooled (stream-ordered) allocators - Support memory allocation during graph capture @@ -355,7 +366,7 @@ - Fixed a small CPU memory leak related to DLPack interop - Improved performance of creating arrays -## [0.13.1] - 2024-02-22 +## 0.13.1 - 2024-02-22 - Ensure that the results from the `Noise Deform` are deterministic across different Kit sessions @@ -368,7 +379,7 @@ - Add missing `.py` extension to `warp/tests/walkthrough_debug` - Allow `wp.bool` usage in vector and matrix types -## [0.12.0] - 2024-02-05 +## 0.12.0 - 2024-02-05 - Add a warning when the `enable_backward` setting is set to `False` upon calling `wp.Tape.backward()` - Fix kernels not being recompiled as expected when defined using a closure @@ -384,7 +395,7 @@ - Point releases (if any) go on the same minor release branch and only contain bug fixes, not new features. - The `public` branch, previously used to merge releases into and corresponding with the GitHub `main` branch, is retired. 
-## [1.0.0-beta.7] - 2024-01-23 +## 1.0.0-beta.7 - 2024-01-23 - Ensure captures are always enclosed in `try`/`finally` - Only include .py files from the warp subdirectory into wheel packages @@ -446,7 +457,7 @@ - Documentation update for `wp.BVH` - Documentation and simplified API for runtime kernel specialization `wp.Kernel` -## [1.0.0-beta.4] - 2023-11-01 +## 1.0.0-beta.4 - 2023-11-01 - Add `wp.cbrt()` for cube root calculation - Add `wp.mesh_furthest_point_no_sign()` to compute furthest point on a surface from a query point @@ -458,7 +469,7 @@ - Fix for `wp.utils.array_sum()` output initialization when used with vector types - Coverage and documentation updates -## [1.0.0-beta.3] - 2023-10-19 +## 1.0.0-beta.3 - 2023-10-19 - Add support for code coverage scans (test_coverage.py), coverage at 85% in `omni.warp.core` - Add support for named component access for vector types, e.g.: `a = v.x` @@ -480,13 +491,13 @@ - To support grid-stride kernels, `wp.tid()` can no longer be called inside `wp.func` functions. -## [1.0.0-beta.2] - 2023-09-01 +## 1.0.0-beta.2 - 2023-09-01 - Fix for passing bool into `wp.func` functions - Fix for deprecation warnings appearing on `stderr`, now redirected to `stdout` - Fix for using `for i in wp.hash_grid_query(..)` syntax -## [1.0.0-beta.1] - 2023-08-29 +## 1.0.0-beta.1 - 2023-08-29 - Fix for `wp.float16` being passed as kernel arguments - Fix for compile errors with kernels using structs in backward pass @@ -525,7 +536,7 @@ - Update margin used by for mesh queries when using `wp.sim.create_soft_body_contacts()` - Improvements to gradient handling with `wp.from_torch()`, `wp.to_torch()` plus documentation -## [0.10.0] - 2023-07-05 +## 0.10.0 - 2023-07-05 - Add support for macOS universal binaries (x86 + aarch64) for M1+ support - Add additional methods for SDF generation please see the following new methods: @@ -601,7 +612,7 @@ - Deprecate `wp.Model.soft_contact_distance` which is now replaced by `wp.Model.particle_radius` - Deprecate single scalar particle radius (should be a per-particle array) -## [0.8.2] - 2023-04-21 +## 0.8.2 - 2023-04-21 - Add `ModelBuilder.soft_contact_max` to control the maximum number of soft contacts that can be registered. Use `Model.allocate_soft_contacts(new_count)` to change count on existing `Model` objects. 
- Add support for `bool` parameters @@ -612,12 +623,12 @@ - Add sign determination using winding number of `wp.mesh_query_point()` as `wp.mesh_query_sign_winding_number()` - Add query point without sign determination `wp.mesh_query_no_sign()` -## [0.8.1] - 2023-04-13 +## 0.8.1 - 2023-04-13 - Fix for regression when passing flattened numeric lists as matrix arguments to kernels - Fix for regressions when passing `wp.struct` types with uninitialized (`None`) member attributes -## [0.8.0] - 2023-04-05 +## 0.8.0 - 2023-04-05 - Add `Texture Write` node for updating dynamic RTX textures from Warp kernels / nodes - Add multi-dimensional kernel support to Warp Kernel Node @@ -661,14 +672,14 @@ - `wp.sim.model.ground_plane` is now a `wp.array` to support gradient, users should call `builder.set_ground_plane()` to create the ground - `wp.sim` capsule, cones, and cylinders are now aligned with the default USD up-axis -## [0.7.2] - 2023-02-15 +## 0.7.2 - 2023-02-15 - Reduce test time for vec/math types - Clean-up CUDA disabled build pipeline - Remove extension.gen.toml to make Kit packages Python version independent - Handle additional cases for array indexing inside Python -## [0.7.1] - 2023-02-14 +## 0.7.1 - 2023-02-14 - Disabling some slow tests for Kit - Make unit tests run on first GPU only by default @@ -685,13 +696,13 @@ - Add security pop-up for Kernel Node - Improve error handling for kernel return values -## [0.6.3] - 2023-01-31 +## 0.6.3 - 2023-01-31 - Add DLPack utilities, see `wp.from_dlpack()`, `wp.to_dlpack()` - Add Jax utilities, see `wp.from_jax()`, `wp.to_jax()`, `wp.device_from_jax()`, `wp.device_to_jax()` - Fix for Linux Kit extensions OM-80132, OM-80133 -## [0.6.2] - 2023-01-19 +## 0.6.2 - 2023-01-19 - Updated `wp.from_torch()` to support more data types - Updated `wp.from_torch()` to automatically determine the target Warp data type if not specified @@ -706,14 +717,14 @@ - Replace Python `imp` package with `importlib` - Fix for quaternion slerp gradients (`wp.quat_slerp()`) -## [0.6.1] - 2022-12-05 +## 0.6.1 - 2022-12-05 - Fix for non-CUDA builds - Fix strides computation in array_t constructor, fixes a bug with accessing mesh indices through mesh.indices[] - Disable backward pass code generation for kernel node (4-6x faster compilation) - Switch to linbuild for universal Linux binaries (affects TeamCity builds only) -## [0.6.0] - 2022-11-28 +## 0.6.0 - 2022-11-28 - Add support for CUDA streams, see `wp.Stream`, `wp.get_stream()`, `wp.set_stream()`, `wp.synchronize_stream()`, `wp.ScopedStream` - Add support for CUDA events, see `wp.Event`, `wp.record_event()`, `wp.wait_event()`, `wp.wait_stream()`, `wp.Stream.record_event()`, `wp.Stream.wait_event()`, `wp.Stream.wait_stream()` @@ -738,7 +749,7 @@ - Fix various deployment issues by statically linking with all CUDA libs - Update warp.so/warp.dll to CUDA Toolkit 11.5 -## [0.5.1] - 2022-11-01 +## 0.5.1 - 2022-11-01 - Fix for unit tests in Kit @@ -775,14 +786,14 @@ - Fix for arrays > 2GB in length - Add support for per-vertex USD mesh colors with `wp.render` class -## [0.4.2] - 2022-09-07 +## 0.4.2 - 2022-09-07 - Register Warp samples to the sample browser in Kit - Add NDEBUG flag to release mode kernel builds - Fix for particle solver node when using a large number of particles - Fix for broken cameras in Warp sample scenes -## [0.4.1] - 2022-08-30 +## 0.4.1 - 2022-08-30 - Add geometry sampling methods, see `wp.sample_unit_cube()`, `wp.sample_unit_disk()`, etc - Add `wp.lower_bound()` for searching sorted arrays @@ -792,7 +803,7 @@ - 
Fix for debug flags not being set correctly on CUDA when `wp.config.mode == "debug"`, this enables bounds checking on CUDA kernels in debug mode - Fix for code gen of functions that do not return a value -## [0.4.0] - 2022-08-09 +## 0.4.0 - 2022-08-09 - Fix for FP16 conversions on GPUs without hardware support - Fix for `runtime = None` errors when reloading the Warp module @@ -809,7 +820,7 @@ - Removed `wp.runtime` reference from the top-level module, as it should be considered private -## [0.3.2] - 2022-07-19 +## 0.3.2 - 2022-07-19 - Remove Torch import from `__init__.py`, defer import to `wp.from_torch()`, `wp.to_torch()` @@ -831,7 +842,7 @@ - `wp.synchronize()` now synchronizes all devices; for finer-grained control, use `wp.synchronize_device()` - Device alias `"cuda"` now refers to the current CUDA context, rather than a specific device like `"cuda:0"` or `"cuda:1"` -## [0.3.0] - 2022-07-08 +## 0.3.0 - 2022-07-08 - Add support for FP16 storage type, see `wp.float16` - Add support for per-dimension byte strides, see `wp.array.strides` @@ -868,7 +879,7 @@ - Tape `capture` option has been removed, users can now capture tapes inside existing CUDA graphs (e.g.: inside Torch) - Scalar loss arrays should now explicitly set `requires_grad=True` at creation time -## [0.2.2] - 2022-05-30 +## 0.2.2 - 2022-05-30 - Fix for `from import *` inside Warp initialization - Fix for body space velocity when using deforming Mesh objects with scale @@ -892,7 +903,7 @@ - Local `@wp.func` functions should not be namespaced when called, e.g.: previously `wp.myfunc()` would work even if `myfunc()` was not a builtin - Removed `wp.rpy2quat()`, please use `wp.quat_rpy()` instead -## [0.2.1] - 2022-05-11 +## 0.2.1 - 2022-05-11 - Fix for unit tests in Kit @@ -941,7 +952,7 @@ - `wp.array.length` member has been removed, please use `wp.array.shape` to access array dimensions, or use `wp.array.size` to get total element count - Marking `dense_gemm()`, `dense_chol()`, etc methods as experimental until we revisit them -## [0.1.25] - 2022-03-20 +## 0.1.25 - 2022-03-20 - Add support for class methods to be Warp kernels - Add HashGrid reserve() so it can be used with CUDA graphs @@ -951,7 +962,7 @@ - Add support for floored division on integer types - Move tests into core library so they can be run in Kit environment -## [0.1.24] - 2022-03-03 +## 0.1.24 - 2022-03-03 ### Warp Core @@ -967,7 +978,7 @@ - Fix for ranged for loops with negative step sizes - Fix for 3d and 4d spherical gradient distributions -## [0.1.23] - 2022-02-17 +## 0.1.23 - 2022-02-17 ### Warp Core @@ -977,7 +988,7 @@ - Add procedural noise primitives, see `wp.noise()`, `wp.pnoise()`, `wp.curlnoise()` - Move simulation helpers our of test into `wp.sim` module -## [0.1.22] - 2022-02-14 +## 0.1.22 - 2022-02-14 ### Warp Core @@ -991,7 +1002,7 @@ - Add support for universal and compound joint types -## [0.1.21] - 2022-01-19 +## 0.1.21 - 2022-01-19 ### Warp Core @@ -1011,19 +1022,19 @@ - New OgnParticleVolume node for sampling shapes -> particles - New OgnParticleSolver node for DEM style granular materials -## [0.1.20] - 2021-11-02 +## 0.1.20 - 2021-11-02 - Updates to the ripple solver for GTC (support for multiple colliders, buoyancy, etc) -## [0.1.19] - 2021-10-15 +## 0.1.19 - 2021-10-15 - Publish from 2021.3 to avoid omni.graph database incompatibilities -## [0.1.18] - 2021-10-08 +## 0.1.18 - 2021-10-08 - Enable Linux support (tested on 20.04) -## [0.1.17] - 2021-09-30 +## 0.1.17 - 2021-09-30 - Fix for 3x3 SVD adjoint - Fix for A6000 GPU (bump compute model 
to sm_52 minimum) @@ -1032,12 +1043,12 @@ - Rename spatial_transform -> transform - Documentation update -## [0.1.16] - 2021-09-06 +## 0.1.16 - 2021-09-06 - Fix for case where simple assignments (a = b) incorrectly generated reference rather than value copy - Handle passing zero-length (empty) arrays to kernels -## [0.1.15] - 2021-09-03 +## 0.1.15 - 2021-09-03 - Add additional math library functions (asin, etc) - Add builtin 3x3 SVD support @@ -1050,62 +1061,62 @@ - Removes the need to transfer array to CPU before numpy conversion (will be done implicitly) - Update the example OgnRipple wave equation solver to use bundles -## [0.1.14] - 2021-08-09 +## 0.1.14 - 2021-08-09 - Fix for out-of-bounds memory access in CUDA BVH - Better error checking after kernel launches (use `wp.config.verify_cuda=True`) - Fix for vec3 normalize adjoint code -## [0.1.13] - 2021-07-29 +## 0.1.13 - 2021-07-29 - Remove OgnShrinkWrap.py test node -## [0.1.12] - 2021-07-29 +## 0.1.12 - 2021-07-29 - Switch to Woop et al.'s watertight ray-tri intersection test - Disable --fast-math in CUDA compilation step for improved precision -## [0.1.11] - 2021-07-28 +## 0.1.11 - 2021-07-28 - Fix for `wp.mesh_query_ray()` returning incorrect t-value -## [0.1.10] - 2021-07-28 +## 0.1.10 - 2021-07-28 - Fix for OV extension fwatcher filters to avoid hot-reload loop due to OGN regeneration -## [0.1.9] - 2021-07-21 +## 0.1.9 - 2021-07-21 - Fix for loading sibling DLL paths - Better type checking for built-in function arguments - Added runtime docs, can now list all builtins using `wp.print_builtins()` -## [0.1.8] - 2021-07-14 +## 0.1.8 - 2021-07-14 - Fix for hot-reload of CUDA kernels - Add Tape object for replaying differentiable kernels - Add helpers for Torch interop (convert `torch.Tensor` to `wp.Array`) -## [0.1.7] - 2021-07-05 +## 0.1.7 - 2021-07-05 - Switch to NVRTC for CUDA runtime - Allow running without host compiler - Disable asserts in kernel release mode (small perf. 
improvement) -## [0.1.6] - 2021-06-14 +## 0.1.6 - 2021-06-14 - Look for CUDA toolchain in target-deps -## [0.1.5] - 2021-06-14 +## 0.1.5 - 2021-06-14 - Rename OgLang -> Warp - Improve CUDA environment error checking - Clean-up some logging, add verbose mode (`wp.config.verbose`) -## [0.1.4] - 2021-06-10 +## 0.1.4 - 2021-06-10 - Add support for mesh raycast -## [0.1.3] - 2021-06-09 +## 0.1.3 - 2021-06-09 - Add support for unary negation operator - Add support for mutating variables during dynamic loops (non-differentiable) @@ -1113,7 +1124,7 @@ - Improve kernel cache start up times (avoids adjointing before cache check) - Update README.md with requirements / examples -## [0.1.2] - 2021-06-03 +## 0.1.2 - 2021-06-03 - Add support for querying mesh velocities - Add CUDA graph support, see `wp.capture_begin()`, `wp.capture_end()`, `wp.capture_launch()` @@ -1123,10 +1134,10 @@ - Fix for Linux/macOS support -## [0.1.1] - 2021-05-18 +## 0.1.1 - 2021-05-18 - Fix bug with conflicting CUDA contexts -## [0.1.0] - 2021-05-17 +## 0.1.0 - 2021-05-17 - Initial publish for alpha testing diff --git a/exts/omni.warp/config/extension.toml b/exts/omni.warp/config/extension.toml index cfebd3b6..46985a75 100644 --- a/exts/omni.warp/config/extension.toml +++ b/exts/omni.warp/config/extension.toml @@ -1,6 +1,6 @@ [package] # Semantic Versioning is used: https://semver.org/ -version = "1.4.0" +version = "1.4.1" authors = ["NVIDIA"] title = "Warp" description="Warp OmniGraph Nodes and Sample Scenes" @@ -35,7 +35,7 @@ exclude = ["Ogn*Database.py", "*/ogn*"] "omni.timeline" = {} "omni.ui" = {optional = true} "omni.usd" = {} -"omni.warp.core" = {version = "1.4.0", exact = true} +"omni.warp.core" = {version = "1.4.1", exact = true} [[python.module]] name = "omni.warp._extension" diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md index 82fb2e73..94d42d6e 100644 --- a/exts/omni.warp/docs/CHANGELOG.md +++ b/exts/omni.warp/docs/CHANGELOG.md @@ -1,5 +1,17 @@ # CHANGELOG +## [1.4.1] - 2024-10-15 + +### Fixed + +- Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)). +- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes. +- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`. +- Fix invalid code generation error messages when nesting dynamic and static for-loops. +- Fix caching of kernels with static expressions. +- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation. +- Re-introduced the `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` onto the Python scope to revert a breaking change. + ## [1.4.0] - 2024-10-01 ### Added @@ -72,15 +84,14 @@ - Bug fixes - Fix an aliasing issue with zero-copy array initialization from NumPy introduced in Warp 1.3.0. - - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values. + - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values ([GH-312](https://github.com/NVIDIA/warp/pull/312)). ## [1.3.2] - 2024-08-30 - Bug fixes - Fix accuracy of 3x3 SVD ``wp.svd3`` with fp64 numbers ([GH-281](https://github.com/NVIDIA/warp/issues/281)). - Fix module hashing when a kernel argument contained a struct array ([GH-287](https://github.com/NVIDIA/warp/issues/287)). 
- - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used - ([GH-288](https://github.com/NVIDIA/warp/issues/288)). + - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used ([GH-288](https://github.com/NVIDIA/warp/issues/288)). - Fix errors when launching a CUDA graph after a module is reloaded. Modules that were used during graph capture will no longer be unloaded before the graph is released. - Fix a bug in `wp.sim.collide.triangle_closest_point_barycentric()` where the returned barycentric coordinates may be @@ -205,7 +216,7 @@ - Fix for handling of `bool` types in generic kernels - Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details -## [1.1.1] - 2024-05-24 +## 1.1.1 - 2024-05-24 - `wp.init()` is no longer required to be called explicitly and will be performed on first call to the API - Speed up `omni.warp.core`'s startup time @@ -240,7 +251,7 @@ - Support gradient propagation for indexing sliced multi-dimensional arrays, i.e. `a[i][j]` vs. `a[i, j]` - Provide an informative message if setting DLL C-types failed, instructing to try rebuilding the library -## [1.0.3] - 2024-04-17 +## 1.0.3 - 2024-04-17 - Add a `support_level` entry to the configuration file of the extensions @@ -318,7 +329,7 @@ - Added `wp.ones()` to efficiently create one-initialized arrays - Rename `wp.config.graph_capture_module_load_default` to `wp.config.enable_graph_capture_module_load_by_default` -## [0.14.0] - 2024-02-19 +## 0.14.0 - 2024-02-19 - Add support for CUDA pooled (stream-ordered) allocators - Support memory allocation during graph capture @@ -355,7 +366,7 @@ - Fixed a small CPU memory leak related to DLPack interop - Improved performance of creating arrays -## [0.13.1] - 2024-02-22 +## 0.13.1 - 2024-02-22 - Ensure that the results from the `Noise Deform` are deterministic across different Kit sessions @@ -368,7 +379,7 @@ - Add missing `.py` extension to `warp/tests/walkthrough_debug` - Allow `wp.bool` usage in vector and matrix types -## [0.12.0] - 2024-02-05 +## 0.12.0 - 2024-02-05 - Add a warning when the `enable_backward` setting is set to `False` upon calling `wp.Tape.backward()` - Fix kernels not being recompiled as expected when defined using a closure @@ -384,7 +395,7 @@ - Point releases (if any) go on the same minor release branch and only contain bug fixes, not new features. - The `public` branch, previously used to merge releases into and corresponding with the GitHub `main` branch, is retired. 
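The 1.4.1 `Fixed` entry above calls out `iter_reverse()` misbehaving for ranges with a step other than 1; the `reversed()` builtin registered later in this patch maps onto it. A minimal sketch of the kernel-side usage that fix targets (illustrative only, not part of the diff; the kernel and array names are made up):

```python
import warp as wp

@wp.kernel
def sum_reversed(out: wp.array(dtype=int)):
    tid = wp.tid()
    total = int(0)
    # Iterates 8, 6, 4, 2, 0 -- a range with step 2, the case covered by the fix.
    for i in reversed(range(0, 10, 2)):
        total += i
    out[tid] = total
```
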
-## [1.0.0-beta.7] - 2024-01-23 +## 1.0.0-beta.7 - 2024-01-23 - Ensure captures are always enclosed in `try`/`finally` - Only include .py files from the warp subdirectory into wheel packages @@ -446,7 +457,7 @@ - Documentation update for `wp.BVH` - Documentation and simplified API for runtime kernel specialization `wp.Kernel` -## [1.0.0-beta.4] - 2023-11-01 +## 1.0.0-beta.4 - 2023-11-01 - Add `wp.cbrt()` for cube root calculation - Add `wp.mesh_furthest_point_no_sign()` to compute furthest point on a surface from a query point @@ -458,7 +469,7 @@ - Fix for `wp.utils.array_sum()` output initialization when used with vector types - Coverage and documentation updates -## [1.0.0-beta.3] - 2023-10-19 +## 1.0.0-beta.3 - 2023-10-19 - Add support for code coverage scans (test_coverage.py), coverage at 85% in `omni.warp.core` - Add support for named component access for vector types, e.g.: `a = v.x` @@ -480,13 +491,13 @@ - To support grid-stride kernels, `wp.tid()` can no longer be called inside `wp.func` functions. -## [1.0.0-beta.2] - 2023-09-01 +## 1.0.0-beta.2 - 2023-09-01 - Fix for passing bool into `wp.func` functions - Fix for deprecation warnings appearing on `stderr`, now redirected to `stdout` - Fix for using `for i in wp.hash_grid_query(..)` syntax -## [1.0.0-beta.1] - 2023-08-29 +## 1.0.0-beta.1 - 2023-08-29 - Fix for `wp.float16` being passed as kernel arguments - Fix for compile errors with kernels using structs in backward pass @@ -525,7 +536,7 @@ - Update margin used by for mesh queries when using `wp.sim.create_soft_body_contacts()` - Improvements to gradient handling with `wp.from_torch()`, `wp.to_torch()` plus documentation -## [0.10.0] - 2023-07-05 +## 0.10.0 - 2023-07-05 - Add support for macOS universal binaries (x86 + aarch64) for M1+ support - Add additional methods for SDF generation please see the following new methods: @@ -601,7 +612,7 @@ - Deprecate `wp.Model.soft_contact_distance` which is now replaced by `wp.Model.particle_radius` - Deprecate single scalar particle radius (should be a per-particle array) -## [0.8.2] - 2023-04-21 +## 0.8.2 - 2023-04-21 - Add `ModelBuilder.soft_contact_max` to control the maximum number of soft contacts that can be registered. Use `Model.allocate_soft_contacts(new_count)` to change count on existing `Model` objects. 
- Add support for `bool` parameters @@ -612,12 +623,12 @@ - Add sign determination using winding number of `wp.mesh_query_point()` as `wp.mesh_query_sign_winding_number()` - Add query point without sign determination `wp.mesh_query_no_sign()` -## [0.8.1] - 2023-04-13 +## 0.8.1 - 2023-04-13 - Fix for regression when passing flattened numeric lists as matrix arguments to kernels - Fix for regressions when passing `wp.struct` types with uninitialized (`None`) member attributes -## [0.8.0] - 2023-04-05 +## 0.8.0 - 2023-04-05 - Add `Texture Write` node for updating dynamic RTX textures from Warp kernels / nodes - Add multi-dimensional kernel support to Warp Kernel Node @@ -661,14 +672,14 @@ - `wp.sim.model.ground_plane` is now a `wp.array` to support gradient, users should call `builder.set_ground_plane()` to create the ground - `wp.sim` capsule, cones, and cylinders are now aligned with the default USD up-axis -## [0.7.2] - 2023-02-15 +## 0.7.2 - 2023-02-15 - Reduce test time for vec/math types - Clean-up CUDA disabled build pipeline - Remove extension.gen.toml to make Kit packages Python version independent - Handle additional cases for array indexing inside Python -## [0.7.1] - 2023-02-14 +## 0.7.1 - 2023-02-14 - Disabling some slow tests for Kit - Make unit tests run on first GPU only by default @@ -685,13 +696,13 @@ - Add security pop-up for Kernel Node - Improve error handling for kernel return values -## [0.6.3] - 2023-01-31 +## 0.6.3 - 2023-01-31 - Add DLPack utilities, see `wp.from_dlpack()`, `wp.to_dlpack()` - Add Jax utilities, see `wp.from_jax()`, `wp.to_jax()`, `wp.device_from_jax()`, `wp.device_to_jax()` - Fix for Linux Kit extensions OM-80132, OM-80133 -## [0.6.2] - 2023-01-19 +## 0.6.2 - 2023-01-19 - Updated `wp.from_torch()` to support more data types - Updated `wp.from_torch()` to automatically determine the target Warp data type if not specified @@ -706,14 +717,14 @@ - Replace Python `imp` package with `importlib` - Fix for quaternion slerp gradients (`wp.quat_slerp()`) -## [0.6.1] - 2022-12-05 +## 0.6.1 - 2022-12-05 - Fix for non-CUDA builds - Fix strides computation in array_t constructor, fixes a bug with accessing mesh indices through mesh.indices[] - Disable backward pass code generation for kernel node (4-6x faster compilation) - Switch to linbuild for universal Linux binaries (affects TeamCity builds only) -## [0.6.0] - 2022-11-28 +## 0.6.0 - 2022-11-28 - Add support for CUDA streams, see `wp.Stream`, `wp.get_stream()`, `wp.set_stream()`, `wp.synchronize_stream()`, `wp.ScopedStream` - Add support for CUDA events, see `wp.Event`, `wp.record_event()`, `wp.wait_event()`, `wp.wait_stream()`, `wp.Stream.record_event()`, `wp.Stream.wait_event()`, `wp.Stream.wait_stream()` @@ -738,7 +749,7 @@ - Fix various deployment issues by statically linking with all CUDA libs - Update warp.so/warp.dll to CUDA Toolkit 11.5 -## [0.5.1] - 2022-11-01 +## 0.5.1 - 2022-11-01 - Fix for unit tests in Kit @@ -775,14 +786,14 @@ - Fix for arrays > 2GB in length - Add support for per-vertex USD mesh colors with `wp.render` class -## [0.4.2] - 2022-09-07 +## 0.4.2 - 2022-09-07 - Register Warp samples to the sample browser in Kit - Add NDEBUG flag to release mode kernel builds - Fix for particle solver node when using a large number of particles - Fix for broken cameras in Warp sample scenes -## [0.4.1] - 2022-08-30 +## 0.4.1 - 2022-08-30 - Add geometry sampling methods, see `wp.sample_unit_cube()`, `wp.sample_unit_disk()`, etc - Add `wp.lower_bound()` for searching sorted arrays @@ -792,7 +803,7 @@ - 
Fix for debug flags not being set correctly on CUDA when `wp.config.mode == "debug"`, this enables bounds checking on CUDA kernels in debug mode - Fix for code gen of functions that do not return a value -## [0.4.0] - 2022-08-09 +## 0.4.0 - 2022-08-09 - Fix for FP16 conversions on GPUs without hardware support - Fix for `runtime = None` errors when reloading the Warp module @@ -809,7 +820,7 @@ - Removed `wp.runtime` reference from the top-level module, as it should be considered private -## [0.3.2] - 2022-07-19 +## 0.3.2 - 2022-07-19 - Remove Torch import from `__init__.py`, defer import to `wp.from_torch()`, `wp.to_torch()` @@ -831,7 +842,7 @@ - `wp.synchronize()` now synchronizes all devices; for finer-grained control, use `wp.synchronize_device()` - Device alias `"cuda"` now refers to the current CUDA context, rather than a specific device like `"cuda:0"` or `"cuda:1"` -## [0.3.0] - 2022-07-08 +## 0.3.0 - 2022-07-08 - Add support for FP16 storage type, see `wp.float16` - Add support for per-dimension byte strides, see `wp.array.strides` @@ -868,7 +879,7 @@ - Tape `capture` option has been removed, users can now capture tapes inside existing CUDA graphs (e.g.: inside Torch) - Scalar loss arrays should now explicitly set `requires_grad=True` at creation time -## [0.2.2] - 2022-05-30 +## 0.2.2 - 2022-05-30 - Fix for `from import *` inside Warp initialization - Fix for body space velocity when using deforming Mesh objects with scale @@ -892,7 +903,7 @@ - Local `@wp.func` functions should not be namespaced when called, e.g.: previously `wp.myfunc()` would work even if `myfunc()` was not a builtin - Removed `wp.rpy2quat()`, please use `wp.quat_rpy()` instead -## [0.2.1] - 2022-05-11 +## 0.2.1 - 2022-05-11 - Fix for unit tests in Kit @@ -941,7 +952,7 @@ - `wp.array.length` member has been removed, please use `wp.array.shape` to access array dimensions, or use `wp.array.size` to get total element count - Marking `dense_gemm()`, `dense_chol()`, etc methods as experimental until we revisit them -## [0.1.25] - 2022-03-20 +## 0.1.25 - 2022-03-20 - Add support for class methods to be Warp kernels - Add HashGrid reserve() so it can be used with CUDA graphs @@ -951,7 +962,7 @@ - Add support for floored division on integer types - Move tests into core library so they can be run in Kit environment -## [0.1.24] - 2022-03-03 +## 0.1.24 - 2022-03-03 ### Warp Core @@ -967,7 +978,7 @@ - Fix for ranged for loops with negative step sizes - Fix for 3d and 4d spherical gradient distributions -## [0.1.23] - 2022-02-17 +## 0.1.23 - 2022-02-17 ### Warp Core @@ -977,7 +988,7 @@ - Add procedural noise primitives, see `wp.noise()`, `wp.pnoise()`, `wp.curlnoise()` - Move simulation helpers our of test into `wp.sim` module -## [0.1.22] - 2022-02-14 +## 0.1.22 - 2022-02-14 ### Warp Core @@ -991,7 +1002,7 @@ - Add support for universal and compound joint types -## [0.1.21] - 2022-01-19 +## 0.1.21 - 2022-01-19 ### Warp Core @@ -1011,19 +1022,19 @@ - New OgnParticleVolume node for sampling shapes -> particles - New OgnParticleSolver node for DEM style granular materials -## [0.1.20] - 2021-11-02 +## 0.1.20 - 2021-11-02 - Updates to the ripple solver for GTC (support for multiple colliders, buoyancy, etc) -## [0.1.19] - 2021-10-15 +## 0.1.19 - 2021-10-15 - Publish from 2021.3 to avoid omni.graph database incompatibilities -## [0.1.18] - 2021-10-08 +## 0.1.18 - 2021-10-08 - Enable Linux support (tested on 20.04) -## [0.1.17] - 2021-09-30 +## 0.1.17 - 2021-09-30 - Fix for 3x3 SVD adjoint - Fix for A6000 GPU (bump compute model 
to sm_52 minimum) @@ -1032,12 +1043,12 @@ - Rename spatial_transform -> transform - Documentation update -## [0.1.16] - 2021-09-06 +## 0.1.16 - 2021-09-06 - Fix for case where simple assignments (a = b) incorrectly generated reference rather than value copy - Handle passing zero-length (empty) arrays to kernels -## [0.1.15] - 2021-09-03 +## 0.1.15 - 2021-09-03 - Add additional math library functions (asin, etc) - Add builtin 3x3 SVD support @@ -1050,62 +1061,62 @@ - Removes the need to transfer array to CPU before numpy conversion (will be done implicitly) - Update the example OgnRipple wave equation solver to use bundles -## [0.1.14] - 2021-08-09 +## 0.1.14 - 2021-08-09 - Fix for out-of-bounds memory access in CUDA BVH - Better error checking after kernel launches (use `wp.config.verify_cuda=True`) - Fix for vec3 normalize adjoint code -## [0.1.13] - 2021-07-29 +## 0.1.13 - 2021-07-29 - Remove OgnShrinkWrap.py test node -## [0.1.12] - 2021-07-29 +## 0.1.12 - 2021-07-29 - Switch to Woop et al.'s watertight ray-tri intersection test - Disable --fast-math in CUDA compilation step for improved precision -## [0.1.11] - 2021-07-28 +## 0.1.11 - 2021-07-28 - Fix for `wp.mesh_query_ray()` returning incorrect t-value -## [0.1.10] - 2021-07-28 +## 0.1.10 - 2021-07-28 - Fix for OV extension fwatcher filters to avoid hot-reload loop due to OGN regeneration -## [0.1.9] - 2021-07-21 +## 0.1.9 - 2021-07-21 - Fix for loading sibling DLL paths - Better type checking for built-in function arguments - Added runtime docs, can now list all builtins using `wp.print_builtins()` -## [0.1.8] - 2021-07-14 +## 0.1.8 - 2021-07-14 - Fix for hot-reload of CUDA kernels - Add Tape object for replaying differentiable kernels - Add helpers for Torch interop (convert `torch.Tensor` to `wp.Array`) -## [0.1.7] - 2021-07-05 +## 0.1.7 - 2021-07-05 - Switch to NVRTC for CUDA runtime - Allow running without host compiler - Disable asserts in kernel release mode (small perf. 
improvement) -## [0.1.6] - 2021-06-14 +## 0.1.6 - 2021-06-14 - Look for CUDA toolchain in target-deps -## [0.1.5] - 2021-06-14 +## 0.1.5 - 2021-06-14 - Rename OgLang -> Warp - Improve CUDA environment error checking - Clean-up some logging, add verbose mode (`wp.config.verbose`) -## [0.1.4] - 2021-06-10 +## 0.1.4 - 2021-06-10 - Add support for mesh raycast -## [0.1.3] - 2021-06-09 +## 0.1.3 - 2021-06-09 - Add support for unary negation operator - Add support for mutating variables during dynamic loops (non-differentiable) @@ -1113,7 +1124,7 @@ - Improve kernel cache start up times (avoids adjointing before cache check) - Update README.md with requirements / examples -## [0.1.2] - 2021-06-03 +## 0.1.2 - 2021-06-03 - Add support for querying mesh velocities - Add CUDA graph support, see `wp.capture_begin()`, `wp.capture_end()`, `wp.capture_launch()` @@ -1123,10 +1134,10 @@ - Fix for Linux/macOS support -## [0.1.1] - 2021-05-18 +## 0.1.1 - 2021-05-18 - Fix bug with conflicting CUDA contexts -## [0.1.0] - 2021-05-17 +## 0.1.0 - 2021-05-17 - Initial publish for alpha testing diff --git a/warp/__init__.py b/warp/__init__.py index b051f837..243d7ae1 100644 --- a/warp/__init__.py +++ b/warp/__init__.py @@ -26,6 +26,9 @@ from warp.types import spatial_vector, spatial_vectorh, spatial_vectorf, spatial_vectord from warp.types import spatial_matrix, spatial_matrixh, spatial_matrixf, spatial_matrixd +# annotation types +from warp.types import Int, Float, Scalar + # geometry types from warp.types import Bvh, Mesh, HashGrid, Volume, MarchingCubes from warp.types import BvhQuery, HashGridQuery, MeshQueryAABB, MeshQueryPoint, MeshQueryRay diff --git a/warp/builtins.py b/warp/builtins.py index 0d2d51b2..21b62429 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1499,7 +1499,8 @@ def transform_identity_dispatch_func(input_types: Mapping[str, type], return_typ doc="""Apply the transform to a point ``point`` treating the homogeneous coordinate as w=1. The transformation is applied treating ``point`` as a column vector, e.g.: ``y = mat*point``. - Note this is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = point^T*mat^T``. + + This is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = point^T*mat^T``. If the transform is coming from a library that uses row-vectors, then users should transpose the transformation matrix before calling this method.""", ) @@ -1517,8 +1518,9 @@ def transform_identity_dispatch_func(input_types: Mapping[str, type], return_typ group="Vector Math", doc="""Apply the transform to a vector ``vec`` treating the homogeneous coordinate as w=0. - The transformation is applied treating ``vec`` as a column vector, e.g.: ``y = mat*vec`` - note this is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = vec^T*mat^T``. + The transformation is applied treating ``vec`` as a column vector, e.g.: ``y = mat*vec``. + + This is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = vec^T*mat^T``. 
If the transform is coming from a library that uses row-vectors, then users should transpose the transformation matrix before calling this method.""", ) @@ -3551,6 +3553,16 @@ def compute(): "iter_next", input_types={"query": mesh_query_aabb_t}, value_type=int, group="Utility", export=False, hidden=True ) +add_builtin( + "reversed", + input_types={"range": range_t}, + value_type=range_t, + native_func="iter_reverse", + group="Utility", + doc="""Returns the range in reversed order.""", + export=False, +) + # --------------------------------- # Volumes @@ -3922,7 +3934,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "rand_init", input_types={"seed": int}, value_type=uint32, - export=False, group="Random", doc="Initialize a new random number generator given a user-defined seed. Returns a 32-bit integer representing the RNG state.", ) @@ -3931,7 +3942,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "rand_init", input_types={"seed": int, "offset": int}, value_type=uint32, - export=False, group="Random", doc="""Initialize a new random number generator given a user-defined seed and an offset. @@ -3943,7 +3953,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "randi", input_types={"state": uint32}, value_type=int, - export=False, group="Random", doc="Return a random integer in the range [0, 2^32).", ) @@ -3951,7 +3960,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "randi", input_types={"state": uint32, "low": int, "high": int}, value_type=int, - export=False, group="Random", doc="Return a random integer between [low, high).", ) @@ -3959,7 +3967,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "randf", input_types={"state": uint32}, value_type=float, - export=False, group="Random", doc="Return a random float between [0.0, 1.0).", ) @@ -3967,24 +3974,17 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "randf", input_types={"state": uint32, "low": float, "high": float}, value_type=float, - export=False, group="Random", doc="Return a random float between [low, high).", ) add_builtin( - "randn", - input_types={"state": uint32}, - value_type=float, - export=False, - group="Random", - doc="Sample a normal distribution.", + "randn", input_types={"state": uint32}, value_type=float, group="Random", doc="Sample a normal distribution." ) add_builtin( "sample_cdf", input_types={"state": uint32, "cdf": array(dtype=float)}, value_type=int, - export=False, group="Random", doc="Inverse-transform sample a cumulative distribution function.", ) @@ -3992,7 +3992,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_triangle", input_types={"state": uint32}, value_type=vec2, - export=False, group="Random", doc="Uniformly sample a triangle. 
Returns sample barycentric coordinates.", ) @@ -4000,7 +3999,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_ring", input_types={"state": uint32}, value_type=vec2, - export=False, group="Random", doc="Uniformly sample a ring in the xy plane.", ) @@ -4008,7 +4006,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_disk", input_types={"state": uint32}, value_type=vec2, - export=False, group="Random", doc="Uniformly sample a disk in the xy plane.", ) @@ -4016,7 +4013,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_sphere_surface", input_types={"state": uint32}, value_type=vec3, - export=False, group="Random", doc="Uniformly sample a unit sphere surface.", ) @@ -4024,7 +4020,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_sphere", input_types={"state": uint32}, value_type=vec3, - export=False, group="Random", doc="Uniformly sample a unit sphere.", ) @@ -4032,7 +4027,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_hemisphere_surface", input_types={"state": uint32}, value_type=vec3, - export=False, group="Random", doc="Uniformly sample a unit hemisphere surface.", ) @@ -4040,7 +4034,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_hemisphere", input_types={"state": uint32}, value_type=vec3, - export=False, group="Random", doc="Uniformly sample a unit hemisphere.", ) @@ -4048,7 +4041,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_square", input_types={"state": uint32}, value_type=vec2, - export=False, group="Random", doc="Uniformly sample a unit square.", ) @@ -4056,7 +4048,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "sample_unit_cube", input_types={"state": uint32}, value_type=vec3, - export=False, group="Random", doc="Uniformly sample a unit cube.", ) @@ -4065,7 +4056,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value "poisson", input_types={"state": uint32, "lam": float}, value_type=uint32, - export=False, group="Random", doc="""Generate a random sample from a Poisson distribution. 
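The `export=False` removals in the hunks above re-expose the RNG and sampling builtins to the Python scope, matching the 1.4.1 changelog entry earlier in this patch; inside kernels they behave as before. A short illustrative sketch (not part of the diff; the kernel and argument names are made up):

```python
import warp as wp

@wp.kernel
def scatter_points(seed: int, out: wp.array(dtype=wp.vec3)):
    tid = wp.tid()
    # Per-thread RNG state from a seed plus a per-thread offset.
    state = wp.rand_init(seed, tid)
    # Uniformly sample the unit sphere surface for this thread.
    out[tid] = wp.sample_unit_sphere_surface(state)
```

After this change the same calls (for example `wp.randf(state)`) should also resolve from ordinary Python code, which is what the re-export restores.
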
@@ -4353,7 +4343,8 @@ def address_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, A for array_type in array_types: add_builtin( "address", - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int}, + constraint=sametypes, defaults={"j": None, "k": None, "l": None}, hidden=True, value_func=address_value_func, @@ -4397,8 +4388,9 @@ def view_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any] for array_type in array_types: add_builtin( "view", - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int}, defaults={"j": None, "k": None}, + constraint=sametypes, hidden=True, value_func=view_value_func, group="Utility", @@ -4440,7 +4432,8 @@ def array_store_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st for array_type in array_types: add_builtin( "array_store", - input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any}, + constraint=sametypes, hidden=True, value_func=array_store_value_func, skip_replay=True, @@ -4448,7 +4441,8 @@ def array_store_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st ) add_builtin( "array_store", - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any}, + constraint=sametypes, hidden=True, value_func=array_store_value_func, skip_replay=True, @@ -4456,7 +4450,8 @@ def array_store_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st ) add_builtin( "array_store", - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any}, + constraint=sametypes, hidden=True, value_func=array_store_value_func, skip_replay=True, @@ -4464,7 +4459,8 @@ def array_store_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st ) add_builtin( "array_store", - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any}, + constraint=sametypes, hidden=True, value_func=array_store_value_func, skip_replay=True, @@ -4516,6 +4512,11 @@ def load_dispatch_func(input_types: Mapping[str, type], return_type: Any, args: ) +def atomic_op_constraint(arg_types: Mapping[str, Any]): + idx_types = tuple(arg_types[x] for x in "ijkl" if arg_types.get(x, None) is not None) + return all(types_equal(idx_types[0], t) for t in idx_types[1:]) and arg_types["arr"].ndim == len(idx_types) + + def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, Any]): if arg_types is None: return Any @@ -4560,7 +4561,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_add", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically add ``value`` onto ``arr[i]`` and return the old value.", group="Utility", @@ -4569,7 +4571,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_add", hidden=hidden, - 
input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically add ``value`` onto ``arr[i,j]`` and return the old value.", group="Utility", @@ -4578,7 +4581,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_add", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value.", group="Utility", @@ -4587,7 +4591,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_add", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value.", group="Utility", @@ -4597,7 +4602,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_sub", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically subtract ``value`` onto ``arr[i]`` and return the old value.", group="Utility", @@ -4606,7 +4612,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_sub", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value.", group="Utility", @@ -4615,7 +4622,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_sub", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value.", group="Utility", @@ -4624,7 +4632,8 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_sub", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value.", group="Utility", @@ -4634,44 +4643,48 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_min", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any}, + 
constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) add_builtin( "atomic_min", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) add_builtin( "atomic_min", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) add_builtin( "atomic_min", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -4679,44 +4692,48 @@ def atomic_op_value_func(arg_types: Mapping[str, type], arg_values: Mapping[str, add_builtin( "atomic_max", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) add_builtin( "atomic_max", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) add_builtin( "atomic_max", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) add_builtin( "atomic_max", hidden=hidden, - input_types={"arr": array_type(dtype=Any), "i": int, "j": int, "k": int, "l": int, "value": Any}, + input_types={"arr": array_type(dtype=Any), "i": Int, "j": Int, "k": Int, "l": Int, "value": Any}, + constraint=atomic_op_constraint, value_func=atomic_op_value_func, doc="""Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices.""", + The operation is only atomic on a per-component basis for vectors and matrices.""", group="Utility", skip_replay=True, ) @@ -5958,13 +5975,12 @@ def tile_fft_generic_lto_dispatch_func( value_type=Any, doc="""Evaluates a static Python expression and replaces it with its result. - See the `codegen.html#static-expressions
`_ for more details. + See the :ref:`code generation guide ` for more details. - Note: - The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, - which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. - The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions - (excluding Warp arrays since they cannot be created in a Warp kernel at the moment).""", + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions + (excluding Warp arrays since they cannot be created in a Warp kernel at the moment).""", group="Code Generation", ) diff --git a/warp/codegen.py b/warp/codegen.py index 53519521..000ea4d5 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -951,7 +951,9 @@ def build(adj, builder, default_builder_options=None): adj.return_var = None # return type for function or kernel adj.loop_symbols = [] # symbols at the start of each loop - adj.loop_const_iter_symbols = set() # iteration variables (constant) for static loops + adj.loop_const_iter_symbols = ( + set() + ) # constant iteration variables for static loops (mutating them does not raise an error) # blocks adj.blocks = [Block()] @@ -1007,7 +1009,6 @@ def format_args(adj, prefix, args): arg_strs.append(f"{a.namespace}{a.native_func}") else: arg_strs.append(f"{a.namespace}{prefix}_{a.native_func}") - elif is_reference(a.type): arg_strs.append(f"{prefix}_{a}") elif isinstance(a, Var): @@ -1339,6 +1340,10 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): if return_type is None: # handles expression (zero output) functions, e.g.: void do_something(); + + output = None + output_list = [] + forward_call = ( f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" ) @@ -1348,14 +1353,23 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): elif not isinstance(return_type, Sequence) or len(return_type) == 1: # handle simple function (one output) - forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" + if isinstance(return_type, Sequence): + return_type = return_type[0] + output = adj.add_var(return_type) + output_list = [output] + + forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" replay_call = forward_call if func.custom_replay_func is not None: replay_call = f"var_{output} = {func.namespace}replay_{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" else: # handle multiple value functions + + output = [adj.add_var(v) for v in return_type] + output_list = output + forward_call = ( f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args + output, use_initializer_list)});" ) @@ -1366,7 +1380,7 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): else: adj.add_forward(forward_call, replay=replay_call) - if not func.missing_grad and len(args): 
+ if not func.missing_grad and len(func_args): adj_args = tuple(strip_reference(x) for x in func_args) reverse_has_output_args = ( func.require_original_output_arg or len(output_list) > 1 @@ -1871,7 +1885,7 @@ def materialize_redefinitions(adj, symbols): # detect symbols with conflicting definitions (assigned inside the for loop) for items in symbols.items(): sym = items[0] - if adj.loop_const_iter_symbols is not None and sym in adj.loop_const_iter_symbols: + if adj.is_constant_iter_symbol(sym): # ignore constant overwriting in for-loops if it is a loop iterator # (it is no problem to unroll static loops multiple times in sequence) continue @@ -2022,12 +2036,11 @@ def get_unroll_range(adj, loop): ) return range_call - def begin_record_constant_iter_symbols(adj): - if adj.loop_const_iter_symbols is None: - adj.loop_const_iter_symbols = set() + def record_constant_iter_symbol(adj, sym): + adj.loop_const_iter_symbols.add(sym) - def end_record_constant_iter_symbols(adj): - adj.loop_const_iter_symbols = None + def is_constant_iter_symbol(adj, sym): + return sym in adj.loop_const_iter_symbols def emit_For(adj, node): # try and unroll simple range() statements that use constant args @@ -2035,9 +2048,8 @@ def emit_For(adj, node): if isinstance(unroll_range, range): const_iter_sym = node.target.id - if adj.loop_const_iter_symbols is not None: - # prevent constant conflicts in `materialize_redefinitions()` - adj.loop_const_iter_symbols.add(const_iter_sym) + # prevent constant conflicts in `materialize_redefinitions()` + adj.record_constant_iter_symbol(const_iter_sym) # unroll static for-loop for i in unroll_range: @@ -2058,7 +2070,6 @@ def emit_For(adj, node): iter = adj.eval(node.iter) adj.symbols[node.target.id] = adj.begin_for(iter) - adj.begin_record_constant_iter_symbols() # for loops should be side-effect free, here we store a copy adj.loop_symbols.append(adj.symbols.copy()) @@ -2069,7 +2080,6 @@ def emit_For(adj, node): adj.materialize_redefinitions(adj.loop_symbols[-1]) adj.loop_symbols.pop() - adj.end_record_constant_iter_symbols() adj.end_for(iter) @@ -2288,8 +2298,8 @@ def emit_Subscript(adj, node): return var target, indices = adj.eval_subscript(node) - target_type = strip_reference(target.type) + target_type = strip_reference(target.type) if is_array(target_type): if len(indices) == target_type.ndim: # handles array loads (where each dimension has an index specified) @@ -3108,7 +3118,6 @@ def get_references(adj) -> Tuple[Dict[str, Any], Dict[Any, Any], Dict[warp.conte """ - cpu_kernel_template = """ void {name}_cpu_kernel_forward( diff --git a/warp/config.py b/warp/config.py index 49df51ea..a703df21 100644 --- a/warp/config.py +++ b/warp/config.py @@ -7,7 +7,7 @@ from typing import Optional -version: str = "1.4.0" +version: str = "1.4.1" """Warp version string""" verify_fp: bool = False diff --git a/warp/context.py b/warp/context.py index 65ddeebe..d3f6adbc 100644 --- a/warp/context.py +++ b/warp/context.py @@ -6,7 +6,6 @@ # license agreement from NVIDIA CORPORATION is strictly prohibited. 
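The `emit_For` and `loop_const_iter_symbols` changes above relate to the 1.4.1 fix for nesting dynamic and static for-loops: the set of constant iteration symbols is now kept for the whole function and queried per symbol, instead of being cleared around dynamic loops. A rough sketch of the kind of kernel this affects (illustrative only, not part of the diff; names are made up), where the outer loop unrolls statically and the inner bound is a runtime value:

```python
import warp as wp

N = wp.constant(4)  # constant bound, so the outer loop is unrolled at code-gen time

@wp.kernel
def nested_loops(counts: wp.array(dtype=int), out: wp.array(dtype=float)):
    tid = wp.tid()
    acc = float(0.0)
    for i in range(N):                 # static loop, unrolled during code generation
        for j in range(counts[tid]):   # dynamic loop, bound known only at runtime
            acc += float(i) + float(j)
    out[tid] = acc
```
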
import ast -import builtins import ctypes import functools import hashlib @@ -22,7 +21,6 @@ import weakref from copy import copy as shallowcopy from pathlib import Path -from struct import pack as struct_pack from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import numpy as np @@ -1257,6 +1255,7 @@ def initializer_list_func(args, return_type): key, input_types=arg_types, value_type=return_type, + value_func=value_func if return_type is Any else None, export_func=export_func, dispatch_func=dispatch_func, lto_dispatch_func=lto_dispatch_func, @@ -1495,30 +1494,16 @@ def hash_adjoint(self, adj): # hash referenced constants for name, value in constants.items(): ch.update(bytes(name, "utf-8")) - # hash the referenced object - if isinstance(value, builtins.bool): - # This needs to come before the check for `int` since all boolean - # values are also instances of `int`. - ch.update(struct_pack("?", value)) - elif isinstance(value, int): - ch.update(struct_pack(" -inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, const array_t& adj_buf, int& adj_i, const T& adj_output) +inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, const array_t& adj_buf, int adj_i, const T& adj_output) { if (adj_buf.data) adj_atomic_add(&index(adj_buf, i), adj_output); @@ -826,7 +826,7 @@ inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, const array_ adj_atomic_add(&index_grad(buf, i), adj_output); } template -inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, const array_t& adj_buf, int& adj_i, int& adj_j, const T& adj_output) +inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, const array_t& adj_buf, int adj_i, int adj_j, const T& adj_output) { if (adj_buf.data) adj_atomic_add(&index(adj_buf, i, j), adj_output); @@ -834,7 +834,7 @@ inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, const adj_atomic_add(&index_grad(buf, i, j), adj_output); } template -inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, int k, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output) +inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, int k, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output) { if (adj_buf.data) adj_atomic_add(&index(adj_buf, i, j, k), adj_output); @@ -842,7 +842,7 @@ inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, int k adj_atomic_add(&index_grad(buf, i, j, k), adj_output); } template -inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, int k, int l, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output) +inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, int k, int l, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output) { if (adj_buf.data) adj_atomic_add(&index(adj_buf, i, j, k, l), adj_output); @@ -851,7 +851,7 @@ inline CUDA_CALLABLE void adj_address(const array_t& buf, int i, int j, int k } template -inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, T value, const array_t& adj_buf, int& adj_i, T& adj_value) +inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, T value, const array_t& adj_buf, int adj_i, T& adj_value) { if (adj_buf.data) adj_value += index(adj_buf, i); @@ -861,7 +861,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, T value, FP_VERIFY_ADJ_1(value, adj_value) } template -inline 
CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, T value, const array_t& adj_buf, int& adj_i, int& adj_j, T& adj_value) +inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, T value, const array_t& adj_buf, int adj_i, int adj_j, T& adj_value) { if (adj_buf.data) adj_value += index(adj_buf, i, j); @@ -871,7 +871,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, T FP_VERIFY_ADJ_2(value, adj_value) } template -inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, int k, T value, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value) +inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, int k, T value, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value) { if (adj_buf.data) adj_value += index(adj_buf, i, j, k); @@ -881,7 +881,7 @@ inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, i FP_VERIFY_ADJ_3(value, adj_value) } template -inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, int k, int l, T value, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value) +inline CUDA_CALLABLE void adj_array_store(const array_t& buf, int i, int j, int k, int l, T value, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value) { if (adj_buf.data) adj_value += index(adj_buf, i, j, k, l); @@ -905,7 +905,7 @@ inline CUDA_CALLABLE void adj_load(const T* address, const T& adj_address, T& ad } template -inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, T value, const array_t& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, T value, const array_t& adj_buf, int adj_i, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value += index(adj_buf, i); @@ -915,7 +915,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, T value, FP_VERIFY_ADJ_1(value, adj_value) } template -inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, T value, const array_t& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, T value, const array_t& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value += index(adj_buf, i, j); @@ -925,7 +925,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, T FP_VERIFY_ADJ_2(value, adj_value) } template -inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, int k, T value, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, int k, T value, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value += index(adj_buf, i, j, k); @@ -935,7 +935,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, in FP_VERIFY_ADJ_3(value, adj_value) } template -inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, int k, int l, T value, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, int k, int l, T value, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) { if (adj_buf.data) 
adj_value += index(adj_buf, i, j, k, l); @@ -946,7 +946,7 @@ inline CUDA_CALLABLE void adj_atomic_add(const array_t& buf, int i, int j, in } template -inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, T value, const array_t& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, T value, const array_t& adj_buf, int adj_i, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value -= index(adj_buf, i); @@ -956,7 +956,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, T value, FP_VERIFY_ADJ_1(value, adj_value) } template -inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, T value, const array_t& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, T value, const array_t& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value -= index(adj_buf, i, j); @@ -966,7 +966,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, T FP_VERIFY_ADJ_2(value, adj_value) } template -inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, int k, T value, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, int k, T value, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value -= index(adj_buf, i, j, k); @@ -976,7 +976,7 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, in FP_VERIFY_ADJ_3(value, adj_value) } template -inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, int k, int l, T value, const array_t& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) +inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, int k, int l, T value, const array_t& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_value -= index(adj_buf, i, j, k, l); @@ -988,44 +988,44 @@ inline CUDA_CALLABLE void adj_atomic_sub(const array_t& buf, int i, int j, in // generic array types that do not support gradient computation (indexedarray, etc.) 
template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_address(const A1& buf, int i, const A2& adj_buf, int& adj_i, const T& adj_output) {} +inline CUDA_CALLABLE void adj_address(const A1& buf, int i, const A2& adj_buf, int adj_i, const T& adj_output) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_address(const A1& buf, int i, int j, const A2& adj_buf, int& adj_i, int& adj_j, const T& adj_output) {} +inline CUDA_CALLABLE void adj_address(const A1& buf, int i, int j, const A2& adj_buf, int adj_i, int adj_j, const T& adj_output) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_address(const A1& buf, int i, int j, int k, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, const T& adj_output) {} +inline CUDA_CALLABLE void adj_address(const A1& buf, int i, int j, int k, const A2& adj_buf, int adj_i, int adj_j, int adj_k, const T& adj_output) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_address(const A1& buf, int i, int j, int k, int l, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, const T& adj_output) {} +inline CUDA_CALLABLE void adj_address(const A1& buf, int i, int j, int k, int l, const A2& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, const T& adj_output) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, T value, const A2& adj_buf, int& adj_i, T& adj_value) {} +inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, T value, const A2& adj_buf, int adj_i, T& adj_value) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, int j, T value, const A2& adj_buf, int& adj_i, int& adj_j, T& adj_value) {} +inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, int j, T value, const A2& adj_buf, int adj_i, int adj_j, T& adj_value) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value) {} +inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value) {} +inline CUDA_CALLABLE void adj_array_store(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, T value, const A2& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, T value, const A2& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, int j, T value, const A2& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, int j, T value, const A2& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, int j, 
int k, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_add(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, T value, const A2& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, T value, const A2& adj_buf, int adj_i, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, int j, T value, const A2& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, int j, T value, const A2& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) {} template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) {} +inline CUDA_CALLABLE void adj_atomic_sub(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) {} // generic handler for scalar values template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, T value, const A2& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, T value, const A2& adj_buf, int adj_i, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value); else if (buf.grad) @@ -1034,7 +1034,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, T value, const FP_VERIFY_ADJ_1(value, adj_value) } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, T value, const A2& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, T value, const A2& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value); else if (buf.grad) @@ -1043,7 +1043,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, T value FP_VERIFY_ADJ_2(value, adj_value) } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void 
adj_atomic_min(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value); else if (buf.grad) @@ -1052,7 +1052,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, int k, FP_VERIFY_ADJ_3(value, adj_value) } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value); else if (buf.grad) @@ -1062,7 +1062,7 @@ inline CUDA_CALLABLE void adj_atomic_min(const A1& buf, int i, int j, int k, } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, T value, const A2& adj_buf, int& adj_i, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, T value, const A2& adj_buf, int adj_i, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i), &index(adj_buf, i), value, adj_value); else if (buf.grad) @@ -1071,7 +1071,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, T value, const FP_VERIFY_ADJ_1(value, adj_value) } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, T value, const A2& adj_buf, int& adj_i, int& adj_j, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, T value, const A2& adj_buf, int adj_i, int adj_j, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i, j), &index(adj_buf, i, j), value, adj_value); else if (buf.grad) @@ -1080,7 +1080,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, T value FP_VERIFY_ADJ_2(value, adj_value) } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, int k, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, T& adj_value, const T& adj_ret) { if (adj_buf.data) adj_atomic_minmax(&index(buf, i, j, k), &index(adj_buf, i, j, k), value, adj_value); else if (buf.grad) @@ -1089,7 +1089,7 @@ inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, int k, FP_VERIFY_ADJ_3(value, adj_value) } template class A1, template class A2, typename T> -inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int& adj_i, int& adj_j, int& adj_k, int& adj_l, T& adj_value, const T& adj_ret) { +inline CUDA_CALLABLE void adj_atomic_max(const A1& buf, int i, int j, int k, int l, T value, const A2& adj_buf, int adj_i, int adj_j, int adj_k, int adj_l, T& adj_value, const T& adj_ret) { if (adj_buf.data) 
adj_atomic_minmax(&index(buf, i, j, k, l), &index(adj_buf, i, j, k, l), value, adj_value); else if (buf.grad) diff --git a/warp/native/builtin.h b/warp/native/builtin.h index bf12b765..6c8fb637 100644 --- a/warp/native/builtin.h +++ b/warp/native/builtin.h @@ -1608,35 +1608,73 @@ inline CUDA_CALLABLE void print(transform_t t) printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w)); } -inline CUDA_CALLABLE void adj_print(bool i, bool adj_i) { printf("%d adj: %d\n", i, adj_i); } -inline CUDA_CALLABLE void adj_print(int8 i, int8 adj_i) { printf("%hhd adj: %hhd\n", i, adj_i); } -inline CUDA_CALLABLE void adj_print(int i, int adj_i) { printf("%d adj: %d\n", i, adj_i); } -inline CUDA_CALLABLE void adj_print(float f, float adj_f) { printf("%g adj: %g\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(short f, short adj_f) { printf("%hd adj: %hd\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(long f, long adj_f) { printf("%ld adj: %ld\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(long long f, long long adj_f) { printf("%lld adj: %lld\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(uint8 i, uint8 adj_i) { printf("%hhu adj: %hhu\n", i, adj_i); } -inline CUDA_CALLABLE void adj_print(unsigned f, unsigned adj_f) { printf("%u adj: %u\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(unsigned short f, unsigned short adj_f) { printf("%hu adj: %hu\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(unsigned long f, unsigned long adj_f) { printf("%lu adj: %lu\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(unsigned long long f, unsigned long long adj_f) { printf("%llu adj: %llu\n", f, adj_f); } -inline CUDA_CALLABLE void adj_print(half h, half adj_h) { printf("%g adj: %g\n", half_to_float(h), half_to_float(adj_h)); } -inline CUDA_CALLABLE void adj_print(double f, double adj_f) { printf("%g adj: %g\n", f, adj_f); } +template +inline CUDA_CALLABLE void adj_print(const T& x, const T& adj_x) +{ + printf("adj: \n"); +} + +// note: adj_print() only prints the adjoint value, since the value itself gets printed in replay print() +inline CUDA_CALLABLE void adj_print(half x, half adj_x) { printf("adj: %g\n", half_to_float(adj_x)); } +inline CUDA_CALLABLE void adj_print(float x, float adj_x) { printf("adj: %g\n", adj_x); } +inline CUDA_CALLABLE void adj_print(double x, double adj_x) { printf("adj: %g\n", adj_x); } + +inline CUDA_CALLABLE void adj_print(signed char x, signed char adj_x) { printf("adj: %d\n", adj_x); } +inline CUDA_CALLABLE void adj_print(short x, short adj_x) { printf("adj: %d\n", adj_x); } +inline CUDA_CALLABLE void adj_print(int x, int adj_x) { printf("adj: %d\n", adj_x); } +inline CUDA_CALLABLE void adj_print(long x, long adj_x) { printf("adj: %ld\n", adj_x); } +inline CUDA_CALLABLE void adj_print(long long x, long long adj_x) { printf("adj: %lld\n", adj_x); } + +inline CUDA_CALLABLE void adj_print(unsigned char x, unsigned char adj_x) { printf("adj: %u\n", adj_x); } +inline CUDA_CALLABLE void adj_print(unsigned short x, unsigned short adj_x) { printf("adj: %u\n", adj_x); } +inline CUDA_CALLABLE void adj_print(unsigned x, unsigned adj_x) { printf("adj: %u\n", adj_x); } +inline CUDA_CALLABLE void adj_print(unsigned long x, unsigned long adj_x) { printf("adj: %lu\n", adj_x); } +inline CUDA_CALLABLE void adj_print(unsigned long long x, unsigned long long adj_x) { printf("adj: %llu\n", adj_x); } + +inline CUDA_CALLABLE void adj_print(bool x, bool adj_x) { printf("adj: %s\n", (adj_x ? 
"True" : "False")); } template -inline CUDA_CALLABLE void adj_print(vec_t v, vec_t& adj_v) { printf("%g %g adj: %g %g \n", v[0], v[1], adj_v[0], adj_v[1]); } +inline CUDA_CALLABLE void adj_print(const vec_t& v, const vec_t& adj_v) +{ + printf("adj:"); + for (unsigned i = 0; i < Length; i++) + printf(" %g", float(adj_v[i])); + printf("\n"); +} template -inline CUDA_CALLABLE void adj_print(mat_t m, mat_t& adj_m) { } +inline CUDA_CALLABLE void adj_print(const mat_t& m, const mat_t& adj_m) +{ + for (unsigned i = 0; i < Rows; i++) + { + if (i == 0) + printf("adj:"); + else + printf(" "); + for (unsigned j = 0; j < Cols; j++) + printf(" %g", float(adj_m.data[i][j])); + printf("\n"); + } +} template -inline CUDA_CALLABLE void adj_print(quat_t q, quat_t& adj_q) { printf("%g %g %g %g adj: %g %g %g %g\n", q.x, q.y, q.z, q.w, adj_q.x, adj_q.y, adj_q.z, adj_q.w); } +inline CUDA_CALLABLE void adj_print(const quat_t& q, const quat_t& adj_q) +{ + printf("adj: %g %g %g %g\n", float(adj_q.x), float(adj_q.y), float(adj_q.z), float(adj_q.w)); +} template -inline CUDA_CALLABLE void adj_print(transform_t t, transform_t& adj_t) {} - -inline CUDA_CALLABLE void adj_print(str t, str& adj_t) {} +inline CUDA_CALLABLE void adj_print(const transform_t& t, const transform_t& adj_t) +{ + printf("adj: (%g %g %g) (%g %g %g %g)\n", + float(adj_t.p[0]), float(adj_t.p[1]), float(adj_t.p[2]), + float(adj_t.q.x), float(adj_t.q.y), float(adj_t.q.z), float(adj_t.q.w)); +} +inline CUDA_CALLABLE void adj_print(str t, str& adj_t) +{ + printf("adj: %s\n", t); +} template inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected) diff --git a/warp/native/bvh.cu b/warp/native/bvh.cu index b8bc69f6..6a67287b 100644 --- a/warp/native/bvh.cu +++ b/warp/native/bvh.cu @@ -65,7 +65,7 @@ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __ int finished = atomicAdd(&child_count[parent], 1); // if we have are the last thread (such that the parent node is now complete) - // then update its bounds and move onto the the next parent in the hierarchy + // then update its bounds and move onto the next parent in the hierarchy if (finished == 1) { const int left_child = node_lowers[parent].i; @@ -273,7 +273,7 @@ __global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas } // if we have are the last thread (such that the parent node is now complete) - // then update its bounds and move onto the the next parent in the hierarchy + // then update its bounds and move onto the next parent in the hierarchy if (childCount == 1) { const int left_child = lowers[parent].i; diff --git a/warp/native/bvh.h b/warp/native/bvh.h index eed1ffd8..e2dca507 100644 --- a/warp/native/bvh.h +++ b/warp/native/bvh.h @@ -404,6 +404,10 @@ CUDA_CALLABLE inline bvh_query_t iter_reverse(const bvh_query_t& query) return query; } +CUDA_CALLABLE inline void adj_iter_reverse(const bvh_query_t& query, bvh_query_t& adj_query, bvh_query_t& adj_ret) +{ +} + // stub CUDA_CALLABLE inline void adj_bvh_query_next(bvh_query_t& query, int& index, bvh_query_t&, int&, bool&) diff --git a/warp/native/exports.h b/warp/native/exports.h index 17778056..f8fd82af 100644 --- a/warp/native/exports.h +++ b/warp/native/exports.h @@ -1013,6 +1013,23 @@ WP_API void builtin_volume_index_to_world_uint64_vec3f(uint64 id, vec3f& uvw, ve WP_API void builtin_volume_world_to_index_uint64_vec3f(uint64 id, vec3f& xyz, vec3f* ret) { *ret = wp::volume_world_to_index(id, xyz); } WP_API void builtin_volume_index_to_world_dir_uint64_vec3f(uint64 id, vec3f& uvw, 
vec3f* ret) { *ret = wp::volume_index_to_world_dir(id, uvw); } WP_API void builtin_volume_world_to_index_dir_uint64_vec3f(uint64 id, vec3f& xyz, vec3f* ret) { *ret = wp::volume_world_to_index_dir(id, xyz); } +WP_API void builtin_rand_init_int32(int32 seed, uint32* ret) { *ret = wp::rand_init(seed); } +WP_API void builtin_rand_init_int32_int32(int32 seed, int32 offset, uint32* ret) { *ret = wp::rand_init(seed, offset); } +WP_API void builtin_randi_uint32(uint32 state, int* ret) { *ret = wp::randi(state); } +WP_API void builtin_randi_uint32_int32_int32(uint32 state, int32 low, int32 high, int* ret) { *ret = wp::randi(state, low, high); } +WP_API void builtin_randf_uint32(uint32 state, float* ret) { *ret = wp::randf(state); } +WP_API void builtin_randf_uint32_float32_float32(uint32 state, float32 low, float32 high, float* ret) { *ret = wp::randf(state, low, high); } +WP_API void builtin_randn_uint32(uint32 state, float* ret) { *ret = wp::randn(state); } +WP_API void builtin_sample_triangle_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_triangle(state); } +WP_API void builtin_sample_unit_ring_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_unit_ring(state); } +WP_API void builtin_sample_unit_disk_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_unit_disk(state); } +WP_API void builtin_sample_unit_sphere_surface_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_sphere_surface(state); } +WP_API void builtin_sample_unit_sphere_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_sphere(state); } +WP_API void builtin_sample_unit_hemisphere_surface_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_hemisphere_surface(state); } +WP_API void builtin_sample_unit_hemisphere_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_hemisphere(state); } +WP_API void builtin_sample_unit_square_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_unit_square(state); } +WP_API void builtin_sample_unit_cube_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_cube(state); } +WP_API void builtin_poisson_uint32_float32(uint32 state, float32 lam, uint32* ret) { *ret = wp::poisson(state, lam); } WP_API void builtin_noise_uint32_float32(uint32 state, float32 x, float* ret) { *ret = wp::noise(state, x); } WP_API void builtin_noise_uint32_vec2f(uint32 state, vec2f& xy, float* ret) { *ret = wp::noise(state, xy); } WP_API void builtin_noise_uint32_vec3f(uint32 state, vec3f& xyz, float* ret) { *ret = wp::noise(state, xyz); } diff --git a/warp/native/hashgrid.h b/warp/native/hashgrid.h index 148f4ded..d5ed485b 100644 --- a/warp/native/hashgrid.h +++ b/warp/native/hashgrid.h @@ -209,6 +209,10 @@ CUDA_CALLABLE inline hash_grid_query_t iter_reverse(const hash_grid_query_t& que return query; } +CUDA_CALLABLE inline void adj_iter_reverse(const hash_grid_query_t& query, hash_grid_query_t& adj_query, hash_grid_query_t& adj_ret) +{ +} + CUDA_CALLABLE inline int hash_grid_point_id(uint64_t id, int& index) diff --git a/warp/native/mesh.cu b/warp/native/mesh.cu index 4ebdf3f3..3bfac181 100644 --- a/warp/native/mesh.cu +++ b/warp/native/mesh.cu @@ -101,7 +101,7 @@ __global__ void bvh_refit_with_solid_angle_kernel(int n, const int* __restrict__ int finished = atomicAdd(&child_count[parent], 1); // if we have are the last thread (such that the parent node is now complete) - // then update its bounds and move onto the the next parent in the hierarchy + // then update its bounds and move onto the next parent in the hierarchy if (finished == 1) { //printf("Compute non-leaf at %d\n", index); @@ 
-340,4 +340,4 @@ void mesh_set_velocities_device(uint64_t id, wp::array_t velocities) fprintf(stderr, "The mesh id provided to mesh_set_velocities_device is not valid!\n"); return; } -} \ No newline at end of file +} diff --git a/warp/native/mesh.h b/warp/native/mesh.h index 68680479..2f6ad0cb 100644 --- a/warp/native/mesh.h +++ b/warp/native/mesh.h @@ -1693,6 +1693,10 @@ CUDA_CALLABLE inline mesh_query_aabb_t iter_reverse(const mesh_query_aabb_t& que return query; } +CUDA_CALLABLE inline void adj_iter_reverse(const mesh_query_aabb_t& query, mesh_query_aabb_t& adj_query, mesh_query_aabb_t& adj_ret) +{ +} + // stub CUDA_CALLABLE inline void adj_mesh_query_aabb_next(mesh_query_aabb_t& query, int& index, mesh_query_aabb_t&, int&, bool&) diff --git a/warp/native/range.h b/warp/native/range.h index 408ad067..24458bdc 100644 --- a/warp/native/range.h +++ b/warp/native/range.h @@ -97,8 +97,17 @@ CUDA_CALLABLE inline range_t iter_reverse(const range_t& r) { // generates a reverse range, equivalent to reversed(range()) range_t rev; - rev.start = r.end-1; - rev.end = r.start-1; + + if (r.step > 0) + { + rev.start = r.start + int((r.end - r.start - 1) / r.step) * r.step; + } + else + { + rev.start = r.start + int((r.end - r.start + 1) / r.step) * r.step; + } + + rev.end = r.start - r.step; rev.step = -r.step; rev.i = rev.start; @@ -106,4 +115,8 @@ CUDA_CALLABLE inline range_t iter_reverse(const range_t& r) return rev; } +CUDA_CALLABLE inline void adj_iter_reverse(const range_t& r, range_t& adj_r, range_t& adj_ret) +{ +} + } // namespace wp \ No newline at end of file diff --git a/warp/sim/integrator_xpbd.py b/warp/sim/integrator_xpbd.py index ef585c29..d8d1d854 100644 --- a/warp/sim/integrator_xpbd.py +++ b/warp/sim/integrator_xpbd.py @@ -2808,12 +2808,8 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c with wp.ScopedTimer("simulate", False): if model.particle_count: - if requires_grad: - particle_q = state_out.particle_q - particle_qd = state_out.particle_qd - else: - particle_q = state_out.particle_q - particle_qd = state_out.particle_qd + particle_q = state_out.particle_q + particle_qd = state_out.particle_qd self.particle_q_init = wp.clone(state_in.particle_q) if self.enable_restitution: diff --git a/warp/sim/model.py b/warp/sim/model.py index 4d9df0fb..98a055dd 100644 --- a/warp/sim/model.py +++ b/warp/sim/model.py @@ -641,7 +641,7 @@ class Model: joint_dof_count (int): Total number of velocity degrees of freedom of all joints in the system joint_coord_count (int): Total number of position degrees of freedom of all joints in the system - particle_coloring (list of array): The coloring of all the particles, used for VBD's Gauss-Seidel interation. + particle_coloring (list of array): The coloring of all the particles, used for VBD's Gauss-Seidel iteration. 
device (wp.Device): Device on which the Model was allocated @@ -1404,9 +1404,8 @@ def add_builder(self, builder, xform=None, update_num_env_count=True, separate_c self.joint_X_p.extend(joint_X_p) self.joint_q.extend(joint_q) - self.add_articulation() - # offset the indices + self.articulation_start.extend([a + self.joint_count for a in builder.articulation_start]) self.joint_parent.extend([p + self.joint_count if p != -1 else -1 for p in builder.joint_parent]) self.joint_child.extend([c + self.joint_count for c in builder.joint_child]) diff --git a/warp/sparse.py b/warp/sparse.py index 37259e70..0b86bd17 100644 --- a/warp/sparse.py +++ b/warp/sparse.py @@ -106,7 +106,7 @@ def _setup_nnz_transfer(self): return BsrMatrix.__setattr__( - self, "_nnz_buf", wp.zeros(dtype=int, shape=(1,), device="cpu", pinned=self.device.is_cuda) + self, "_nnz_buf", wp.empty(dtype=int, shape=(1,), device="cpu", pinned=self.device.is_cuda) ) if self.device.is_cuda: BsrMatrix.__setattr__(self, "_nnz_event", wp.Event(self.device)) @@ -524,7 +524,7 @@ def _bsr_assign_split_blocks( if dest_block >= dest_offsets[dest_row_count]: return - dest_row = wp.lower_bound(dest_offsets, dest_block + 1) - 1 + dest_row = wp.lower_bound(dest_offsets, 0, dest_row_count + 1, dest_block + 1) - 1 src_row = dest_row // row_factor dest_col_in_row = dest_block - dest_offsets[dest_row] @@ -566,7 +566,7 @@ def _bsr_assign_merge_row_col( dest_rows[block] = -1 # invalid dest_cols[block] = -1 else: - row = wp.lower_bound(src_offsets, block + 1) - 1 + row = wp.lower_bound(src_offsets, 0, src_row_count + 1, block + 1) - 1 dest_rows[block] = row // row_factor dest_cols[block] = src_columns[block] // col_factor @@ -589,7 +589,7 @@ def _bsr_assign_merge_blocks( if src_block >= src_offsets[src_row_count]: return - src_row = wp.lower_bound(src_offsets, src_block + 1) - 1 + src_row = wp.lower_bound(src_offsets, 0, src_row_count + 1, src_block + 1) - 1 src_col = src_columns[src_block] dest_row = src_row // row_factor @@ -828,7 +828,7 @@ def bsr_copy( block_type=block_type, device=A.device, ) - bsr_assign(dest=copy, src=A) + bsr_assign(dest=copy, src=A, structure_only=structure_only) return copy @@ -1190,7 +1190,7 @@ def _bsr_get_block_row(dest_offset: int, row_count: int, bsr_offsets: wp.array(d if i >= bsr_offsets[row_count]: rows[dest_offset + i] = -1 # invalid else: - row = wp.lower_bound(bsr_offsets, i + 1) - 1 + row = wp.lower_bound(bsr_offsets, 0, row_count + 1, i + 1) - 1 rows[dest_offset + i] = row @@ -1461,13 +1461,14 @@ def _bsr_mm_compute_values( y_offsets: wp.array(dtype=int), y_columns: wp.array(dtype=int), y_values: wp.array(dtype=Any), + mm_row_count: int, mm_offsets: wp.array(dtype=int), mm_cols: wp.array(dtype=int), mm_values: wp.array(dtype=Any), ): mm_block = wp.tid() - row = wp.lower_bound(mm_offsets, mm_block + 1) - 1 + row = wp.lower_bound(mm_offsets, 0, mm_row_count + 1, mm_block + 1) - 1 col = mm_cols[mm_block] mm_val = mm_values.dtype(type(alpha)(0.0)) @@ -1759,6 +1760,7 @@ def bsr_mm( work_arrays._old_z_offsets if y == z else y.offsets, work_arrays._old_z_columns if y == z else y.columns, work_arrays._old_z_values if y == z else y.values, + z.nrow, z.offsets, z.columns, mm_values, diff --git a/warp/stubs.py b/warp/stubs.py index 77e1c548..eb9cdb6f 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -11,9 +11,6 @@ Rows = TypeVar("Rows", bound=int) Cols = TypeVar("Cols", bound=int) DType = TypeVar("DType") -Int = TypeVar("Int") -Float = TypeVar("Float") -Scalar = TypeVar("Scalar") Vector = Generic[Length, Scalar] Matrix = 
Generic[Rows, Cols, Scalar] Quaternion = Generic[Float] @@ -39,6 +36,8 @@ from warp.types import spatial_vector, spatial_vectorh, spatial_vectorf, spatial_vectord from warp.types import spatial_matrix, spatial_matrixh, spatial_matrixf, spatial_matrixd +from warp.types import Int, Float, Scalar + from warp.types import Bvh, Mesh, HashGrid, Volume, MarchingCubes from warp.types import BvhQuery, HashGridQuery, MeshQueryAABB, MeshQueryPoint, MeshQueryRay @@ -786,7 +785,8 @@ def transform_point(mat: Matrix[4, 4, Float], point: Vector[3, Float]) -> Vector """Apply the transform to a point ``point`` treating the homogeneous coordinate as w=1. The transformation is applied treating ``point`` as a column vector, e.g.: ``y = mat*point``. - Note this is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = point^T*mat^T``. + + This is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = point^T*mat^T``. If the transform is coming from a library that uses row-vectors, then users should transpose the transformation matrix before calling this method. """ @@ -803,8 +803,9 @@ def transform_vector(xform: Transformation[Float], vec: Vector[3, Float]) -> Vec def transform_vector(mat: Matrix[4, 4, Float], vec: Vector[3, Float]) -> Vector[3, Float]: """Apply the transform to a vector ``vec`` treating the homogeneous coordinate as w=0. - The transformation is applied treating ``vec`` as a column vector, e.g.: ``y = mat*vec`` - note this is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = vec^T*mat^T``. + The transformation is applied treating ``vec`` as a column vector, e.g.: ``y = mat*vec``. + + This is in contrast to some libraries, notably USD, which applies transforms to row vectors, ``y^T = vec^T*mat^T``. If the transform is coming from a library that uses row-vectors, then users should transpose the transformation matrix before calling this method. """ @@ -1604,6 +1605,12 @@ def closest_point_edge_edge(p1: vec3f, q1: vec3f, p2: vec3f, q2: vec3f, epsilon: ... +@over +def reversed(range: range_t) -> range_t: + """Returns the range in reversed order.""" + ... + + @over def volume_sample(id: uint64, uvw: vec3f, sampling_mode: int32, dtype: Any) -> Any: """Sample the volume of type `dtype` given by ``id`` at the volume local-space point ``uvw``. @@ -2082,361 +2089,361 @@ def select(arr: Array[Any], value_if_false: Any, value_if_true: Any) -> Any: @over -def atomic_add(arr: Array[Any], i: int32, value: Any) -> Any: +def atomic_add(arr: Array[Any], i: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i]`` and return the old value.""" ... @over -def atomic_add(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_add(arr: Array[Any], i: Int, j: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j]`` and return the old value.""" ... @over -def atomic_add(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_add(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value.""" ... @over -def atomic_add(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_add(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value.""" ... 
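# Illustrative sketch (not part of the patch): the index arguments in the stubs above are
# widened from int32 to the generic Int type, so atomics and array indexing accept any
# signed or unsigned integer width, as exercised by the kernel_indexing_types test added
# further below. Kernel and array names here (scatter_add, values, out) are hypothetical.

import warp as wp


@wp.kernel
def scatter_add(values: wp.array(dtype=wp.int32), out: wp.array(dtype=wp.int32)):
    tid = wp.tid()
    # 64-bit and 8-bit indices are now accepted alongside the default int32
    wp.atomic_add(out, wp.int64(tid), values[tid])
    wp.atomic_add(out, wp.uint8(0), 1)


values = wp.array([1, 2, 3, 4], dtype=wp.int32)
out = wp.zeros(8, dtype=wp.int32)
wp.launch(scatter_add, dim=values.shape[0], inputs=[values, out])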
@over -def atomic_add(arr: FabricArray[Any], i: int32, value: Any) -> Any: +def atomic_add(arr: FabricArray[Any], i: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i]`` and return the old value.""" ... @over -def atomic_add(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_add(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j]`` and return the old value.""" ... @over -def atomic_add(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_add(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value.""" ... @over -def atomic_add(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_add(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value.""" ... @over -def atomic_add(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: +def atomic_add(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i]`` and return the old value.""" ... @over -def atomic_add(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_add(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j]`` and return the old value.""" ... @over -def atomic_add(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_add(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j,k]`` and return the old value.""" ... @over -def atomic_add(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_add(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Atomically add ``value`` onto ``arr[i,j,k,l]`` and return the old value.""" ... @over -def atomic_sub(arr: Array[Any], i: int32, value: Any) -> Any: +def atomic_sub(arr: Array[Any], i: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i]`` and return the old value.""" ... @over -def atomic_sub(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_sub(arr: Array[Any], i: Int, j: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value.""" ... @over -def atomic_sub(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_sub(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value.""" ... @over -def atomic_sub(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_sub(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value.""" ... @over -def atomic_sub(arr: FabricArray[Any], i: int32, value: Any) -> Any: +def atomic_sub(arr: FabricArray[Any], i: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i]`` and return the old value.""" ... @over -def atomic_sub(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_sub(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value.""" ... 
@over -def atomic_sub(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_sub(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value.""" ... @over -def atomic_sub(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_sub(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value.""" ... @over -def atomic_sub(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: +def atomic_sub(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i]`` and return the old value.""" ... @over -def atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_sub(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j]`` and return the old value.""" ... @over -def atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_sub(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j,k]`` and return the old value.""" ... @over -def atomic_sub(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_sub(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Atomically subtract ``value`` onto ``arr[i,j,k,l]`` and return the old value.""" ... @over -def atomic_min(arr: Array[Any], i: int32, value: Any) -> Any: +def atomic_min(arr: Array[Any], i: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_min(arr: Array[Any], i: Int, j: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_min(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_min(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... 
@over -def atomic_min(arr: FabricArray[Any], i: int32, value: Any) -> Any: +def atomic_min(arr: FabricArray[Any], i: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_min(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_min(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_min(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: +def atomic_min(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_min(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_min(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... 
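# Illustrative sketch (not part of the patch): the warp/sparse.py hunks earlier in this
# patch replace wp.lower_bound(arr, value) with the bounded overload
# wp.lower_bound(arr, arr_begin, arr_end, value), so the binary search only covers the
# valid prefix of an over-allocated offsets array. Kernel and argument names are hypothetical.

import warp as wp


@wp.kernel
def block_to_row(offsets: wp.array(dtype=int), row_count: int, rows: wp.array(dtype=int)):
    block = wp.tid()
    # previously: row = wp.lower_bound(offsets, block + 1) - 1, which searches the whole
    # array; restricting the search to [0, row_count + 1) ignores any trailing padding
    row = wp.lower_bound(offsets, 0, row_count + 1, block + 1) - 1
    rows[block] = row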
@over -def atomic_min(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_min(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Compute the minimum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: Array[Any], i: int32, value: Any) -> Any: +def atomic_max(arr: Array[Any], i: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: Array[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_max(arr: Array[Any], i: Int, j: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_max(arr: Array[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: Array[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_max(arr: Array[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: FabricArray[Any], i: int32, value: Any) -> Any: +def atomic_max(arr: FabricArray[Any], i: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: FabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_max(arr: FabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_max(arr: FabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. 
note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: FabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_max(arr: FabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: IndexedFabricArray[Any], i: int32, value: Any) -> Any: +def atomic_max(arr: IndexedFabricArray[Any], i: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, value: Any) -> Any: +def atomic_max(arr: IndexedFabricArray[Any], i: Int, j: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, value: Any) -> Any: +def atomic_max(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j,k]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @over -def atomic_max(arr: IndexedFabricArray[Any], i: int32, j: int32, k: int32, l: int32, value: Any) -> Any: +def atomic_max(arr: IndexedFabricArray[Any], i: Int, j: Int, k: Int, l: Int, value: Any) -> Any: """Compute the maximum of ``value`` and ``arr[i,j,k,l]``, atomically update the array, and return the old value. - .. note:: The operation is only atomic on a per-component basis for vectors and matrices. + The operation is only atomic on a per-component basis for vectors and matrices. """ ... @@ -2945,12 +2952,11 @@ def tile_ifft(inout: Tile) -> Tile: def static(expr: Any) -> Any: """Evaluates a static Python expression and replaces it with its result. - See the `codegen.html#static-expressions
`_ for more details. + See the :ref:`code generation guide ` for more details. - Note: - The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, - which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. - The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions - (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). + The inner expression must only reference variables that are available from the current scope where the Warp kernel or function containing the expression is defined, + which includes constant variables and variables captured in the current closure in which the function or kernel is implemented. + The return type of the expression must be either a Warp function, a string, or a type that is supported inside Warp kernels and functions + (excluding Warp arrays since they cannot be created in a Warp kernel at the moment). """ ... diff --git a/warp/tests/test_array.py b/warp/tests/test_array.py index 77721ca5..3ffddb71 100644 --- a/warp/tests/test_array.py +++ b/warp/tests/test_array.py @@ -2609,6 +2609,87 @@ def test_numpy_array_interface(test, device): assert a1.strides == a2.strides +@wp.kernel +def kernel_indexing_types( + arr_1d: wp.array(dtype=wp.int32, ndim=1), + arr_2d: wp.array(dtype=wp.int32, ndim=2), + arr_3d: wp.array(dtype=wp.int32, ndim=3), + arr_4d: wp.array(dtype=wp.int32, ndim=4), +): + x = arr_1d[wp.uint8(0)] + y = arr_1d[wp.int16(1)] + z = arr_1d[wp.uint32(2)] + w = arr_1d[wp.int64(3)] + + x = arr_2d[wp.uint8(0), wp.uint8(0)] + y = arr_2d[wp.int16(1), wp.int16(1)] + z = arr_2d[wp.uint32(2), wp.uint32(2)] + w = arr_2d[wp.int64(3), wp.int64(3)] + + x = arr_3d[wp.uint8(0), wp.uint8(0), wp.uint8(0)] + y = arr_3d[wp.int16(1), wp.int16(1), wp.int16(1)] + z = arr_3d[wp.uint32(2), wp.uint32(2), wp.uint32(2)] + w = arr_3d[wp.int64(3), wp.int64(3), wp.int64(3)] + + x = arr_4d[wp.uint8(0), wp.uint8(0), wp.uint8(0), wp.uint8(0)] + y = arr_4d[wp.int16(1), wp.int16(1), wp.int16(1), wp.int16(1)] + z = arr_4d[wp.uint32(2), wp.uint32(2), wp.uint32(2), wp.uint32(2)] + w = arr_4d[wp.int64(3), wp.int64(3), wp.int64(3), wp.int64(3)] + + arr_1d[wp.uint8(0)] = 123 + arr_1d[wp.int16(1)] = 123 + arr_1d[wp.uint32(2)] = 123 + arr_1d[wp.int64(3)] = 123 + + arr_2d[wp.uint8(0), wp.uint8(0)] = 123 + arr_2d[wp.int16(1), wp.int16(1)] = 123 + arr_2d[wp.uint32(2), wp.uint32(2)] = 123 + arr_2d[wp.int64(3), wp.int64(3)] = 123 + + arr_3d[wp.uint8(0), wp.uint8(0), wp.uint8(0)] = 123 + arr_3d[wp.int16(1), wp.int16(1), wp.int16(1)] = 123 + arr_3d[wp.uint32(2), wp.uint32(2), wp.uint32(2)] = 123 + arr_3d[wp.int64(3), wp.int64(3), wp.int64(3)] = 123 + + arr_4d[wp.uint8(0), wp.uint8(0), wp.uint8(0), wp.uint8(0)] = 123 + arr_4d[wp.int16(1), wp.int16(1), wp.int16(1), wp.int16(1)] = 123 + arr_4d[wp.uint32(2), wp.uint32(2), wp.uint32(2), wp.uint32(2)] = 123 + arr_4d[wp.int64(3), wp.int64(3), wp.int64(3), wp.int64(3)] = 123 + + wp.atomic_add(arr_1d, wp.uint8(0), 123) + wp.atomic_sub(arr_1d, wp.int16(1), 123) + wp.atomic_min(arr_1d, wp.uint32(2), 123) + wp.atomic_max(arr_1d, wp.int64(3), 123) + + wp.atomic_add(arr_2d, wp.uint8(0), wp.uint8(0), 123) + wp.atomic_sub(arr_2d, wp.int16(1), wp.int16(1), 123) + wp.atomic_min(arr_2d, wp.uint32(2), wp.uint32(2), 123) + wp.atomic_max(arr_2d, wp.int64(3), wp.int64(3), 123) + + 
wp.atomic_add(arr_3d, wp.uint8(0), wp.uint8(0), wp.uint8(0), 123) + wp.atomic_sub(arr_3d, wp.int16(1), wp.int16(1), wp.int16(1), 123) + wp.atomic_min(arr_3d, wp.uint32(2), wp.uint32(2), wp.uint32(2), 123) + wp.atomic_max(arr_3d, wp.int64(3), wp.int64(3), wp.int64(3), 123) + + wp.atomic_add(arr_4d, wp.uint8(0), wp.uint8(0), wp.uint8(0), wp.uint8(0), 123) + wp.atomic_sub(arr_4d, wp.int16(1), wp.int16(1), wp.int16(1), wp.int16(1), 123) + wp.atomic_min(arr_4d, wp.uint32(2), wp.uint32(2), wp.uint32(2), wp.uint32(2), 123) + wp.atomic_max(arr_4d, wp.int64(3), wp.int64(3), wp.int64(3), wp.int64(3), 123) + + +def test_indexing_types(test, device): + arr_1d = wp.zeros(shape=(4,), dtype=wp.int32, device=device) + arr_2d = wp.zeros(shape=(4, 4), dtype=wp.int32, device=device) + arr_3d = wp.zeros(shape=(4, 4, 4), dtype=wp.int32, device=device) + arr_4d = wp.zeros(shape=(4, 4, 4, 4), dtype=wp.int32, device=device) + wp.launch( + kernel=kernel_indexing_types, + dim=1, + inputs=(arr_1d, arr_2d, arr_3d, arr_4d), + device=device, + ) + + devices = get_test_devices() @@ -2675,6 +2756,7 @@ def test_array_new_del(self): add_function_test(TestArray, "test_array_from_int32_domain", test_array_from_int32_domain, devices=devices) add_function_test(TestArray, "test_array_from_int64_domain", test_array_from_int64_domain, devices=devices) +add_function_test(TestArray, "test_indexing_types", test_indexing_types, devices=devices) try: import torch diff --git a/warp/tests/test_codegen.py b/warp/tests/test_codegen.py index e3552ad2..db0bdee7 100644 --- a/warp/tests/test_codegen.py +++ b/warp/tests/test_codegen.py @@ -503,6 +503,76 @@ def dynamic_loop_kernel(n: int, input: wp.array(dtype=float)): ): wp.launch(dynamic_loop_kernel, dim=1, inputs=[3, inputs], device=device) + # the following nested loop must not raise an error + const_a = 7 + const_b = 5 + + @wp.kernel + def mixed_dyn_static_loop_kernel(dyn_a: int, dyn_b: int, dyn_c: int, output: wp.array(dtype=float, ndim=2)): + tid = wp.tid() + for i in range(const_a + 1): + for j in range(dyn_a + 1): + for k in range(dyn_b + 1): + for l in range(const_b + 1): + for m in range(dyn_c + 1): + coeff = i + j + k + l + m + output[tid, coeff] = 1.0 + + dyn_a, dyn_b, dyn_c = 3, 4, 5 + num_threads = 10 + output = wp.empty([num_threads, const_a + const_b + dyn_a + dyn_b + dyn_c + 1], dtype=float, device=device) + wp.launch( + mixed_dyn_static_loop_kernel, + num_threads, + inputs=[ + dyn_a, + dyn_b, + dyn_c, + ], + outputs=[output], + device=device, + ) + assert_np_equal(output.numpy(), np.ones([num_threads, const_a + const_b + dyn_a + dyn_b + dyn_c + 1])) + + @wp.kernel + def static_then_dynamic_loop_kernel(mats: wp.array(dtype=wp.mat33d)): + tid = wp.tid() + mat = wp.mat33d() + for i in range(3): + for j in range(3): + mat[i, j] = wp.float64(0.0) + + dim = 2 + for i in range(dim + 1): + for j in range(dim + 1): + mat[i, j] = wp.float64(1.0) + + mats[tid] = mat + + mats = wp.empty(1, dtype=wp.mat33d, device=device) + wp.launch(static_then_dynamic_loop_kernel, dim=1, inputs=[mats], device=device) + assert_np_equal(mats.numpy(), np.ones((1, 3, 3))) + + @wp.kernel + def dynamic_then_static_loop_kernel(mats: wp.array(dtype=wp.mat33d)): + tid = wp.tid() + mat = wp.mat33d() + + dim = 2 + for i in range(dim + 1): + for j in range(dim + 1): + mat[i, j] = wp.float64(1.0) + + for i in range(3): + for j in range(3): + mat[i, j] = wp.float64(0.0) + + mats[tid] = mat + + mats = wp.empty(1, dtype=wp.mat33d, device=device) + wp.launch(dynamic_then_static_loop_kernel, dim=1, inputs=[mats], 
device=device) + assert_np_equal(mats.numpy(), np.zeros((1, 3, 3))) + @wp.kernel def test_call_syntax(): diff --git a/warp/tests/test_fabricarray.py b/warp/tests/test_fabricarray.py index 0bf0f41f..6ffe04a2 100644 --- a/warp/tests/test_fabricarray.py +++ b/warp/tests/test_fabricarray.py @@ -821,6 +821,38 @@ def test_fabricarray_fill_matrix(test, device): assert_np_equal(ifb.numpy(), np.zeros((*ifb.shape, *mat_shape), dtype=nptype)) +@wp.kernel +def fa_kernel_indexing_types( + a: wp.fabricarray(dtype=wp.int32), +): + x = a[wp.uint8(0)] + y = a[wp.int16(1)] + z = a[wp.uint32(2)] + w = a[wp.int64(3)] + + a[wp.uint8(0)] = 123 + a[wp.int16(1)] = 123 + a[wp.uint32(2)] = 123 + a[wp.int64(3)] = 123 + + wp.atomic_add(a, wp.uint8(0), 123) + wp.atomic_sub(a, wp.int16(1), 123) + # wp.atomic_min(a, wp.uint32(2), 123) + # wp.atomic_max(a, wp.int64(3), 123) + + +def test_fabricarray_indexing_types(test, device): + data = wp.zeros(shape=(4,), dtype=wp.int32, device=device) + iface = _create_fabric_array_interface(data, "foo", copy=True) + fa = wp.fabricarray(data=iface, attrib="foo") + wp.launch( + kernel=fa_kernel_indexing_types, + dim=1, + inputs=(fa,), + device=device, + ) + + @wp.kernel def fa_generic_sums_kernel(a: wp.fabricarrayarray(dtype=Any), sums: wp.array(dtype=Any)): i = wp.tid() @@ -945,6 +977,7 @@ def test_fabricarray_new_del(self): add_function_test(TestFabricArray, "test_fabricarray_fill_scalar", test_fabricarray_fill_scalar, devices=devices) add_function_test(TestFabricArray, "test_fabricarray_fill_vector", test_fabricarray_fill_vector, devices=devices) add_function_test(TestFabricArray, "test_fabricarray_fill_matrix", test_fabricarray_fill_matrix, devices=devices) +add_function_test(TestFabricArray, "test_fabricarray_indexing_types", test_fabricarray_indexing_types, devices=devices) # fabric arrays of arrays add_function_test(TestFabricArray, "test_fabricarrayarray", test_fabricarrayarray, devices=devices) diff --git a/warp/tests/test_fem.py b/warp/tests/test_fem.py index 63e3cde9..e8e96ece 100644 --- a/warp/tests/test_fem.py +++ b/warp/tests/test_fem.py @@ -28,6 +28,9 @@ ) from warp.tests.unittest_utils import * +vec6f = wp.vec(length=6, dtype=float) +mat66f = wp.mat(shape=(6, 6), dtype=float) + @integrand def linear_form(s: Sample, u: Field): @@ -1507,7 +1510,7 @@ def test_implicit_fields(test, device): @wp.kernel def test_qr_eigenvalues(): - tol = 1.0e-6 + tol = 1.0e-8 # zero Zero = wp.mat33(0.0) @@ -1546,6 +1549,19 @@ def test_qr_eigenvalues(): Err4 = wp.transpose(P4) * wp.diag(D4) * P4 - Rank4 wp.expect_near(wp.ddot(Err4, Err4), 0.0, tol) + # test robustness to low requested tolerance + Rank6 = mat66f( + vec6f(0.00171076, 0.0, 0.0, 0.0, 0.0, 0.0), + vec6f(0.0, 0.00169935, 6.14367e-06, -3.52589e-05, 3.02397e-05, -1.53458e-11), + vec6f(0.0, 6.14368e-06, 0.00172217, 2.03568e-05, 1.74589e-05, -2.92627e-05), + vec6f(0.0, -3.52589e-05, 2.03568e-05, 0.00172178, 2.53422e-05, 3.02397e-05), + vec6f(0.0, 3.02397e-05, 1.74589e-05, 2.53422e-05, 0.00171114, 3.52589e-05), + vec6f(0.0, 6.42993e-12, -2.92627e-05, 3.02397e-05, 3.52589e-05, 0.00169935), + ) + D6, P6 = symmetric_eigenvalues_qr(Rank6, 0.0) + Err6 = wp.transpose(P6) * wp.diag(D6) * P6 - Rank6 + wp.expect_near(wp.ddot(Err6, Err6), 0.0, 1.0e-13) + @wp.kernel def test_qr_inverse(): diff --git a/warp/tests/test_func.py b/warp/tests/test_func.py index 495e0a9c..631fe769 100644 --- a/warp/tests/test_func.py +++ b/warp/tests/test_func.py @@ -7,7 +7,7 @@ import math import unittest -from typing import Tuple +from typing import Any, Tuple import 
numpy as np @@ -191,6 +191,37 @@ def test_user_func_return_multiple_values(): wp.expect_eq(b, 54756.0) +@wp.func +def user_func_overload( + b: wp.array(dtype=Any), + i: int, +): + return b[i] * 2.0 + + +@wp.kernel +def user_func_overload_resolution_kernel( + a: wp.array(dtype=Any), + b: wp.array(dtype=Any), +): + i = wp.tid() + a[i] = user_func_overload(b, i) + + +def test_user_func_overload_resolution(test, device): + a0 = wp.array((1, 2, 3), dtype=wp.vec3) + b0 = wp.array((2, 3, 4), dtype=wp.vec3) + + a1 = wp.array((5,), dtype=float) + b1 = wp.array((6,), dtype=float) + + wp.launch(user_func_overload_resolution_kernel, a0.shape, (a0, b0)) + wp.launch(user_func_overload_resolution_kernel, a1.shape, (a1, b1)) + + assert_np_equal(a0.numpy()[0], (4, 6, 8)) + assert a1.numpy()[0] == 12 + + devices = get_test_devices() @@ -375,6 +406,9 @@ def test_native_function_error_resolution(self): dim=1, devices=devices, ) +add_function_test( + TestFunc, func=test_user_func_overload_resolution, name="test_user_func_overload_resolution", devices=devices +) if __name__ == "__main__": diff --git a/warp/tests/test_generics.py b/warp/tests/test_generics.py index ed769338..1b5ab9ac 100644 --- a/warp/tests/test_generics.py +++ b/warp/tests/test_generics.py @@ -522,6 +522,57 @@ def kernel(): ) +@wp.func +def vec_int_annotation_func(v: wp.vec(3, wp.Int)) -> wp.Int: + return v[0] + v[1] + v[2] + + +@wp.func +def vec_float_annotation_func(v: wp.vec(3, wp.Float)) -> wp.Float: + return v[0] + v[1] + v[2] + + +@wp.func +def vec_scalar_annotation_func(v: wp.vec(3, wp.Scalar)) -> wp.Scalar: + return v[0] + v[1] + v[2] + + +@wp.func +def mat_int_annotation_func(m: wp.mat((2, 2), wp.Int)) -> wp.Int: + return m[0, 0] + m[0, 1] + m[1, 0] + m[1, 1] + + +@wp.func +def mat_float_annotation_func(m: wp.mat((2, 2), wp.Float)) -> wp.Float: + return m[0, 0] + m[0, 1] + m[1, 0] + m[1, 1] + + +@wp.func +def mat_scalar_annotation_func(m: wp.mat((2, 2), wp.Scalar)) -> wp.Scalar: + return m[0, 0] + m[0, 1] + m[1, 0] + m[1, 1] + + +mat22s = wp.mat((2, 2), wp.int16) +mat22d = wp.mat((2, 2), wp.float64) + + +@wp.kernel +def test_annotations_kernel(): + vi16 = wp.vec3s(wp.int16(1), wp.int16(2), wp.int16(3)) + vf64 = wp.vec3d(wp.float64(1), wp.float64(2), wp.float64(3)) + wp.expect_eq(vec_int_annotation_func(vi16), wp.int16(6)) + wp.expect_eq(vec_float_annotation_func(vf64), wp.float64(6)) + wp.expect_eq(vec_scalar_annotation_func(vi16), wp.int16(6)) + wp.expect_eq(vec_scalar_annotation_func(vf64), wp.float64(6)) + + mi16 = mat22s(wp.int16(1), wp.int16(2), wp.int16(3), wp.int16(4)) + mf64 = mat22d(wp.float64(1), wp.float64(2), wp.float64(3), wp.float64(4)) + wp.expect_eq(mat_int_annotation_func(mi16), wp.int16(10)) + wp.expect_eq(mat_float_annotation_func(mf64), wp.float64(10)) + wp.expect_eq(mat_scalar_annotation_func(mi16), wp.int16(10)) + wp.expect_eq(mat_scalar_annotation_func(mf64), wp.float64(10)) + + class TestGenerics(unittest.TestCase): pass @@ -590,6 +641,7 @@ class TestGenerics(unittest.TestCase): ) add_function_test(TestGenerics, "test_type_operator_misspell", test_type_operator_misspell, devices=devices) add_function_test(TestGenerics, "test_type_attribute_error", test_type_attribute_error, devices=devices) +add_kernel_test(TestGenerics, name="test_annotations_kernel", kernel=test_annotations_kernel, dim=1, devices=devices) if __name__ == "__main__": wp.clear_kernel_cache() diff --git a/warp/tests/test_iter.py b/warp/tests/test_iter.py new file mode 100644 index 00000000..32a066b4 --- /dev/null +++ b/warp/tests/test_iter.py @@ 
-0,0 +1,68 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import unittest + +import warp as wp +from warp.tests.unittest_utils import * + + +@wp.kernel +def reversed_kernel( + start: wp.int32, + end: wp.int32, + step: wp.int32, + out_count: wp.array(dtype=wp.int32), + out_values: wp.array(dtype=wp.int32), +): + count = wp.int32(0) + for i in reversed(range(start, end, step)): + out_values[count] = i + count += 1 + + out_count[0] = count + + +def test_reversed(test, device): + count = wp.empty(1, dtype=wp.int32) + values = wp.empty(32, dtype=wp.int32) + + start, end, step = (-2, 8, 3) + wp.launch( + reversed_kernel, + dim=1, + inputs=(start, end, step), + outputs=(count, values), + ) + expected = tuple(reversed(range(start, end, step))) + assert count.numpy()[0] == len(expected) + assert_np_equal(values.numpy()[: len(expected)], expected) + + start, end, step = (9, -3, -2) + wp.launch( + reversed_kernel, + dim=1, + inputs=(start, end, step), + outputs=(count, values), + ) + expected = tuple(reversed(range(start, end, step))) + assert count.numpy()[0] == len(expected) + assert_np_equal(values.numpy()[: len(expected)], expected) + + +devices = get_test_devices() + + +class TestIter(unittest.TestCase): + pass + + +add_function_test(TestIter, "test_reversed", test_reversed, devices=devices) + +if __name__ == "__main__": + wp.clear_kernel_cache() + unittest.main(verbosity=2) diff --git a/warp/tests/test_model.py b/warp/tests/test_model.py index dde81889..da872a6c 100644 --- a/warp/tests/test_model.py +++ b/warp/tests/test_model.py @@ -157,6 +157,19 @@ def add_three_cubes(builder: ModelBuilder, parent_body=-1): assert builder.body_mass == [1.0, 4.0] assert builder.body_inv_mass == [1.0, 0.25] + # create another builder, test add_builder function + builder2 = ModelBuilder() + builder2.add_builder(builder) + assert builder2.articulation_count == builder.articulation_count + assert builder2.joint_count == builder.joint_count + assert builder2.body_count == builder.body_count + assert builder2.shape_count == builder.shape_count + assert builder2.articulation_start == builder.articulation_start + # add the same builder again + builder2.add_builder(builder) + assert builder2.articulation_count == 2 * builder.articulation_count + assert builder2.articulation_start == [0, 1, 2, 3] + if __name__ == "__main__": wp.clear_kernel_cache() diff --git a/warp/tests/test_print.py b/warp/tests/test_print.py index 542db95b..e5431684 100644 --- a/warp/tests/test_print.py +++ b/warp/tests/test_print.py @@ -7,6 +7,7 @@ import sys import unittest +from typing import Any import warp as wp from warp.tests.unittest_utils import * @@ -126,6 +127,139 @@ def test_print_boolean(test, device): test.assertRegex(s, rf"True{os.linesep}False{os.linesep}") +@wp.kernel +def generic_print_kernel(x: Any): + print(x) + + +@wp.struct +class SimpleStruct: + x: float + y: float + + +generic_print_types = [*wp.types.scalar_types] +for scalar_type in wp.types.scalar_types: + generic_print_types.append(wp.types.vector(2, scalar_type)) + generic_print_types.append(wp.types.vector(3, scalar_type)) + generic_print_types.append(wp.types.vector(4, scalar_type)) + 
generic_print_types.append(wp.types.matrix((2, 2), scalar_type)) + generic_print_types.append(wp.types.matrix((3, 3), scalar_type)) + generic_print_types.append(wp.types.matrix((4, 4), scalar_type)) +generic_print_types.append(wp.bool) +generic_print_types.append(SimpleStruct) +generic_print_types.append(wp.array(dtype=float)) + +for T in generic_print_types: + wp.overload(generic_print_kernel, [T]) + + +def test_print_adjoint(test, device): + for scalar_type in wp.types.scalar_types: + # scalar + capture = StdOutCapture() + capture.begin() + wp.launch( + generic_print_kernel, + dim=1, + inputs=[scalar_type(17)], + adj_inputs=[scalar_type(42)], + adjoint=True, + device=device, + ) + wp.synchronize_device(device) + s = capture.end() + + # We skip the win32 comparison for now since the capture sometimes is an empty string + if sys.platform != "win32": + test.assertRegex(s, rf"17{os.linesep}adj: 42{os.linesep}") + + for dim in (2, 3, 4): + # vector + vec_type = wp.types.vector(dim, scalar_type) + vec_data = np.arange(vec_type._length_, dtype=wp.dtype_to_numpy(scalar_type)) + v = vec_type(vec_data) + adj_v = vec_type(vec_data[::-1]) + + capture = StdOutCapture() + capture.begin() + wp.launch(generic_print_kernel, dim=1, inputs=[v], adj_inputs=[adj_v], adjoint=True, device=device) + wp.synchronize_device(device) + s = capture.end() + + # We skip the win32 comparison for now since the capture sometimes is an empty string + if sys.platform != "win32": + expected_forward = " ".join(str(int(x)) for x in v) + " " + expected_adjoint = " ".join(str(int(x)) for x in adj_v) + test.assertRegex(s, rf"{expected_forward}{os.linesep}adj: {expected_adjoint}{os.linesep}") + + # matrix + mat_type = wp.types.matrix((dim, dim), scalar_type) + mat_data = np.arange(mat_type._length_, dtype=wp.dtype_to_numpy(scalar_type)) + m = mat_type(mat_data) + adj_m = mat_type(mat_data[::-1]) + + capture = StdOutCapture() + capture.begin() + wp.launch(generic_print_kernel, dim=1, inputs=[m], adj_inputs=[adj_m], adjoint=True, device=device) + wp.synchronize_device(device) + s = capture.end() + + # We skip the win32 comparison for now since the capture sometimes is an empty string + if sys.platform != "win32": + expected_forward = "" + expected_adjoint = "" + for row in range(dim): + if row == 0: + adj_prefix = "adj: " + else: + adj_prefix = " " + expected_forward += " ".join(str(int(x)) for x in m[row]) + f" {os.linesep}" + expected_adjoint += adj_prefix + " ".join(str(int(x)) for x in adj_m[row]) + f"{os.linesep}" + test.assertRegex(s, rf"{expected_forward}{expected_adjoint}") + + # Booleans + capture = StdOutCapture() + capture.begin() + wp.launch(generic_print_kernel, dim=1, inputs=[True], adj_inputs=[False], adjoint=True, device=device) + wp.synchronize_device(device) + s = capture.end() + + # We skip the win32 comparison for now since the capture sometimes is an empty string + if sys.platform != "win32": + test.assertRegex(s, rf"True{os.linesep}adj: False{os.linesep}") + + # structs, not printable yet + capture = StdOutCapture() + capture.begin() + wp.launch( + generic_print_kernel, dim=1, inputs=[SimpleStruct()], adj_inputs=[SimpleStruct()], adjoint=True, device=device + ) + wp.synchronize_device(device) + s = capture.end() + + # We skip the win32 comparison for now since the capture sometimes is an empty string + if sys.platform != "win32": + test.assertRegex( + s, rf"{os.linesep}adj: {os.linesep}" + ) + + # arrays, not printable + capture = StdOutCapture() + capture.begin() + a = wp.ones(10, dtype=float, device=device) + 
adj_a = wp.zeros(10, dtype=float, device=device) + wp.launch(generic_print_kernel, dim=1, inputs=[a], adj_inputs=[adj_a], adjoint=True, device=device) + wp.synchronize_device(device) + s = capture.end() + + # We skip the win32 comparison for now since the capture sometimes is an empty string + if sys.platform != "win32": + test.assertRegex( + s, rf"{os.linesep}adj: {os.linesep}" + ) + + class TestPrint(unittest.TestCase): pass @@ -134,6 +268,7 @@ class TestPrint(unittest.TestCase): add_function_test(TestPrint, "test_print", test_print, devices=devices, check_output=False) add_function_test(TestPrint, "test_print_numeric", test_print_numeric, devices=devices, check_output=False) add_function_test(TestPrint, "test_print_boolean", test_print_boolean, devices=devices, check_output=False) +add_function_test(TestPrint, "test_print_adjoint", test_print_adjoint, devices=devices, check_output=False) if __name__ == "__main__": diff --git a/warp/tests/test_static.py b/warp/tests/test_static.py index d816af4f..9e3f7393 100644 --- a/warp/tests/test_static.py +++ b/warp/tests/test_static.py @@ -5,6 +5,8 @@ # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. +import importlib +import tempfile import unittest from typing import Dict, List @@ -17,6 +19,23 @@ global_variable = 3 +def load_code_as_module(code, name): + file, file_path = tempfile.mkstemp(suffix=".py") + + try: + with os.fdopen(file, "w") as f: + f.write(code) + + spec = importlib.util.spec_from_file_location(name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + finally: + os.remove(file_path) + + # return Warp module + return wp.get_module(module.__name__) + + @wp.func def static_global_variable_func(): static_var = warp.static(global_variable + 2) @@ -234,7 +253,7 @@ def function_variable_kernel(results: wp.array(dtype=int)): results[0] = wp.static(func)(3, 2) # noqa: B023 results = wp.zeros(1, dtype=int, device=device) - # note that the kernel has to be recompiled everytime the value of func changes + # note that the kernel has to be recompiled every time the value of func changes wp.launch(function_variable_kernel, 1, [results], device=device) assert_np_equal(results.numpy(), np.array([func(3, 2)], dtype=int)) @@ -383,6 +402,140 @@ def static_condition3(results: wp.array(dtype=int)): assert_np_equal(counts["else"], 0) +static_builtin_constant_template = """ +import warp as wp + +# Python builtin literal like 17, 42.0, or True +C = {value} + +@wp.kernel +def k(): + print(wp.static(C)) +""" + +static_warp_constant_template = """ +import warp as wp + +# Warp scalar value like wp.uint8(17) +C = wp.{dtype}({value}) + +@wp.kernel +def k(): + print(wp.static(C)) +""" + +static_struct_constant_template = """ +import warp as wp + +@wp.struct +class SimpleStruct: + x: float + +C = SimpleStruct() +C.x = {value} + +@wp.kernel +def k(): + print(wp.static(C)) +""" + +static_func_template = """ +import warp as wp + +@wp.func +def f(): + # modify the function to verify hashing + return {value} + +@wp.kernel +def k(): + print(wp.static(f)()) +""" + + +def test_static_constant_hash(test, _): + # Python literals + # (type, value1, value2) + literals = [ + (int, 17, 42), + (float, 17.5, 42.5), + (bool, True, False), + ] + + for builtin_type, value1, value2 in literals: + type_name = builtin_type.__name__ + with test.subTest(msg=f"{type_name}"): + source1 = static_builtin_constant_template.format(value=value1) + source2 = 
static_builtin_constant_template.format(value=value2) + source3 = static_builtin_constant_template.format(value=value1) + + module1 = load_code_as_module(source1, f"aux_static_constant_builtin_{type_name}_1") + module2 = load_code_as_module(source2, f"aux_static_constant_builtin_{type_name}_2") + module3 = load_code_as_module(source3, f"aux_static_constant_builtin_{type_name}_3") + + hash1 = module1.hash_module() + hash2 = module2.hash_module() + hash3 = module3.hash_module() + + test.assertNotEqual(hash1, hash2) + test.assertEqual(hash1, hash3) + + # Warp types (scalars, vectors, matrices) + for warp_type in [*wp.types.scalar_types, *wp.types.vector_types]: + type_name = warp_type.__name__ + with test.subTest(msg=f"wp.{type_name}"): + value1 = ", ".join([str(17)] * warp_type._length_) + value2 = ", ".join([str(42)] * warp_type._length_) + source1 = static_warp_constant_template.format(dtype=type_name, value=value1) + source2 = static_warp_constant_template.format(dtype=type_name, value=value2) + source3 = static_warp_constant_template.format(dtype=type_name, value=value1) + + module1 = load_code_as_module(source1, f"aux_static_constant_wp_{type_name}_1") + module2 = load_code_as_module(source2, f"aux_static_constant_wp_{type_name}_2") + module3 = load_code_as_module(source3, f"aux_static_constant_wp_{type_name}_3") + + hash1 = module1.hash_module() + hash2 = module2.hash_module() + hash3 = module3.hash_module() + + test.assertNotEqual(hash1, hash2) + test.assertEqual(hash1, hash3) + + # structs + with test.subTest(msg="struct"): + source1 = static_struct_constant_template.format(value=17) + source2 = static_struct_constant_template.format(value=42) + source3 = static_struct_constant_template.format(value=17) + + module1 = load_code_as_module(source1, "aux_static_constant_struct_1") + module2 = load_code_as_module(source2, "aux_static_constant_struct_2") + module3 = load_code_as_module(source3, "aux_static_constant_struct_3") + + hash1 = module1.hash_module() + hash2 = module2.hash_module() + hash3 = module3.hash_module() + + test.assertNotEqual(hash1, hash2) + test.assertEqual(hash1, hash3) + + +def test_static_function_hash(test, _): + source1 = static_func_template.format(value=17) + source2 = static_func_template.format(value=42) + source3 = static_func_template.format(value=17) + + module1 = load_code_as_module(source1, "aux_static_func1") + module2 = load_code_as_module(source2, "aux_static_func2") + module3 = load_code_as_module(source3, "aux_static_func3") + + hash1 = module1.hash_module() + hash2 = module2.hash_module() + hash3 = module3.hash_module() + + test.assertNotEqual(hash1, hash2) + test.assertEqual(hash1, hash3) + + devices = get_test_devices() @@ -406,6 +559,9 @@ def test_static_python_call(self): add_function_test(TestStatic, "test_static_for_loop", test_static_for_loop, devices=devices) add_function_test(TestStatic, "test_static_if_else_elif", test_static_if_else_elif, devices=devices) +add_function_test(TestStatic, "test_static_constant_hash", test_static_constant_hash, devices=None) +add_function_test(TestStatic, "test_static_function_hash", test_static_function_hash, devices=None) + if __name__ == "__main__": wp.clear_kernel_cache() diff --git a/warp/tests/unittest_suites.py b/warp/tests/unittest_suites.py index 26ccf9a4..2d76557f 100644 --- a/warp/tests/unittest_suites.py +++ b/warp/tests/unittest_suites.py @@ -170,6 +170,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader) from warp.tests.test_sparse import TestSparse from 
warp.tests.test_spatial import TestSpatial from warp.tests.test_special_values import TestSpecialValues + from warp.tests.test_static import TestStatic from warp.tests.test_streams import TestStreams from warp.tests.test_struct import TestStruct from warp.tests.test_tape import TestTape @@ -269,6 +270,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader) TestSparse, TestSpatial, TestSpecialValues, + TestStatic, TestStreams, TestStruct, TestTape, @@ -329,6 +331,7 @@ def kit_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader): from warp.tests.test_rounding import TestRounding from warp.tests.test_runlength_encode import TestRunlengthEncode from warp.tests.test_sparse import TestSparse + from warp.tests.test_static import TestStatic from warp.tests.test_streams import TestStreams from warp.tests.test_tape import TestTape from warp.tests.test_transient_module import TestTransientModule @@ -374,6 +377,7 @@ def kit_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader): TestRounding, TestRunlengthEncode, TestSparse, + TestStatic, TestStreams, TestTape, TestTransientModule, diff --git a/warp/types.py b/warp/types.py index ea9604e4..e9722b1f 100644 --- a/warp/types.py +++ b/warp/types.py @@ -100,8 +100,10 @@ class vec_t(ctypes.Array): if dtype is bool: _type_ = ctypes.c_bool - elif dtype in [Scalar, Float]: + elif dtype in (Scalar, Float): _type_ = ctypes.c_float + elif dtype is Int: + _type_ = ctypes.c_int else: _type_ = dtype._type_ @@ -289,8 +291,10 @@ class mat_t(ctypes.Array): if dtype is bool: _type_ = ctypes.c_bool - elif dtype in [Scalar, Float]: + elif dtype in (Scalar, Float): _type_ = ctypes.c_float + elif dtype is Int: + _type_ = ctypes.c_int else: _type_ = dtype._type_ @@ -1490,7 +1494,11 @@ def types_equal(a, b, match_generic=False): return True - if is_array(a) and type(a) is type(b): + if is_array(a) and type(a) is type(b) and types_equal(a.dtype, b.dtype, match_generic=match_generic): + return True + + # match NewStructInstance and Struct dtype + if getattr(a, "cls", "a") is getattr(b, "cls", "b"): return True # match NewStructInstance and Struct dtype @@ -2266,13 +2274,22 @@ def grad(self, grad): self._requires_grad = False else: # make sure the given gradient array is compatible - if ( - grad.dtype != self.dtype - or grad.shape != self.shape - or grad.strides != self.strides - or grad.device != self.device - ): - raise ValueError("The given gradient array is incompatible") + if grad.dtype != self.dtype: + raise ValueError( + f"The given gradient array is incompatible: expected dtype {self.dtype}, got {grad.dtype}" + ) + if grad.shape != self.shape: + raise ValueError( + f"The given gradient array is incompatible: expected shape {self.shape}, got {grad.shape}" + ) + if grad.device != self.device: + raise ValueError( + f"The given gradient array is incompatible: expected device {self.device}, got {grad.device}" + ) + if grad.strides != self.strides: + raise ValueError( + f"The given gradient array is incompatible: expected strides {self.strides}, got {grad.strides}" + ) self._grad = grad self._requires_grad = True From 96b0d0adc5e764beefe1ddc974121415b6dd1233 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 29 Oct 2024 14:28:45 -0700 Subject: [PATCH 088/102] Fix issues with tile print --- warp/native/tile.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index 6d164d7f..dad774ec 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ 
-768,18 +768,29 @@ void tile_register_t::print() WP_TILE_SYNC(); } -template -inline CUDA_CALLABLE void print(Tile& t) +template +inline CUDA_CALLABLE void print(const tile_register_t& t) { t.print(); } -template -inline CUDA_CALLABLE void adj_print(Tile& t, AdjTile& a) +template +inline CUDA_CALLABLE void adj_print(const tile_register_t& t, const tile_register_t& a) { a.print(); } +template +inline CUDA_CALLABLE void print(const tile_shared_t& t) +{ + t.print(); +} + +template +inline CUDA_CALLABLE void adj_print(const tile_shared_t& t, const tile_shared_t& a) +{ + a.print(); +} // helpers to allocate shared tiles template From 246c9e9fb7567282470fa13c8322e5763d2470a2 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 29 Oct 2024 14:54:35 -0700 Subject: [PATCH 089/102] Experiment with mathdx support pipeline --- .gitlab-ci.yml | 54 ++++----- .gitlab/ci/cuda-11-build-and-test.yml | 2 +- .gitlab/ci/debug-build-and-test.yml | 2 +- .gitlab/ci/mathdx-support.yml | 167 -------------------------- 4 files changed, 26 insertions(+), 199 deletions(-) delete mode 100644 .gitlab/ci/mathdx-support.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 566a12bc..2135f7c2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -62,15 +62,23 @@ stages: linux-aarch64 build: stage: build - image: ubuntu:22.04 + image: ubuntu:20.04 extends: - .save_warp_bin_artifact before_script: - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - apt-get update && apt-get install build-essential curl --no-install-recommends -y + - > + curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/96/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz + -o libmathdx.tar.gz + - mkdir -p _build/target-deps + - tar -xzf libmathdx.tar.gz -C _build/target-deps + - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" + - gcc --version - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - - ./tools/ci/building/build-linux-aarch64/build.sh --no-docker # We are already using the builder image + - ./tools/ci/building/build-linux-aarch64/build.sh --no-docker - mkdir -p warp/bin/linux-aarch64 - mv warp/bin/warp.so warp/bin/linux-aarch64 - mv warp/bin/warp-clang.so warp/bin/linux-aarch64 @@ -79,12 +87,24 @@ linux-aarch64 build: linux-x86_64 build: stage: build - image: urm.nvidia.com/ct-omniverse-docker/centos7-gcc10-builder:3.2.0 + image: ubuntu:20.04 extends: - .save_warp_bin_artifact - .runner-build-linux-x86_64 + before_script: + - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" + - apt-get update && apt-get install build-essential curl --no-install-recommends -y + - > + curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/96/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz + -o libmathdx.tar.gz + - mkdir -p _build/target-deps + - tar -xzf libmathdx.tar.gz -C _build/target-deps + - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" + - gcc --version + - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" script: - - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image + - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker - mkdir -p warp/bin/linux-x86_64 - 
mv warp/bin/warp.so warp/bin/linux-x86_64 - mv warp/bin/warp-clang.so warp/bin/linux-x86_64 @@ -502,32 +522,6 @@ debug build and test: extends: - .trigger_common -trigger mathdx support pipeline: - stage: test - image: busybox - extends: - - .runner-utility-linux-x86_64 - needs: [] - rules: - - if: $CI_PIPELINE_SOURCE == "schedule" - - if: $CI_COMMIT_TAG - - if: $CI_COMMIT_BRANCH =~ /^release-.*/ - - when: manual # Can be triggered in all other scenarios - allow_failure: true - variables: - GIT_STRATEGY: none - script: - - echo "Run this job to test Warp compiled with mathdx support." - -# Uses the same Python version as the main pipeline. -mathdx support: - stage: child pipelines - needs: [trigger mathdx support pipeline] - trigger: - include: /.gitlab/ci/mathdx-support.yml - extends: - - .trigger_common - # Trigger CUDA 11 pipelines # Workaround from https://gitlab.com/gitlab-org/gitlab/-/issues/284086 trigger cuda 11 pipeline: diff --git a/.gitlab/ci/cuda-11-build-and-test.yml b/.gitlab/ci/cuda-11-build-and-test.yml index 3f5cd25d..2276aafd 100644 --- a/.gitlab/ci/cuda-11-build-and-test.yml +++ b/.gitlab/ci/cuda-11-build-and-test.yml @@ -45,7 +45,7 @@ stages: linux-aarch64 build: stage: build - image: ubuntu:22.04 + image: ubuntu:20.04 extends: - .save_warp_bin_artifact before_script: diff --git a/.gitlab/ci/debug-build-and-test.yml b/.gitlab/ci/debug-build-and-test.yml index d028af2e..ca389d9d 100644 --- a/.gitlab/ci/debug-build-and-test.yml +++ b/.gitlab/ci/debug-build-and-test.yml @@ -35,7 +35,7 @@ stages: # Hide this job for now until debug aarch64 builds work .linux-aarch64 build: stage: build - image: ubuntu:22.04 + image: ubuntu:20.04 extends: - .save_warp_bin_artifact before_script: diff --git a/.gitlab/ci/mathdx-support.yml b/.gitlab/ci/mathdx-support.yml deleted file mode 100644 index bc711297..00000000 --- a/.gitlab/ci/mathdx-support.yml +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. -# NVIDIA CORPORATION and its licensors retain all intellectual property -# and proprietary rights in and to this software, related documentation -# and any modifications thereto. Any use, reproduction, disclosure or -# distribution of this software and related documentation without an express -# license agreement from NVIDIA CORPORATION is strictly prohibited. 
- -# ============================================================================== -# CI/CD Pipeline Configuration -# ============================================================================== - -include: /.gitlab/ci/common.yml - -workflow: - rules: - - if: $CI_PIPELINE_SOURCE == "parent_pipeline" - -stages: - - build - - test - - package - - deploy - -# ============================================================================== -# Build Jobs (Release) -# ============================================================================== - -linux-x86_64 build: - stage: build - image: ubuntu:20.04 - extends: - - .save_warp_bin_artifact - - .runner-build-linux-x86_64 - before_script: - - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - - apt-get update && apt-get install build-essential curl --no-install-recommends -y - - > - curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/93/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz - -o libmathdx.tar.gz - - mkdir -p _build/target-deps - - tar -xzf libmathdx.tar.gz -C _build/target-deps - - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" - - gcc --version - - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" - script: - - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image - - mkdir -p warp/bin/linux-x86_64 - - mv warp/bin/warp.so warp/bin/linux-x86_64 - - mv warp/bin/warp-clang.so warp/bin/linux-x86_64 - -linux-aarch64 build: - stage: build - image: ubuntu:20.04 - extends: - - .save_warp_bin_artifact - before_script: - - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - - apt-get update && apt-get install build-essential curl --no-install-recommends -y - - > - curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/93/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz - -o libmathdx.tar.gz - - mkdir -p _build/target-deps - - tar -xzf libmathdx.tar.gz -C _build/target-deps - - export LIBMATHDX_HOME="$CI_PROJECT_DIR/_build/target-deps/libmathdx-0.0.1-Linux" - - gcc --version - - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" - script: - - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image - - mkdir -p warp/bin/linux-aarch64 - - mv warp/bin/warp.so warp/bin/linux-aarch64 - - mv warp/bin/warp-clang.so warp/bin/linux-aarch64 - tags: - - arch/arm - -# ============================================================================== -# Unit Testing Jobs (MathDx Support) -# -# Unlike the main testing jobs defined in /.gitlab-ci.yml, the jobs don't -# generate code coverage reports. 
-# ============================================================================== - -linux-x86_64 test: - stage: test - needs: [linux-x86_64 build] - extends: - - .omni_nvks_gpu_2x - - .save_test_report_artifact - before_script: - - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - - df -h - # Move compiled binaries out of platform-specific directory - - mv warp/bin/linux-x86_64/warp.so warp/bin/ - - mv warp/bin/linux-x86_64/warp-clang.so warp/bin/ - - tools/packman/packman install -l _build/target-deps/python python ${DEFAULT_PYTHON}-linux-x86_64 - - export PATH="$CUDA_BIN:$PATH" - - $PYTHON -m venv _venv - - source _venv/bin/activate - - python -m pip install --upgrade pip - - python -m pip install --upgrade usd-core - - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121 - - python -m pip install -U "jax[cuda12]" - - python -m pip install -e . - - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" - # HACK: disable P2P tests due to misbehaving agents - - export WARP_DISABLE_P2P_TESTS=1 - script: - - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast - -linux-aarch64 test jetson: - image: ubuntu:22.04 - needs: [linux-aarch64 build] - extends: - - .save_test_report_artifact - before_script: - - echo -e "\\e[0Ksection_start:`date +%s`:install_dependencies[collapsed=true]\\r\\e[0KInstalling dependencies" - - !reference [.snippets, install-python+warp-aarch64] - - python -m pip install -U "jax[cuda12]" - - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" - script: - - python -m warp.tests --junit-report-xml rspec.xml -s autodetect --failfast - tags: - - gpu/orin - -# ============================================================================== -# Packaging Jobs -# ============================================================================== - -# Creates wheel files for PyPI -create pypi wheels: - stage: package - needs: - - linux-aarch64 build - - linux-x86_64 build - extends: - - .runner-utility-linux-x86_64 - before_script: - - python3 -m pip install --upgrade pip - - python3 -m pip install build - script: - - sed -i "s/^\(.*\)$/\1+tile/" VERSION.md # Modify VERSION.md with +tile - - python3 -m build --wheel -C--build-option=-Plinux-x86_64 - - python3 -m build --wheel -C--build-option=-Plinux-aarch64 - - find . -type f -exec chmod 664 {} + - - find . 
-type d -exec chmod 775 {} + - artifacts: - name: $CI_COMMIT_REF_SLUG-$CI_COMMIT_SHORT_SHA - expose_as: "Python Wheels MathDx" - paths: - - "dist/" - when: always - -publish wheels to gitlab pypi registry: - stage: deploy - image: python:3.11-slim - needs: ["create pypi wheels"] - extends: - - .runner-utility-linux-x86_64 - rules: - - when: manual - allow_failure: true - before_script: - - python3 -m pip install --upgrade pip - - python3 -m pip install --upgrade build twine - script: - - TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python3 -m twine upload --verbose --skip-existing --non-interactive --repository-url ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi dist/* From 548cb9f1356b687a59afeeef8143bdcae65f224c Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Tue, 29 Oct 2024 21:07:25 -0700 Subject: [PATCH 090/102] Fix merge issue that broke test_tile_mlp --- warp/codegen.py | 14 -------------- warp/tests/test_tile_mlp.py | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/warp/codegen.py b/warp/codegen.py index 000ea4d5..51c98c72 100644 --- a/warp/codegen.py +++ b/warp/codegen.py @@ -1340,10 +1340,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): if return_type is None: # handles expression (zero output) functions, e.g.: void do_something(); - - output = None - output_list = [] - forward_call = ( f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" ) @@ -1353,12 +1349,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): elif not isinstance(return_type, Sequence) or len(return_type) == 1: # handle simple function (one output) - - if isinstance(return_type, Sequence): - return_type = return_type[0] - output = adj.add_var(return_type) - output_list = [output] - forward_call = f"var_{output} = {func.namespace}{func_name}({adj.format_forward_call_args(fwd_args, use_initializer_list)});" replay_call = forward_call if func.custom_replay_func is not None: @@ -1366,10 +1356,6 @@ def add_call(adj, func, args, kwargs, type_args, min_outputs=None): else: # handle multiple value functions - - output = [adj.add_var(v) for v in return_type] - output_list = output - forward_call = ( f"{func.namespace}{func_name}({adj.format_forward_call_args(fwd_args + output, use_initializer_list)});" ) diff --git a/warp/tests/test_tile_mlp.py b/warp/tests/test_tile_mlp.py index 89fcf052..9ae760f4 100644 --- a/warp/tests/test_tile_mlp.py +++ b/warp/tests/test_tile_mlp.py @@ -391,5 +391,5 @@ class TestTileMLP(unittest.TestCase): if __name__ == "__main__": - # wp.clear_kernel_cache() + wp.clear_kernel_cache() unittest.main(verbosity=2, failfast=True) From 4d6444bd51cbab7f09c656f710f8be50a86ba646 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 12:26:01 -0700 Subject: [PATCH 091/102] Relocate tile examples --- {examples => warp/examples/tile}/tile_fft.py | 0 {examples => warp/examples/tile}/tile_matmul.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {examples => warp/examples/tile}/tile_fft.py (100%) rename {examples => warp/examples/tile}/tile_matmul.py (100%) diff --git a/examples/tile_fft.py b/warp/examples/tile/tile_fft.py similarity index 100% rename from examples/tile_fft.py rename to warp/examples/tile/tile_fft.py diff --git a/examples/tile_matmul.py b/warp/examples/tile/tile_matmul.py similarity index 100% rename from examples/tile_matmul.py rename to warp/examples/tile/tile_matmul.py From b7962f824b6433f71bdba89670725e0ceba9b13d Mon Sep 17 00:00:00 2001 From: 
Eric Shi Date: Wed, 30 Oct 2024 12:53:52 -0700 Subject: [PATCH 092/102] Add license for Pixel the Cat image --- licenses/assets/pixel-LICENSE.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 licenses/assets/pixel-LICENSE.txt diff --git a/licenses/assets/pixel-LICENSE.txt b/licenses/assets/pixel-LICENSE.txt new file mode 100644 index 00000000..b01f22c5 --- /dev/null +++ b/licenses/assets/pixel-LICENSE.txt @@ -0,0 +1,3 @@ +Pixel the Cat (pixel.jpg) (c) 2020 by Alison Wawrzyniak is licensed under CC BY 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/ + +Resized from original. From 24960d7b873e602233db7bd4f42236cd1b843d0b Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 12:55:33 -0700 Subject: [PATCH 093/102] Fix CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fae03b44..24987e0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [1.4.0] - 2024-10-01 +## [Unreleased] - 2024-?? ### Added From 91f634ab1960ca5e842ee30037e7f529d0602f2d Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 13:01:09 -0700 Subject: [PATCH 094/102] Don't install mathdx in build.sh --- tools/ci/building/build-linux-x86_64/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci/building/build-linux-x86_64/build.sh b/tools/ci/building/build-linux-x86_64/build.sh index e9af605d..51940183 100755 --- a/tools/ci/building/build-linux-x86_64/build.sh +++ b/tools/ci/building/build-linux-x86_64/build.sh @@ -74,7 +74,7 @@ CUDA="$SCRIPT_DIR/../../../../_build/target-deps/cuda" # pip deps $PYTHON -m pip install --upgrade pip -$PYTHON -m pip install --upgrade numpy gitpython cmake ninja nvidia-mathdx==24.4.0 +$PYTHON -m pip install --upgrade numpy gitpython cmake ninja if [ "$GITLAB_CI" = "true" ]; then echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K" From 4ee746f1761608d06de59c40e38b1f537ce9da41 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 13:15:44 -0700 Subject: [PATCH 095/102] Merge tile_matmul examples --- warp/examples/tile/example_tile_matmul.py | 8 +++--- warp/examples/tile/tile_matmul.py | 34 ----------------------- 2 files changed, 4 insertions(+), 38 deletions(-) delete mode 100644 warp/examples/tile/tile_matmul.py diff --git a/warp/examples/tile/example_tile_matmul.py b/warp/examples/tile/example_tile_matmul.py index b8ee510c..b795b35a 100644 --- a/warp/examples/tile/example_tile_matmul.py +++ b/warp/examples/tile/example_tile_matmul.py @@ -26,11 +26,11 @@ @wp.kernel -def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)): +def tile_gemm(A: wp.array2d(dtype=wp.float32), B: wp.array2d(dtype=wp.float16), C: wp.array2d(dtype=wp.float64)): # output tile index i, j = wp.tid() - sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32) + sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64) _M = A.shape[0] _N = B.shape[1] @@ -58,8 +58,8 @@ def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.arra rng = np.random.default_rng(42) A = rng.random((M, K), dtype=np.float32) - B = rng.random((K, N), dtype=np.float32) - C = np.zeros((M, N), dtype=np.float32) + B = rng.random((K, N), dtype=np.float32).astype(np.float16) + C = np.zeros((M, N), dtype=np.float64) A_wp = wp.array(A, requires_grad=True) B_wp = wp.array(B, requires_grad=True) diff --git a/warp/examples/tile/tile_matmul.py b/warp/examples/tile/tile_matmul.py deleted 
file mode 100644 index 57b94bbc..00000000 --- a/warp/examples/tile/tile_matmul.py +++ /dev/null @@ -1,34 +0,0 @@ -import numpy as np - -import warp as wp - -wp.init() -wp.build.clear_kernel_cache() - -BLOCK_DIM = 32 -M, N, K = 4, 8, 16 - - -@wp.kernel -def matmul_tiled(ga: wp.array2d(dtype=wp.float32), gb: wp.array2d(dtype=wp.float16), gc: wp.array2d(dtype=wp.float64)): - i, j, _ = wp.tid() - a = wp.tile_load(ga, i, j, m=M, n=K) - b = wp.tile_load(gb, i, j, m=K, n=N) - c = wp.tile_zeros(m=M, n=N, dtype=wp.float64) - wp.tile_matmul(a, b, c) - wp.tile_store(gc, i, j, c) - - -A = np.ones((M, K), dtype=np.float32) -B = 3 * np.ones((K, N), dtype=np.float16) -C = np.zeros((M, N), dtype=np.float64) - -A_wp = wp.array2d(A, dtype=wp.float32) -B_wp = wp.array2d(B, dtype=wp.float16) -C_wp = wp.array2d(C, dtype=wp.float64) - -wp.launch(matmul_tiled, dim=[1, 1, BLOCK_DIM], inputs=[A_wp, B_wp, C_wp], block_dim=BLOCK_DIM) -wp.synchronize() - -print("inputs:\n", A, "\n", B) -print("output (should be = 48 * np.ones(4, 8)):\n", C_wp) From 5304a66d13105a02b1a9f9db389fd2296e8aaf6a Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 15:19:30 -0700 Subject: [PATCH 096/102] Rename example, add headers --- warp/examples/benchmarks/benchmark_tile.py | 7 +++++++ warp/examples/tile/{tile_fft.py => example_tile_fft.py} | 0 warp/examples/tile/example_tile_matmul.py | 6 +++++- 3 files changed, 12 insertions(+), 1 deletion(-) rename warp/examples/tile/{tile_fft.py => example_tile_fft.py} (100%) diff --git a/warp/examples/benchmarks/benchmark_tile.py b/warp/examples/benchmarks/benchmark_tile.py index 54fec3f9..051aaf1c 100644 --- a/warp/examples/benchmarks/benchmark_tile.py +++ b/warp/examples/benchmarks/benchmark_tile.py @@ -1,3 +1,10 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ import numpy as np import torch diff --git a/warp/examples/tile/tile_fft.py b/warp/examples/tile/example_tile_fft.py similarity index 100% rename from warp/examples/tile/tile_fft.py rename to warp/examples/tile/example_tile_fft.py diff --git a/warp/examples/tile/example_tile_matmul.py b/warp/examples/tile/example_tile_matmul.py index b795b35a..a275c820 100644 --- a/warp/examples/tile/example_tile_matmul.py +++ b/warp/examples/tile/example_tile_matmul.py @@ -67,7 +67,11 @@ def tile_gemm(A: wp.array2d(dtype=wp.float32), B: wp.array2d(dtype=wp.float16), with wp.Tape() as tape: wp.launch_tiled( - tile_gemm, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], block_dim=TILE_THREADS + tile_gemm, + dim=(int(M / TILE_M), int(N / TILE_N)), + inputs=[A_wp, B_wp], + outputs=[C_wp], + block_dim=TILE_THREADS, ) assert np.allclose(C_wp.numpy(), A @ B) From 8fe55f551ac211a0b9ef30ec2f80ad03b7d0d14a Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 15:26:17 -0700 Subject: [PATCH 097/102] Rename example, add copyright headers --- warp/examples/tile/example_tile_fft.py | 36 ++++++++++++++++++-------- warp/native/tile.h | 8 ++++++ warp/native/tile_gemm.h | 8 ++++++ warp/native/tile_reduce.h | 10 ++++++- 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/warp/examples/tile/example_tile_fft.py b/warp/examples/tile/example_tile_fft.py index f47e0b4a..2ad87fc0 100644 --- a/warp/examples/tile/example_tile_fft.py +++ b/warp/examples/tile/example_tile_fft.py @@ -1,11 +1,22 @@ +# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +########################################################################### +# Example Tile FFT +# +# Shows how to write a simple FFT kernel using Warp tile primitives. +# +########################################################################### + import numpy as np import warp as wp -wp.init() wp.set_module_options({"enable_backward": False}) -wp.set_device("cuda:0") -wp.build.clear_kernel_cache() BLOCK_DIM = 8 TILE_M = 1 @@ -21,13 +32,16 @@ def fft_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)): wp.tile_store(y, i, j, a) -x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64) -x_h[:, :, 1] = 0 -y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64) -x_wp = wp.array2d(x_h, dtype=wp.vec2d) -y_wp = wp.array2d(y_h, dtype=wp.vec2d) +if __name__ == "__main__": + wp.set_device("cuda:0") + + x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64) + x_h[:, :, 1] = 0 + y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64) + x_wp = wp.array2d(x_h, dtype=wp.vec2d) + y_wp = wp.array2d(y_h, dtype=wp.vec2d) -wp.launch(fft_tiled, dim=[1, 1, BLOCK_DIM], inputs=[x_wp, y_wp], block_dim=BLOCK_DIM) + wp.launch_tiled(fft_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM) -print("inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...] -print("output:\n", y_wp) # [32+0i, 0, 0, ...] + print("Inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...] + print("Output:\n", y_wp) # [32+0i, 0, 0, ...] 
diff --git a/warp/native/tile.h b/warp/native/tile.h index dad774ec..e5b48a9d 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -1,3 +1,11 @@ +/** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + #pragma once #include "builtin.h" diff --git a/warp/native/tile_gemm.h b/warp/native/tile_gemm.h index c033330a..2ab0fe40 100644 --- a/warp/native/tile_gemm.h +++ b/warp/native/tile_gemm.h @@ -1,3 +1,11 @@ +/** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + #pragma once #include "builtin.h" diff --git a/warp/native/tile_reduce.h b/warp/native/tile_reduce.h index 3b5da6d9..67d0e5c9 100644 --- a/warp/native/tile_reduce.h +++ b/warp/native/tile_reduce.h @@ -1,3 +1,11 @@ +/** Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + #pragma once #include "tile.h" @@ -202,4 +210,4 @@ void adj_tile_min(Tile& t, Tile& adj_t, AdjTile& adj_ret) -} // namespace wp \ No newline at end of file +} // namespace wp From 59ed9f2b19ab5675eaf73f77f5b46b20b3b75fb4 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 15:55:19 -0700 Subject: [PATCH 098/102] Update CI scripts --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e6a26363..17833f4c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -70,7 +70,7 @@ linux-aarch64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/96/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/99/libmathdx_build_aarch64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps @@ -96,7 +96,7 @@ linux-x86_64 build: - apt-get update && apt-get install build-essential curl --no-install-recommends -y - > curl -k -H "Authorization: Bearer $ARTIFACTORY_ACCESS_TOKEN" - $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/96/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz + $ARTIFACTORY_BASE_URL/sw-cuda-math-mathdx-generic-local/cicd/libmathdx/main/PostMerge/99/libmathdx_build_x86_64_rockylinux8_cuda12.0.0_release.tar.gz -o libmathdx.tar.gz - mkdir -p _build/target-deps - tar -xzf libmathdx.tar.gz -C _build/target-deps From a13a44edaf864513493992d99583adc67b44ffb7 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Wed, 30 Oct 2024 16:39:23 -0700 Subject: [PATCH 099/102] More FFT tile tests --- warp/tests/test_tile_mathdx.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index 2c8d7180..c441a9f3 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -8,6 +8,7 @@ import unittest import numpy as np +import functools import warp as wp from warp.tests.unittest_utils import * @@ -18,8 +19,6 @@ TILE_N = wp.constant(4) TILE_K = wp.constant(8) -N_FFT = wp.constant(128) - # num threads per-tile TILE_DIM = 64 @@ -67,33 +66,36 @@ def test_tile_math_matmul(test, device): assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C, tol=1e-2) -@wp.kernel() -def tile_math_fft_kernel(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)): - i, j = wp.tid() - xy = wp.tile_load(gx, i, j, m=N_FFT, n=N_FFT) - wp.tile_fft(xy) - wp.tile_store(gy, i, j, xy) +def test_tile_math_fft(test, device, wp_dtype, fft_size): + np_real_dtype = {wp.vec2f: np.float32, wp.vec2d: np.float64}[wp_dtype] + np_cplx_dtype = {wp.vec2f: np.complex64, wp.vec2d: np.complex128}[wp_dtype] -def test_tile_math_fft(test, device): + @wp.kernel() + def tile_math_fft_kernel(gx: wp.array2d(dtype=wp_dtype), gy: wp.array2d(dtype=wp_dtype)): + i, j = wp.tid() + xy = wp.tile_load(gx, i, j, m=fft_size, n=fft_size) + wp.tile_fft(xy) + wp.tile_store(gy, i, j, xy) + rng = np.random.default_rng(42) # Warp doesn't really have a complex64 type, # so we use 2 float32 to represent a single complex64 number and then convert it to vec2f - X = rng.random((N_FFT, 2 * N_FFT), dtype=np.float32) + X = rng.random((fft_size, 2 * fft_size), 
dtype=np_real_dtype) Y = np.zeros_like(X) - X_wp = wp.array2d(X, requires_grad=True, dtype=wp.vec2f, device=device) - Y_wp = wp.array2d(Y, requires_grad=True, dtype=wp.vec2f, device=device) + X_wp = wp.array2d(X, requires_grad=True, dtype=wp_dtype, device=device) + Y_wp = wp.array2d(Y, requires_grad=True, dtype=wp_dtype, device=device) - X_c64 = X.view(np.complex64).reshape(N_FFT, N_FFT) + X_c64 = X.view(np_cplx_dtype).reshape(fft_size, fft_size) Y_c64 = np.fft.fft(X_c64, axis=-1) with wp.Tape() as tape: wp.launch_tiled(tile_math_fft_kernel, dim=[1, 1], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device) - Y_wp_c64 = Y_wp.numpy().view(np.complex64).reshape(N_FFT, N_FFT) + Y_wp_c64 = Y_wp.numpy().view(np_cplx_dtype).reshape(fft_size, fft_size) assert_np_equal(Y_wp_c64, Y_c64, tol=1.0e-4) @@ -109,7 +111,8 @@ class TestTileMathDx(unittest.TestCase): add_function_test(TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=devices) -add_function_test(TestTileMathDx, "test_tile_math_fft", test_tile_math_fft, devices=devices) +add_function_test(TestTileMathDx, "test_tile_math_fft", functools.partial(test_tile_math_fft, wp_dtype=wp.vec2f, fft_size=wp.constant(128)), devices=devices) +add_function_test(TestTileMathDx, "test_tile_math_fft", functools.partial(test_tile_math_fft, wp_dtype=wp.vec2d, fft_size=wp.constant(256)), devices=devices) if __name__ == "__main__": wp.clear_kernel_cache() From c52e54f1dddaaf0ea64eea9414ef1e011b49fd00 Mon Sep 17 00:00:00 2001 From: Leopold Cambier Date: Wed, 30 Oct 2024 20:06:34 -0700 Subject: [PATCH 100/102] Fix FFT alignment --- warp/native/tile.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/warp/native/tile.h b/warp/native/tile.h index e5b48a9d..9c896d10 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -1390,13 +1390,18 @@ void adj_tile_matmul(Fwd fun_forward, AdjA fun_backward_A, AdjB fun_backward_B, WP_TILE_SYNC(); } - +// TODO(lcambier): use a properly overaligned complex type that matches cuFFTDx's expectation +// TODO(lcambier): use dynamic smem #define tile_fft(function_name, dtype, shared_memory_size, batch_size, ept, Xinout) \ do { \ void function_name(dtype*, dtype*); \ WP_TILE_SHARED __align__(16) char buffer[shared_memory_size]; \ + __align__(16) dtype data[ept]; \ for(int b = 0; b < (int)batch_size; b++) { \ - function_name(Xinout.data + (int)b * (int)ept, (dtype*)buffer); \ + dtype* inout = Xinout.data + (int)b * (int)ept; \ + memcpy(data, inout, sizeof(dtype) * ept); \ + function_name(data, (dtype*)buffer); \ + memcpy(inout, data, sizeof(dtype) * ept); \ WP_TILE_SYNC(); \ } \ } while (0) From 389e8592331698fbb96a7d3ae49cad75a3439b8b Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 21:06:28 -0700 Subject: [PATCH 101/102] Fix Ruff issues --- warp/tests/test_tile_mathdx.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index c441a9f3..b5e3bb2b 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -5,10 +5,10 @@ # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. 
+import functools import unittest import numpy as np -import functools import warp as wp from warp.tests.unittest_utils import * @@ -67,7 +67,6 @@ def test_tile_math_matmul(test, device): def test_tile_math_fft(test, device, wp_dtype, fft_size): - np_real_dtype = {wp.vec2f: np.float32, wp.vec2d: np.float64}[wp_dtype] np_cplx_dtype = {wp.vec2f: np.complex64, wp.vec2d: np.complex128}[wp_dtype] @@ -77,7 +76,7 @@ def tile_math_fft_kernel(gx: wp.array2d(dtype=wp_dtype), gy: wp.array2d(dtype=wp xy = wp.tile_load(gx, i, j, m=fft_size, n=fft_size) wp.tile_fft(xy) wp.tile_store(gy, i, j, xy) - + rng = np.random.default_rng(42) # Warp doesn't really have a complex64 type, @@ -111,9 +110,19 @@ class TestTileMathDx(unittest.TestCase): add_function_test(TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=devices) -add_function_test(TestTileMathDx, "test_tile_math_fft", functools.partial(test_tile_math_fft, wp_dtype=wp.vec2f, fft_size=wp.constant(128)), devices=devices) -add_function_test(TestTileMathDx, "test_tile_math_fft", functools.partial(test_tile_math_fft, wp_dtype=wp.vec2d, fft_size=wp.constant(256)), devices=devices) +add_function_test( + TestTileMathDx, + "test_tile_math_fft", + functools.partial(test_tile_math_fft, wp_dtype=wp.vec2f, fft_size=wp.constant(128)), + devices=devices, +) +add_function_test( + TestTileMathDx, + "test_tile_math_fft", + functools.partial(test_tile_math_fft, wp_dtype=wp.vec2d, fft_size=wp.constant(256)), + devices=devices, +) if __name__ == "__main__": wp.clear_kernel_cache() - unittest.main(verbosity=2) + unittest.main(verbosity=2) \ No newline at end of file From e0fc988db319f7ad00986c951e70ac42ca94cac6 Mon Sep 17 00:00:00 2001 From: Eric Shi Date: Wed, 30 Oct 2024 21:44:09 -0700 Subject: [PATCH 102/102] Add a trailing newline to appease Ruff --- warp/tests/test_tile_mathdx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warp/tests/test_tile_mathdx.py b/warp/tests/test_tile_mathdx.py index b5e3bb2b..31fc32a7 100644 --- a/warp/tests/test_tile_mathdx.py +++ b/warp/tests/test_tile_mathdx.py @@ -125,4 +125,4 @@ class TestTileMathDx(unittest.TestCase): if __name__ == "__main__": wp.clear_kernel_cache() - unittest.main(verbosity=2) \ No newline at end of file + unittest.main(verbosity=2)
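The FFT tests and examples in this series pack complex values as pairs of floats (wp.vec2f / wp.vec2d) and then reinterpret the same buffer as NumPy complex64 / complex128 to compare against np.fft. The following is a minimal NumPy-only sketch of that packing convention, not part of the patches themselves; the 4x4 size is an arbitrary choice for illustration.

import numpy as np

# each complex number is stored as two adjacent float32 values (real, imag),
# matching how a wp.vec2f element is laid out in the tile FFT tests above
n = 4
x = np.zeros((n, 2 * n), dtype=np.float32)
x[:, 0::2] = 1.0  # real parts
x[:, 1::2] = 0.0  # imaginary parts

# viewing the (n, 2n) float32 buffer as complex64 yields an (n, n) complex matrix
x_c = x.view(np.complex64).reshape(n, n)
y_c = np.fft.fft(x_c, axis=-1)
print(y_c[0])  # an all-ones row transforms to [n+0j, 0, 0, ...]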