diff --git a/examples/backend_comparisons/small_example.py b/examples/backend_comparisons/small_example.py
new file mode 100644
index 0000000..23ca639
--- /dev/null
+++ b/examples/backend_comparisons/small_example.py
@@ -0,0 +1,300 @@
+# Simple example of functions to generate a warp kernel for LBM
+
+import warp as wp
+import numpy as np
+
+# Initialize Warp
+wp.init()
+
+def make_warp_kernel(
+    velocity_weight,
+    velocity_set,
+    dtype=wp.float32,
+    dim=3, # slightly hard coded for 3d right now
+    q=19,
+):
+
+    # Make needed vector classes
+    lattice_vec = wp.vec(q, dtype=dtype)
+    velocity_vec = wp.vec(dim, dtype=dtype)
+
+    # Make array type
+    if dim == 2:
+        array_type = wp.array3d(dtype=dtype)
+    elif dim == 3:
+        array_type = wp.array4d(dtype=dtype)
+
+    # Make everything constant
+    velocity_weight = wp.constant(velocity_weight)
+    velocity_set = wp.constant(velocity_set)
+    q = wp.constant(q)
+    dim = wp.constant(dim)
+
+    # Make function for computing exu
+    @wp.func
+    def compute_exu(u: velocity_vec):
+        exu = lattice_vec()
+        for _ in range(q):
+            for d in range(dim):
+                if velocity_set[_, d] == 1:
+                    exu[_] += u[d]
+                elif velocity_set[_, d] == -1:
+                    exu[_] -= u[d]
+        return exu
+
+    # Make function for computing feq
+    @wp.func
+    def compute_feq(
+        p: dtype,
+        uxu: dtype,
+        exu: lattice_vec,
+    ):
+        factor_1 = 1.5
+        factor_2 = 4.5
+        feq = lattice_vec()
+        for _ in range(q):
+            feq[_] = (
+                velocity_weight[_] * p * (
+                    1.0
+                    + factor_1 * (2.0 * exu[_] - uxu)
+                    + factor_2 * exu[_] * exu[_]
+                )
+            )
+        return feq
+
+    # Make function for computing u and p
+    @wp.func
+    def compute_u_and_p(f: lattice_vec):
+        p = wp.float32(0.0)
+        u = velocity_vec()
+        for d in range(dim):
+            u[d] = wp.float32(0.0)
+        for _ in range(q):
+            p += f[_]
+            for d in range(dim):
+                if velocity_set[_, d] == 1:
+                    u[d] += f[_]
+                elif velocity_set[_, d] == -1:
+                    u[d] -= f[_]
+        u /= p
+        return u, p
+
+    # Make function for getting stream index
+    @wp.func
+    def get_streamed_index(
+            i: int,
+            x: int,
+            y: int,
+            z: int,
+            width: int,
+            height: int,
+            length: int,
+    ):
+        streamed_x = x + velocity_set[i, 0]
+        streamed_y = y + velocity_set[i, 1]
+        streamed_z = z + velocity_set[i, 2]
+        if streamed_x == -1: # TODO hacky
+            streamed_x = width - 1
+        if streamed_y == -1:
+            streamed_y = height - 1
+        if streamed_z == -1:
+            streamed_z = length - 1
+        if streamed_x == width:
+            streamed_x = 0
+        if streamed_y == height:
+            streamed_y = 0
+        if streamed_z == length:
+            streamed_z = 0
+        return streamed_x, streamed_y, streamed_z
+
+    # Make kernel for stream and collide
+    @wp.kernel
+    def collide_stream(
+        f0: array_type,
+        f1: array_type,
+        width: int,
+        height: int,
+        length: int,
+        tau: float,
+    ):
+
+        # Get indices (TODO: no good way to do variable dimension indexing)
+        f = lattice_vec()
+        x, y, z = wp.tid()
+        for i in range(q):
+            f[i] = f0[i, x, y, z]
+
+        # Compute p and u
+        u, p = compute_u_and_p(f)
+
+        # get uxu
+        uxu = wp.dot(u, u)
+
+        # Compute velocity_set dot u
+        exu = compute_exu(u)
+
+        # Compute equilibrium
+        feq = compute_feq(p, uxu, exu)
+
+        # Set value
+        new_f = f - (f - feq) / tau
+        for i in range(q):
+            (streamed_x, streamed_y, streamed_z) = get_streamed_index(
+                i, x, y, z, width, height, length
+            )
+            f1[i, streamed_x, streamed_y, streamed_z] = new_f[i]
+
+    # make kernel for initialization
+    @wp.kernel
+    def initialize_taylor_green(
+        f0: array_type,
+        dx: float,
+        vel: float,
+        width: int,
+        height: int,
+        length: int,
+        tau: float,
+    ):
+
+        # Get indices (TODO: no good way to do variable dimension indexing)
+        i, j, k = wp.tid()
+
+        # Get real coordinates
+        x = wp.float(i) * dx
+        y = wp.float(j) * dx
+        z = wp.float(k) * dx
+
+        # Compute velocity
+        u = velocity_vec()
+        u[0] = vel * wp.sin(x) * wp.cos(y) * wp.cos(z)
+        u[1] = -vel * wp.cos(x) * wp.sin(y) * wp.cos(z)
+        u[2] = 0.0
+
+        # Compute p
+        p = (
+            3.0
+            * vel
+            * vel
+            * (1.0 / 16.0)
+            * (
+                wp.cos(2.0 * x)
+                + wp.cos(2.0 * y)
+                + wp.cos(2.0 * z)
+            )
+            + 1.0
+        )
+
+        # Compute uxu
+        uxu = wp.dot(u, u)
+
+        # Compute velocity_set dot u
+        exu = compute_exu(u)
+
+        # Compute equilibrium
+        feq = compute_feq(p, uxu, exu)
+
+        # Set value
+        for _ in range(q):
+            f0[_, i, j, k] = feq[_]
+
+    return collide_stream, initialize_taylor_green
+
+def plt_f(f):
+    import matplotlib.pyplot as plt
+    plt.imshow(f.numpy()[3, :, :, f.shape[3] // 4])
+    plt.show()
+
+if __name__ == "__main__":
+
+    # Parameters
+    n = 256
+    tau = 0.505
+    dim = 3
+    q = 19
+    lattice_dtype = wp.float32
+    lattice_vec = wp.vec(q, dtype=lattice_dtype)
+
+    # Make arrays
+    f0 = wp.empty((q, n, n, n), dtype=lattice_dtype, device="cuda:0")
+    f1 = wp.empty((q, n, n, n), dtype=lattice_dtype, device="cuda:0")
+
+    # Make velocity set
+    velocity_weight = wp.vec(q, dtype=lattice_dtype)(
+        [1.0/3.0] + [1.0/18.0] * 6 + [1.0/36.0] * 12
+    )
+    velocity_set = wp.mat((q, dim), dtype=wp.int32)(
+        [
+            [0, 0, 0],
+            [1, 0, 0],
+            [-1, 0, 0],
+            [0, 1, 0],
+            [0, -1, 0],
+            [0, 0, 1],
+            [0, 0, -1],
+            [0, 1, 1],
+            [0, -1, -1],
+            [0, 1, -1],
+            [0, -1, 1],
+            [1, 0, 1],
+            [-1, 0, -1],
+            [1, 0, -1],
+            [-1, 0, 1],
+            [1, 1, 0],
+            [-1, -1, 0],
+            [1, -1, 0],
+            [-1, 1, 0],
+        ]
+    )
+
+    # Make kernel
+    collide_stream, initialize = make_warp_kernel(
+        velocity_weight,
+        velocity_set,
+        dtype=lattice_dtype,
+        dim=dim,
+        q=q,
+    )
+
+    # Initialize
+    cs = 1.0 / np.sqrt(3.0)
+    vel = 0.1 * cs
+    dx = 2.0 * np.pi / n
+    wp.launch(
+        initialize,
+        inputs=[
+            f0,
+            dx,
+            vel,
+            n,
+            n,
+            n,
+            tau,
+        ],
+        dim=(n, n, n),
+    )
+
+    # Compute MLUPS
+    import time
+    import tqdm
+    nr_iterations = 128
+    start = time.time()
+    for i in tqdm.tqdm(range(nr_iterations)):
+        #if i % 10 == 0:
+        #    plt_f(f0)
+
+        wp.launch(
+            collide_stream,
+            inputs=[
+                f0,
+                f1,
+                n,
+                n,
+                n,
+                tau,
+            ],
+            dim=(n, n, n),
+        )
+        f0, f1 = f1, f0
+    wp.synchronize()
+    end = time.time()
+    print("MLUPS: ", (nr_iterations * n * n * n) / (end - start) / 1e6)