diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
index 45ea2557..18c05def 100644
--- a/.github/workflows/ci_linux.yml
+++ b/.github/workflows/ci_linux.yml
@@ -73,18 +73,18 @@ jobs:
       - name: Clippy
         env:
           RUSTFLAGS: -Dwarnings
-        run: cargo clippy --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
+        run: cargo clippy --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
 
       - name: Build all bindings
         run: cargo build --all-features -p cust_raw
 
       - name: Build workspace
-        run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
+        run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
 
       - name: Check documentation
         env:
           RUSTDOCFLAGS: -Dwarnings
-        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
+        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
 
       - name: Prepare artifact details
         id: artifact_details
diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml
index f446987e..a5e1055a 100644
--- a/.github/workflows/ci_windows.yml
+++ b/.github/workflows/ci_windows.yml
@@ -66,7 +66,7 @@ jobs:
         run: cargo build --all-features -p cust_raw
 
       - name: Build
-        run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*"
+        run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*"
 
       # Don't currently test because many tests rely on the system having a CUDA GPU
       # - name: Test
@@ -75,4 +75,4 @@
       - name: Check documentation
         env:
           RUSTDOCFLAGS: -Dwarnings
-        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
+        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
diff --git a/Cargo.toml b/Cargo.toml
index 7dd130f5..4c495879 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,9 +8,14 @@
 members = [
     "xtask",
 
+    "examples/cuda/vecadd",
+    "examples/cuda/vecadd/kernels",
+    "examples/cuda/gemm",
+    "examples/cuda/gemm/kernels",
+    "examples/cuda/path_tracer",
+    "examples/cuda/path_tracer/kernels",
+    "examples/optix/*",
-    "examples/cuda/cpu/*",
-    "examples/cuda/gpu/*",
 ]
 
 exclude = [
diff --git a/examples/cuda/cpu/add/Cargo.toml b/examples/cuda/cpu/add/Cargo.toml
deleted file mode 100644
index 523e0a75..00000000
--- a/examples/cuda/cpu/add/Cargo.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-[package]
-name = "add"
-version = "0.1.0"
-edition = "2021"
-
-[dependencies]
-cust = { version = "0.3", path = "../../../../crates/cust" }
-nanorand = "0.6.1"
-
-# We don't depend on these directly, but want to pin them to specific versions.
-# This is because we're bound to a specific rustc version but cargo will chose
-# the newest semver compatible versions anyway.
-log = "=0.4.17"
-regex-syntax = "=0.6.28"
-regex = "=1.11.1"
-thread_local = "=1.1.4"
-rayon = "=1.10"
-rayon-core = "=1.12.1"
-byteorder = "=1.4.0"
-
-[build-dependencies]
-cuda_builder = { version = "0.3", path = "../../../../crates/cuda_builder" }
diff --git a/examples/cuda/cpu/add/build.rs b/examples/cuda/cpu/add/build.rs
deleted file mode 100644
index bb96d320..00000000
--- a/examples/cuda/cpu/add/build.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-use cuda_builder::CudaBuilder;
-
-fn main() {
-    CudaBuilder::new("../../gpu/add_gpu")
-        .copy_to("../../resources/add.ptx")
-        .build()
-        .unwrap();
-}
diff --git a/examples/cuda/cpu/path_tracer/build.rs b/examples/cuda/cpu/path_tracer/build.rs
deleted file mode 100644
index ce64785d..00000000
--- a/examples/cuda/cpu/path_tracer/build.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-use cuda_builder::CudaBuilder;
-
-fn main() {
-    CudaBuilder::new("../../gpu/path_tracer_gpu")
-        .copy_to("../../resources/path_tracer.ptx")
-        .build()
-        .unwrap();
-
-    CudaBuilder::new("../../gpu/path_tracer_gpu")
-        .copy_to("../../resources/path_tracer_optix.ptx")
-        .build_args(&["--features", "optix"])
-        .build()
-        .unwrap();
-}
diff --git a/examples/cuda/gemm/Cargo.toml b/examples/cuda/gemm/Cargo.toml
new file mode 100644
index 00000000..605a3ee2
--- /dev/null
+++ b/examples/cuda/gemm/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "gemm"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+blastoff = { path = "../../../crates/blastoff" }
+cuda_std = { path = "../../../crates/cuda_std" }
+cust = { path = "../../../crates/cust" }
+cust_raw = { path = "../../../crates/cust_raw", features = ["driver"] }
+ndarray = { version = "0.16", features = ["approx"] }
+ndarray-rand = "0.15.0"
+rand = "0.9"
+
+[build-dependencies]
+cuda_builder = { path = "../../../crates/cuda_builder" }
diff --git a/examples/cuda/gemm/build.rs b/examples/cuda/gemm/build.rs
new file mode 100644
index 00000000..9dfb16af
--- /dev/null
+++ b/examples/cuda/gemm/build.rs
@@ -0,0 +1,15 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    CudaBuilder::new("kernels")
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+}
diff --git a/examples/cuda/gemm/kernels/Cargo.toml b/examples/cuda/gemm/kernels/Cargo.toml
new file mode 100644
index 00000000..b92683ea
--- /dev/null
+++ b/examples/cuda/gemm/kernels/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "gemm-kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+glam = { version = "0.30.1", default-features = false, features = ["cuda", "nostd-libm"] }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
diff --git a/examples/cuda/gemm/kernels/src/gemm_naive.rs b/examples/cuda/gemm/kernels/src/gemm_naive.rs
new file mode 100644
index 00000000..02ba504f
--- /dev/null
+++ b/examples/cuda/gemm/kernels/src/gemm_naive.rs
@@ -0,0 +1,46 @@
+use cuda_std::kernel;
+use cuda_std::thread;
+
+#[kernel]
+#[allow(improper_ctypes_definitions)]
+/// Naive GEMM kernel for C = alpha * A * B + beta * C.
+///
+/// This kernel computes each element of the output matrix C independently, without any memory coalescing or tiling optimizations.
+///
+/// # Safety
+/// CUDA kernel requires unsafe.
+///
+/// # Parameters
+/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
+/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
+/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
+/// - `m`: Number of rows in A and C.
+/// - `n`: Number of columns in B and C.
+/// - `k`: Number of columns in A and rows in B.
+/// - `alpha`: Scalar multiplier for A * B.
+/// - `beta`: Scalar multiplier for C.
+///
+/// # Thread Mapping
+/// Each thread computes one element of C at (row, col).
+pub unsafe fn gemm_naive(
+    mat_a: &[f32],
+    mat_b: &[f32],
+    mat_c: *mut f32,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) {
+    let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
+    let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;
+
+    if row < m && col < n {
+        let mut sum = 0.0f32;
+        for i in 0..k {
+            sum += mat_a[row * k + i] * mat_b[i * n + col];
+        }
+        let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) };
+        *elem = alpha * sum + beta * *elem;
+    }
+}
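The naive kernel is a direct GPU transcription of the textbook triple loop: one thread per output element, `k` multiply-adds each. For validating results on the host, the same contract can be expressed as a plain CPU loop. A minimal sketch follows (`gemm_ref` is a hypothetical test helper, not part of this diff); the kernel's output should match it up to floating-point rounding:

```rust
/// CPU reference for C = alpha * A * B + beta * C, all matrices row-major.
/// Hypothetical helper for testing; not part of this diff.
fn gemm_ref(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize, alpha: f32, beta: f32) {
    assert_eq!(a.len(), m * k);
    assert_eq!(b.len(), k * n);
    assert_eq!(c.len(), m * n);
    for row in 0..m {
        for col in 0..n {
            // Same dot product the (row, col) thread computes on the GPU.
            let mut sum = 0.0f32;
            for i in 0..k {
                sum += a[row * k + i] * b[i * n + col];
            }
            c[row * n + col] = alpha * sum + beta * c[row * n + col];
        }
    }
}
```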
diff --git a/examples/cuda/gemm/kernels/src/gemm_tiled.rs b/examples/cuda/gemm/kernels/src/gemm_tiled.rs
new file mode 100644
index 00000000..6c0f00ec
--- /dev/null
+++ b/examples/cuda/gemm/kernels/src/gemm_tiled.rs
@@ -0,0 +1,83 @@
+use cuda_std::address_space;
+use cuda_std::kernel;
+use cuda_std::thread;
+
+#[kernel]
+#[allow(improper_ctypes_definitions)]
+/// Tiled GEMM kernel for C = alpha * A * B + beta * C.
+///
+/// This kernel uses shared memory tiling to improve memory access patterns and performance.
+///
+/// # Safety
+/// CUDA kernel requires unsafe.
+///
+/// # Parameters
+/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
+/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
+/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
+/// - `m`: Number of rows in A and C.
+/// - `n`: Number of columns in B and C.
+/// - `k`: Number of columns in A and rows in B.
+/// - `alpha`: Scalar multiplier for A * B.
+/// - `beta`: Scalar multiplier for C.
+///
+/// # Tiling
+/// Each block computes a TILE_SIZE x TILE_SIZE tile of C using shared memory for A and B tiles.
+/// Threads within a block collaboratively load tiles and compute partial sums.
+///
+/// # Thread Mapping
+/// Each thread computes one element of the output tile.
+pub unsafe fn gemm_tiled(
+    mat_a: &[f32],
+    mat_b: &[f32],
+    mat_c: *mut f32,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) {
+    const TILE_SIZE: usize = 16;
+
+    #[address_space(shared)]
+    static mut TILE_A: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
+    #[address_space(shared)]
+    static mut TILE_B: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
+
+    // Thread indices within the block.
+    let tx = thread::thread_idx_x() as usize;
+    let ty = thread::thread_idx_y() as usize;
+
+    // Calculate the row and column of mat_c this thread computes.
+    let row = thread::block_idx_x() as usize * TILE_SIZE + ty;
+    let col = thread::block_idx_y() as usize * TILE_SIZE + tx;
+
+    let mut sum = 0.0f32;
+    // Loop over tiles of mat_a and mat_b in the k dimension.
+    for kk in (0..k).step_by(TILE_SIZE) {
+        // Collaborative loading of tiles into shared memory.
+        if row < m && (kk + tx) < k {
+            unsafe { TILE_A[ty * TILE_SIZE + tx] = mat_a[row * k + (kk + tx)] };
+        } else {
+            unsafe { TILE_A[ty * TILE_SIZE + tx] = 0.0f32 };
+        }
+        if col < n && (kk + ty) < k {
+            unsafe { TILE_B[ty * TILE_SIZE + tx] = mat_b[(kk + ty) * n + col] };
+        } else {
+            unsafe { TILE_B[ty * TILE_SIZE + tx] = 0.0f32 };
+        }
+        thread::sync_threads();
+
+        // Perform the computation on the tile.
+        for i in 0..TILE_SIZE {
+            sum += unsafe { TILE_A[ty * TILE_SIZE + i] * TILE_B[i * TILE_SIZE + tx] };
+        }
+        thread::sync_threads();
+    }
+
+    // Write the result back to mat_c with alpha and beta scaling.
+    if row < m && col < n {
+        let c = unsafe { mat_c.add(row * n + col) };
+        unsafe { *c = alpha * sum + beta * *c };
+    }
+}
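The payoff of the tiled kernel is reuse: every f32 staged into `TILE_A` or `TILE_B` is read `TILE_SIZE` times from fast shared memory instead of global memory, cutting global-memory traffic by roughly that factor. The same blocking idea, minus the shared memory and thread cooperation, can be sketched on the CPU (a hypothetical `gemm_blocked` helper, alpha/beta omitted for brevity; not part of this diff):

```rust
const TILE: usize = 16;

/// CPU analogue of the tiled kernel: walk the shared `k` dimension in
/// TILE-sized chunks so each chunk of `a` and `b` stays hot in cache while
/// it is reused, mirroring the kernel's shared-memory tiles.
fn gemm_blocked(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
    c.fill(0.0);
    for kk in (0..k).step_by(TILE) {
        let kend = (kk + TILE).min(k);
        for row in 0..m {
            for col in 0..n {
                // Partial dot product over one tile of the k dimension.
                let mut sum = 0.0f32;
                for i in kk..kend {
                    sum += a[row * k + i] * b[i * n + col];
                }
                c[row * n + col] += sum;
            }
        }
    }
}
```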
diff --git a/examples/cuda/gemm/kernels/src/lib.rs b/examples/cuda/gemm/kernels/src/lib.rs
new file mode 100644
index 00000000..19fab562
--- /dev/null
+++ b/examples/cuda/gemm/kernels/src/lib.rs
@@ -0,0 +1,5 @@
+mod gemm_naive;
+mod gemm_tiled;
+
+pub use crate::gemm_naive::gemm_naive;
+pub use crate::gemm_tiled::gemm_tiled;
diff --git a/examples/cuda/gemm/src/main.rs b/examples/cuda/gemm/src/main.rs
new file mode 100644
index 00000000..ff8708c2
--- /dev/null
+++ b/examples/cuda/gemm/src/main.rs
@@ -0,0 +1,466 @@
+//! Example demonstrating GEMM (General Matrix Multiply) on CUDA using Rust-CUDA.
+//!
+//! This example benchmarks naive and tiled GEMM kernels as well as cuBLAS for various matrix sizes.
+//! It uses the `cust` crate for CUDA management and `ndarray` for host-side matrix operations.
+
+use std::cell;
+use std::error::Error;
+
+use cust::event;
+use cust::launch;
+use cust::memory;
+use cust::memory::CopyDestination as _;
+use cust::module;
+use cust::stream;
+use cust::util::SliceExt as _;
+use ndarray::Array;
+use ndarray_rand::RandomExt as _;
+use ndarray_rand::rand_distr::Uniform;
+
+const EPS: f32 = 0.01;
+const NUM_WARMUPS: usize = 2;
+const NUM_RUNS: usize = 10;
+const MAT_SIZES: [usize; 8] = [32, 64, 128, 256, 512, 1024, 2048, 4096];
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+type GemmFn = dyn Fn(
+    &stream::Stream,
+    &module::Module,
+    &memory::DeviceBuffer<f32>,
+    &memory::DeviceBuffer<f32>,
+    &mut memory::DeviceBuffer<f32>,
+    usize,
+    usize,
+    usize,
+    f32,
+    f32,
+) -> Result<(), Box<dyn Error>>;
+
+fn main() -> Result<(), Box<dyn Error>> {
+    // Initialize CUDA; this will pick the first available device and will
+    // make a CUDA context from it.
+    // We don't need the context for anything but it must be kept alive.
+    let _ctx = cust::quick_init()?;
+
+    // Make the CUDA module; modules just house the GPU code for the kernels we created.
+    // They can be made from PTX code, cubins, or fatbins.
+    let module = module::Module::from_ptx(PTX, &[])?;
+
+    // Make a CUDA stream to issue calls to. You can think of this as an OS thread but for dispatching
+    // GPU calls.
+    let stream = stream::Stream::new(stream::StreamFlags::NON_BLOCKING, None)?;
+
+    run_cublas(&stream)?;
+    run_gemm_kernel(&stream, &module, &gemm_naive, "gemm_naive")?;
+    run_gemm_kernel(&stream, &module, &gemm_tiled, "gemm_tiled")?;
+
+    Ok(())
+}
+
+/// Runs the cuBLAS GEMM for a set of matrix sizes and checks correctness.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launches and memory operations.
+///
+/// This function benchmarks cuBLAS GEMM and checks the result for small matrices.
+fn run_cublas(stream: &stream::Stream) -> Result<(), Box<dyn Error>> {
+    // Make a cuBLAS context which manages the cuBLAS internal GPU memory allocations.
+    let mut cublas_ctx = blastoff::CublasContext::new()?;
+
+    // Sanity check.
+    {
+        let mat_a = ndarray::arr2(&[[1.0, 2.0], [3.0, 4.0]]);
+        let mat_b = ndarray::arr2(&[[5.0, 6.0], [7.0, 8.0]]);
+        let mat_c_expect = ndarray::arr2(&[[19.0, 22.0], [43.0, 50.0]]);
+        let (alpha, beta) = (1.0, 0.0);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = unsafe { cust::memory::DeviceBuffer::uninitialized(2 * 2)? };
+        let alpha_gpu = cust::memory::DeviceBox::new(&alpha)?;
+        let beta_gpu = cust::memory::DeviceBox::new(&beta)?;
+
+        // ndarray uses row-major order, but cuBLAS uses column-major order.
+        // In that case, C = A x B is equivalent to C^T = B^T x A^T.
+        cublas_ctx.gemm::<f32>(
+            stream,
+            2,
+            2,
+            2,
+            &alpha_gpu,
+            &mat_b_gpu,
+            2,
+            blastoff::MatrixOp::None,
+            &beta_gpu,
+            &mat_a_gpu,
+            2,
+            blastoff::MatrixOp::None,
+            &mut mat_c_gpu,
+            2,
+        )?;
+        stream.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((2, 2));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        assert!(mat_c_expect.relative_eq(&mat_c_actual, EPS, EPS));
+    }
+
+    for sz in MAT_SIZES.iter().cloned() {
+        let mat_a = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_b = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_c = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let (alpha, beta) = (1.0, 0.0);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = mat_c.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let alpha_gpu = cust::memory::DeviceBox::new(&alpha)?;
+        let beta_gpu = cust::memory::DeviceBox::new(&beta)?;
+        stream.synchronize()?;
+
+        // Warm up before timing.
+        for _ in 0..NUM_WARMUPS {
+            cublas_ctx.gemm::<f32>(
+                stream,
+                sz,
+                sz,
+                sz,
+                &alpha_gpu,
+                &mat_b_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &beta_gpu,
+                &mat_a_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &mut mat_c_gpu,
+                sz,
+            )?;
+        }
+        stream.synchronize()?;
+
+        // Time the kernel execution.
+        let beg = event::Event::new(event::EventFlags::DEFAULT)?;
+        let end = event::Event::new(event::EventFlags::DEFAULT)?;
+        beg.record(stream)?;
+        for _ in 0..NUM_RUNS {
+            cublas_ctx.gemm::<f32>(
+                stream,
+                sz,
+                sz,
+                sz,
+                &alpha_gpu,
+                &mat_b_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &beta_gpu,
+                &mat_a_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &mut mat_c_gpu,
+                sz,
+            )?;
+            stream.synchronize()?;
+        }
+        end.record(stream)?;
+        beg.synchronize()?;
+        end.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((sz, sz));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        let duration = end.elapsed_time_f32(&beg)? / (NUM_RUNS as f32);
+        println!("cuBLAS {}x{}: {:.4}ms", sz, sz, duration);
+        if sz < 1024 {
+            assert_gemm_eq(&mat_a, &mat_b, &mat_c, alpha, beta, &mat_c_actual);
+        }
+    }
+
+    Ok(())
+}
+
+/// Runs a GEMM kernel (naive or tiled) for a set of matrix sizes and checks correctness.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launches and memory operations.
+/// * `module` - CUDA module containing the kernel.
+/// * `gemm_fn` - Function pointer to the GEMM kernel launcher.
+/// * `kernel_name` - Name of the kernel for logging.
+///
+/// This function benchmarks the provided GEMM kernel and checks the result for small matrices.
+fn run_gemm_kernel(
+    stream: &stream::Stream,
+    module: &module::Module,
+    gemm_fn: &GemmFn,
+    kernel_name: &str,
+) -> Result<(), Box<dyn Error>> {
+    // Sanity check.
+    {
+        let mat_a = ndarray::arr2::<f32, 2>(&[[1.0, 2.0], [3.0, 4.0]]);
+        let mat_b = ndarray::arr2::<f32, 2>(&[[5.0, 6.0], [7.0, 8.0]]);
+        let mat_c_expect = ndarray::arr2::<f32, 2>(&[[19.0, 22.0], [43.0, 50.0]]);
+        let (alpha, beta) = (1.0f32, 0.0f32);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = unsafe { cust::memory::DeviceBuffer::uninitialized(2 * 2)? };
+        stream.synchronize()?;
+
+        gemm_fn(
+            stream,
+            module,
+            &mat_a_gpu,
+            &mat_b_gpu,
+            &mut mat_c_gpu,
+            2,
+            2,
+            2,
+            alpha,
+            beta,
+        )?;
+        stream.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((2, 2));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        assert!(mat_c_expect.relative_eq(&mat_c_actual, EPS, EPS));
+    }
+
+    for sz in MAT_SIZES.iter().cloned() {
+        let mat_a = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_b = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_c = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let (alpha, beta) = (1.0f32, 0.0f32);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = mat_c.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        stream.synchronize()?;
+
+        // Warm up before timing.
+        for _ in 0..NUM_WARMUPS {
+            gemm_fn(
+                stream,
+                module,
+                &mat_a_gpu,
+                &mat_b_gpu,
+                &mut mat_c_gpu,
+                sz,
+                sz,
+                sz,
+                alpha,
+                beta,
+            )?;
+        }
+        stream.synchronize()?;
+
+        // Time the kernel execution.
+        let beg = event::Event::new(event::EventFlags::DEFAULT)?;
+        let end = event::Event::new(event::EventFlags::DEFAULT)?;
+        beg.record(stream)?;
+        for _ in 0..NUM_RUNS {
+            gemm_fn(
+                stream,
+                module,
+                &mat_a_gpu,
+                &mat_b_gpu,
+                &mut mat_c_gpu,
+                sz,
+                sz,
+                sz,
+                alpha,
+                beta,
+            )?;
+            stream.synchronize()?;
+        }
+        end.record(stream)?;
+        beg.synchronize()?;
+        end.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((sz, sz));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        let duration = end.elapsed_time_f32(&beg)? / (NUM_RUNS as f32);
+        println!("{} {}x{}: {:.4}ms", kernel_name, sz, sz, duration);
+        if sz < 1024 {
+            assert_gemm_eq(&mat_a, &mat_b, &mat_c, alpha, beta, &mat_c_actual);
+        }
+    }
+    Ok(())
+}
+
+/// Asserts that the GEMM result matches the expected value within a tolerance.
+///
+/// # Arguments
+/// * `mat_a` - Left matrix operand.
+/// * `mat_b` - Right matrix operand.
+/// * `mat_c` - Initial value of the output matrix.
+/// * `alpha` - Scalar multiplier for mat_a * mat_b.
+/// * `beta` - Scalar multiplier for mat_c.
+/// * `mat_c_actual` - Result from the device.
+fn assert_gemm_eq<T>(
+    mat_a: &ndarray::Array2<T>,
+    mat_b: &ndarray::Array2<T>,
+    mat_c: &ndarray::Array2<T>,
+    alpha: T,
+    beta: T,
+    mat_c_actual: &ndarray::Array2<T>,
+) where
+    T: Clone,
+    f64: From<T>,
+{
+    let mat_a = mat_a.mapv(|v| Into::<f64>::into(v));
+    let mat_b = mat_b.mapv(|v| Into::<f64>::into(v));
+    let mat_c = mat_c.mapv(|v| Into::<f64>::into(v));
+    let mat_c_actual = mat_c_actual.mapv(|v| Into::<f64>::into(v));
+    let alpha: f64 = alpha.into();
+    let beta: f64 = beta.into();
+    let mat_c_expect = alpha * mat_a.dot(&mat_b) + beta * mat_c;
+    let ok = mat_c_expect.relative_eq(&mat_c_actual, EPS.into(), EPS.into());
+    if !ok {
+        println!("Actual: {:?}", mat_c_actual);
+        println!("Expect: {:?}", mat_c_expect);
+        panic!("GEMM result mismatch");
+    }
+}
+
+/// Launches the naive GEMM kernel on the device.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launch.
+/// * `module` - CUDA module containing the kernel.
+/// * `mat_a` - Device buffer for left matrix operand (m x k).
+/// * `mat_b` - Device buffer for right matrix operand (k x n).
+/// * `mat_c` - Device buffer for output matrix (m x n).
+/// * `m` - Number of rows in mat_a and mat_c.
+/// * `n` - Number of columns in mat_b and mat_c.
+/// * `k` - Number of columns in mat_a and rows in mat_b.
+/// * `alpha` - Scalar multiplier for mat_a * mat_b.
+/// * `beta` - Scalar multiplier for mat_c.
+///
+/// This function configures the launch parameters and invokes the naive GEMM kernel.
+#[allow(clippy::too_many_arguments)]
+pub fn gemm_naive(
+    stream: &stream::Stream,
+    module: &module::Module,
+    mat_a: &memory::DeviceBuffer<f32>,
+    mat_b: &memory::DeviceBuffer<f32>,
+    mat_c: &mut memory::DeviceBuffer<f32>,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) -> Result<(), Box<dyn Error>> {
+    assert_eq!(mat_a.len(), m * k);
+    assert_eq!(mat_b.len(), k * n);
+    assert_eq!(mat_c.len(), m * n);
+
+    let kernel_cell = cell::LazyCell::new(|| {
+        module
+            .get_function("gemm_naive")
+            .expect("kernel not found.")
+    });
+    let kernel = &*kernel_cell;
+
+    // Use the CUDA occupancy API to find an optimal launch configuration for the grid and block size.
+    // This will try to maximize how much of the GPU is used by finding the best launch configuration for the
+    // current CUDA device/architecture.
+    let (_, block_size) = kernel.suggested_launch_configuration(0, 0.into())?;
+    let block_size = block_size as usize;
+    let (block_size_x, block_size_y) = if block_size > m * n {
+        (block_size.div_ceil(m) as u32, m as u32)
+    } else {
+        (1, block_size as u32)
+    };
+    let (grid_size_x, grid_size_y) = (
+        (m as u32).div_ceil(block_size_x),
+        (n as u32).div_ceil(block_size_y),
+    );
+    unsafe {
+        launch!(
+            kernel<<<
+                (grid_size_x, grid_size_y),
+                (block_size_x, block_size_y),
+                0,
+                stream
+            >>>(
+                mat_a.as_device_ptr(),
+                mat_a.len(),
+                mat_b.as_device_ptr(),
+                mat_b.len(),
+                mat_c.as_device_ptr(),
+                m,
+                n,
+                k,
+                alpha,
+                beta,
+            )
+        )?;
+    };
+    Ok(())
+}
+
+/// Launches the tiled GEMM kernel on the device.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launch.
+/// * `module` - CUDA module containing the kernel.
+/// * `mat_a` - Device buffer for left matrix operand (m x k).
+/// * `mat_b` - Device buffer for right matrix operand (k x n).
+/// * `mat_c` - Device buffer for output matrix (m x n).
+/// * `m` - Number of rows in mat_a and mat_c.
+/// * `n` - Number of columns in mat_b and mat_c.
+/// * `k` - Number of columns in mat_a and rows in mat_b.
+/// * `alpha` - Scalar multiplier for mat_a * mat_b.
+/// * `beta` - Scalar multiplier for mat_c.
+///
+/// This function configures the launch parameters and invokes the tiled GEMM kernel.
+#[allow(clippy::too_many_arguments)]
+pub fn gemm_tiled(
+    stream: &stream::Stream,
+    module: &module::Module,
+    mat_a: &memory::DeviceBuffer<f32>,
+    mat_b: &memory::DeviceBuffer<f32>,
+    mat_c: &mut memory::DeviceBuffer<f32>,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) -> Result<(), Box<dyn Error>> {
+    assert_eq!(mat_a.len(), m * k);
+    assert_eq!(mat_b.len(), k * n);
+    assert_eq!(mat_c.len(), m * n);
+
+    // These values must be aligned with the kernel code.
+    const TILE_SIZE: usize = 16;
+
+    let kernel_cell = cell::LazyCell::new(|| {
+        module
+            .get_function("gemm_tiled")
+            .expect("kernel not found.")
+    });
+    let kernel = &*kernel_cell;
+
+    let (grid_size_x, grid_size_y) = (n.div_ceil(TILE_SIZE) as u32, m.div_ceil(TILE_SIZE) as u32);
+    unsafe {
+        launch!(
+            kernel<<<
+                (grid_size_x, grid_size_y),
+                (TILE_SIZE as u32, TILE_SIZE as u32),
+                0,
+                stream
+            >>>(
+                mat_a.as_device_ptr(),
+                mat_a.len(),
+                mat_b.as_device_ptr(),
+                mat_b.len(),
+                mat_c.as_device_ptr(),
+                m,
+                n,
+                k,
+                alpha,
+                beta,
+            )
+        )?;
+    };
+    Ok(())
+}
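The operand order in the cuBLAS calls above is the row-major/column-major trick from the inline comment: reinterpreting a row-major buffer as column-major is an implicit transpose, so asking column-major cuBLAS for B^T * A^T leaves exactly A * B in the row-major output buffer, with both `MatrixOp`s left as `None`. The identity (B^T * A^T)^T = A * B is easy to confirm on the host with the same `ndarray` API the example uses (a standalone sketch, not part of this diff):

```rust
use ndarray::arr2;

fn main() {
    let a = arr2(&[[1.0_f64, 2.0], [3.0, 4.0]]);
    let b = arr2(&[[5.0_f64, 6.0], [7.0, 8.0]]);
    // (B^T * A^T)^T == A * B: what cuBLAS effectively computes when it is
    // handed the row-major buffers of B and A (in that order) as
    // column-major data.
    let via_transpose = b.t().dot(&a.t()).t().to_owned();
    assert_eq!(via_transpose, a.dot(&b));
}
```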
diff --git a/examples/cuda/gpu/add_gpu/Cargo.toml b/examples/cuda/gpu/add_gpu/Cargo.toml
deleted file mode 100644
index 48321561..00000000
--- a/examples/cuda/gpu/add_gpu/Cargo.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[package]
-name = "add_gpu"
-version = "0.1.0"
-edition = "2021"
-
-[dependencies]
-cuda_std = { version = "0.2", path = "../../../../crates/cuda_std" }
-
-[lib]
-crate-type = ["cdylib", "rlib"]
diff --git a/examples/cuda/cpu/path_tracer/Cargo.toml b/examples/cuda/path_tracer/Cargo.toml
similarity index 50%
rename from examples/cuda/cpu/path_tracer/Cargo.toml
rename to examples/cuda/path_tracer/Cargo.toml
index 763ba0e5..46619e01 100644
--- a/examples/cuda/cpu/path_tracer/Cargo.toml
+++ b/examples/cuda/path_tracer/Cargo.toml
@@ -1,16 +1,18 @@
 [package]
-name = "path_tracer"
+name = "path-tracer"
 version = "0.1.0"
 edition = "2018"
 
 [dependencies]
 vek = { version = "0.17.1", features = ["bytemuck", "mint"] }
 bytemuck = { version = "1.21", features = ["derive"] }
-cust = { version = "0.3", path = "../../../../crates/cust", features = ["impl_vek"] }
+cust = { version = "0.3", path = "../../../crates/cust", features = [
+    "impl_vek",
+] }
 image = "0.25.5"
-path_tracer_gpu = { path = "../../gpu/path_tracer_gpu" }
-gpu_rand = { version = "0.1", path = "../../../../crates/gpu_rand" }
-optix = { version = "0.1", path = "../../../../crates/optix" }
+path-tracer-kernels = { path = "kernels" }
+gpu_rand = { version = "0.1", path = "../../../crates/gpu_rand" }
+optix = { version = "0.1", path = "../../../crates/optix" }
 glium = "0.32.0"
 glutin = "0.28.0"
 imgui = "0.9.0"
@@ -21,4 +23,4 @@ sysinfo = "0.33.1"
 anyhow = "1.0.53"
 
 [build-dependencies]
-cuda_builder = { version = "0.3", path = "../../../../crates/cuda_builder" }
+cuda_builder = { version = "0.3", path = "../../../crates/cuda_builder" }
diff --git a/examples/cuda/path_tracer/build.rs b/examples/cuda/path_tracer/build.rs
new file mode 100644
index 00000000..da3f5a50
--- /dev/null
+++ b/examples/cuda/path_tracer/build.rs
@@ -0,0 +1,17 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    CudaBuilder::new("kernels")
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+    CudaBuilder::new("kernels")
+        .copy_to(out_path.join("kernels_optix.ptx"))
+        .build_args(&["--features", "optix"])
+        .build()
+        .unwrap();
+}
diff --git a/examples/cuda/gpu/path_tracer_gpu/Cargo.toml b/examples/cuda/path_tracer/kernels/Cargo.toml
similarity index 93%
rename from examples/cuda/gpu/path_tracer_gpu/Cargo.toml
rename to examples/cuda/path_tracer/kernels/Cargo.toml
index 02cbc45f..97a92bd2 100644
--- a/examples/cuda/gpu/path_tracer_gpu/Cargo.toml
+++ b/examples/cuda/path_tracer/kernels/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "path_tracer_gpu"
+name = "path-tracer-kernels"
 version = "0.1.0"
 edition = "2018"
 
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/hittable.rs b/examples/cuda/path_tracer/kernels/src/hittable.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/hittable.rs
rename to examples/cuda/path_tracer/kernels/src/hittable.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/lib.rs b/examples/cuda/path_tracer/kernels/src/lib.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/lib.rs
rename to examples/cuda/path_tracer/kernels/src/lib.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/material.rs b/examples/cuda/path_tracer/kernels/src/material.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/material.rs
rename to examples/cuda/path_tracer/kernels/src/material.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/math.rs b/examples/cuda/path_tracer/kernels/src/math.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/math.rs
rename to examples/cuda/path_tracer/kernels/src/math.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/optix.rs b/examples/cuda/path_tracer/kernels/src/optix.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/optix.rs
rename to examples/cuda/path_tracer/kernels/src/optix.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/render.rs b/examples/cuda/path_tracer/kernels/src/render.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/render.rs
rename to examples/cuda/path_tracer/kernels/src/render.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/render_kernels.rs b/examples/cuda/path_tracer/kernels/src/render_kernels.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/render_kernels.rs
rename to examples/cuda/path_tracer/kernels/src/render_kernels.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/scene.rs b/examples/cuda/path_tracer/kernels/src/scene.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/scene.rs
rename to examples/cuda/path_tracer/kernels/src/scene.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/sphere.rs b/examples/cuda/path_tracer/kernels/src/sphere.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/sphere.rs
rename to examples/cuda/path_tracer/kernels/src/sphere.rs
diff --git a/examples/cuda/cpu/path_tracer/shaders/image.frag b/examples/cuda/path_tracer/shaders/image.frag
similarity index 100%
rename from examples/cuda/cpu/path_tracer/shaders/image.frag
rename to examples/cuda/path_tracer/shaders/image.frag
diff --git a/examples/cuda/cpu/path_tracer/shaders/image.vert b/examples/cuda/path_tracer/shaders/image.vert
similarity index 100%
rename from examples/cuda/cpu/path_tracer/shaders/image.vert
rename to examples/cuda/path_tracer/shaders/image.vert
diff --git a/examples/cuda/cpu/path_tracer/src/common.rs b/examples/cuda/path_tracer/src/common.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/common.rs
rename to examples/cuda/path_tracer/src/common.rs
index bfe69df0..969ff946 100644
--- a/examples/cuda/cpu/path_tracer/src/common.rs
+++ b/examples/cuda/path_tracer/src/common.rs
@@ -1,7 +1,7 @@
 use glium::glutin::event::{
     ElementState, Event, MouseButton, MouseScrollDelta, VirtualKeyCode, WindowEvent,
 };
-use path_tracer_gpu::Viewport;
+use path_tracer_kernels::Viewport;
 use vek::{Vec2, Vec3};
 
 #[derive(Debug, Clone, Copy, PartialEq)]
diff --git a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs b/examples/cuda/path_tracer/src/cpu/mod.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/cpu/mod.rs
rename to examples/cuda/path_tracer/src/cpu/mod.rs
index 217bde5e..3fad1581 100644
--- a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs
+++ b/examples/cuda/path_tracer/src/cpu/mod.rs
@@ -2,7 +2,7 @@ use std::time::Duration;
 
 use gpu_rand::{DefaultRand, GpuRand};
 use imgui::Ui;
-use path_tracer_gpu::{
+use path_tracer_kernels::{
     material::MaterialKind, render::generate_ray, scene::Scene, Object, Viewport,
 };
 use rayon::prelude::*;
diff --git a/examples/cuda/cpu/path_tracer/src/cuda/data.rs b/examples/cuda/path_tracer/src/cuda/data.rs
similarity index 98%
rename from examples/cuda/cpu/path_tracer/src/cuda/data.rs
rename to examples/cuda/path_tracer/src/cuda/data.rs
index 9e71cef6..d7f2d224 100644
--- a/examples/cuda/cpu/path_tracer/src/cuda/data.rs
+++ b/examples/cuda/path_tracer/src/cuda/data.rs
@@ -6,7 +6,7 @@ use cust::{
     util::SliceExt,
 };
 use gpu_rand::DefaultRand;
-use path_tracer_gpu::{material::MaterialKind, scene::Scene, Object, Viewport};
+use path_tracer_kernels::{material::MaterialKind, scene::Scene, Object, Viewport};
 use vek::{Vec2, Vec3};
 
 use super::SEED;
diff --git a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs b/examples/cuda/path_tracer/src/cuda/mod.rs
similarity index 98%
rename from examples/cuda/cpu/path_tracer/src/cuda/mod.rs
rename to examples/cuda/path_tracer/src/cuda/mod.rs
index 7d92c645..f737e6e8 100644
--- a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs
+++ b/examples/cuda/path_tracer/src/cuda/mod.rs
@@ -18,7 +18,7 @@ use optix::{
     context::DeviceContext,
     denoiser::{Denoiser, DenoiserModelKind, Image, ImageFormat},
 };
-use path_tracer_gpu::scene::Scene;
+use path_tracer_kernels::scene::Scene;
 use vek::{Vec2, Vec3};
 
 /// Seed for the random states
@@ -29,7 +29,7 @@ pub const SEED: u64 = 932174513921034;
 /// This should always be a multiple of warp size (32) to maximize occupancy.
 const THREAD_BLOCK_AXIS_LENGTH: usize = 16;
 
-pub(crate) static PTX: &str = include_str!("../../../../resources/path_tracer.ptx");
+pub(crate) static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
 
 pub struct CudaRenderer {
     stream: Stream,
diff --git a/examples/cuda/cpu/path_tracer/src/main.rs b/examples/cuda/path_tracer/src/main.rs
similarity index 98%
rename from examples/cuda/cpu/path_tracer/src/main.rs
rename to examples/cuda/path_tracer/src/main.rs
index 4311c172..d4ff4470 100644
--- a/examples/cuda/cpu/path_tracer/src/main.rs
+++ b/examples/cuda/path_tracer/src/main.rs
@@ -6,7 +6,7 @@ pub mod renderer;
 pub mod viewer;
 
 use common::Camera;
-use path_tracer_gpu::{
+use path_tracer_kernels::{
     material::{DielectricMaterial, DiffuseMaterial, MaterialKind, MetallicMaterial},
     scene::Scene,
     sphere::Sphere,
diff --git a/examples/cuda/cpu/path_tracer/src/optix/mod.rs b/examples/cuda/path_tracer/src/optix/mod.rs
similarity index 97%
rename from examples/cuda/cpu/path_tracer/src/optix/mod.rs
rename to examples/cuda/path_tracer/src/optix/mod.rs
index bf8bdc09..57191234 100644
--- a/examples/cuda/cpu/path_tracer/src/optix/mod.rs
+++ b/examples/cuda/path_tracer/src/optix/mod.rs
@@ -17,13 +17,13 @@ use optix::{
     },
     shader_binding_table::{SbtRecord, ShaderBindingTable},
 };
-use path_tracer_gpu::{optix::LaunchParams, scene::Scene, sphere::Sphere, Object};
+use path_tracer_kernels::{optix::LaunchParams, scene::Scene, sphere::Sphere, Object};
 
 pub type RaygenRecord = SbtRecord;
 pub type MissRecord = SbtRecord;
 pub type SphereHitgroupRecord = SbtRecord;
 
-pub(crate) static OPTIX_PTX: &str = include_str!("../../../../resources/path_tracer_optix.ptx");
+pub(crate) static OPTIX_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels_optix.ptx"));
 
 /// A subset of the CUDA renderer that uses hardware raytracing with OptiX
 pub struct OptixRenderer {
diff --git a/examples/cuda/cpu/path_tracer/src/renderer.rs b/examples/cuda/path_tracer/src/renderer.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/renderer.rs
rename to examples/cuda/path_tracer/src/renderer.rs
index 99e368ae..9bda19d0 100644
--- a/examples/cuda/cpu/path_tracer/src/renderer.rs
+++ b/examples/cuda/path_tracer/src/renderer.rs
@@ -1,6 +1,6 @@
 use glium::glutin::{event::Event, event_loop::ControlFlow};
 use imgui::Ui;
-use path_tracer_gpu::scene::Scene;
+use path_tracer_kernels::scene::Scene;
 use sysinfo::System;
 use vek::Vec2;
 
diff --git a/examples/cuda/cpu/path_tracer/src/viewer.rs b/examples/cuda/path_tracer/src/viewer.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/viewer.rs
rename to examples/cuda/path_tracer/src/viewer.rs
index 9f040987..3237f210 100644
--- a/examples/cuda/cpu/path_tracer/src/viewer.rs
+++ b/examples/cuda/path_tracer/src/viewer.rs
@@ -14,7 +14,7 @@ use glium::{
 };
 use imgui::Condition;
 use imgui_winit_support::{HiDpiMode, WinitPlatform};
-use path_tracer_gpu::scene::Scene;
+use path_tracer_kernels::scene::Scene;
 use std::time::Instant;
 use vek::Vec2;
 
diff --git a/examples/cuda/resources/.gitignore b/examples/cuda/resources/.gitignore
deleted file mode 100644
index baabe46d..00000000
--- a/examples/cuda/resources/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.ptx
\ No newline at end of file
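With every example now compiling its own kernels crate at build time, the shared `resources/` directory of pre-copied PTX disappears, and the `.gitignore` that hid those artifacts goes with it. The replacement is the build-script pattern used by gemm, the path tracer, and vecadd above; condensed to its host-side half (a sketch, names matching the diff):

```rust
// The build script drops kernels.ptx into OUT_DIR (see the build.rs files
// above); the host crate then embeds that PTX at compile time and JIT-loads
// it at startup, so no PTX files need to ship alongside the executable.
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

fn load_module() -> cust::error::CudaResult<cust::module::Module> {
    cust::module::Module::from_ptx(PTX, &[])
}
```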
= "../../../crates/cust" } +nanorand = "0.7" + +[build-dependencies] +cuda_builder = { path = "../../../crates/cuda_builder" } diff --git a/examples/cuda/vecadd/build.rs b/examples/cuda/vecadd/build.rs new file mode 100644 index 00000000..2046e342 --- /dev/null +++ b/examples/cuda/vecadd/build.rs @@ -0,0 +1,15 @@ +use std::env; +use std::path; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo:rerun-if-changed=build.rs"); + println!("cargo:rerun-if-changed=kernels"); + + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + CudaBuilder::new("kernels") + .copy_to(out_path.join("kernels.ptx")) + .build() + .unwrap(); +} diff --git a/examples/cuda/vecadd/kernels/Cargo.toml b/examples/cuda/vecadd/kernels/Cargo.toml new file mode 100644 index 00000000..cb84009d --- /dev/null +++ b/examples/cuda/vecadd/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "vecadd-kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/examples/cuda/gpu/add_gpu/src/lib.rs b/examples/cuda/vecadd/kernels/src/lib.rs similarity index 66% rename from examples/cuda/gpu/add_gpu/src/lib.rs rename to examples/cuda/vecadd/kernels/src/lib.rs index 42f14817..ba1c3038 100644 --- a/examples/cuda/gpu/add_gpu/src/lib.rs +++ b/examples/cuda/vecadd/kernels/src/lib.rs @@ -2,10 +2,10 @@ use cuda_std::prelude::*; #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] -pub unsafe fn add(a: &[f32], b: &[f32], c: *mut f32) { +pub unsafe fn vecadd(a: &[f32], b: &[f32], c: *mut f32) { let idx = thread::index_1d() as usize; if idx < a.len() { - let elem = &mut *c.add(idx); + let elem = unsafe { &mut *c.add(idx) }; *elem = a[idx] + b[idx]; } } diff --git a/examples/cuda/cpu/add/src/main.rs b/examples/cuda/vecadd/src/main.rs similarity index 87% rename from examples/cuda/cpu/add/src/main.rs rename to examples/cuda/vecadd/src/main.rs index 69c34b4c..c6263f02 100644 --- a/examples/cuda/cpu/add/src/main.rs +++ b/examples/cuda/vecadd/src/main.rs @@ -5,7 +5,7 @@ use std::error::Error; /// How many numbers to generate and add together. const NUMBERS_LEN: usize = 100_000; -static PTX: &str = include_str!("../../../resources/add.ptx"); +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); fn main() -> Result<(), Box> { // generate our random vectors. @@ -37,13 +37,13 @@ fn main() -> Result<(), Box> { let mut out = vec![0.0f32; NUMBERS_LEN]; let out_buf = out.as_slice().as_dbuf()?; - // retrieve the add kernel from the module so we can calculate the right launch config. - let func = module.get_function("add")?; + // retrieve the `vecadd` kernel from the module so we can calculate the right launch config. + let vecadd = module.get_function("vecadd")?; // use the CUDA occupancy API to find an optimal launch configuration for the grid and block size. // This will try to maximize how much of the GPU is used by finding the best launch configuration for the // current CUDA device/architecture. - let (_, block_size) = func.suggested_launch_configuration(0, 0.into())?; + let (_, block_size) = vecadd.suggested_launch_configuration(0, 0.into())?; let grid_size = (NUMBERS_LEN as u32).div_ceil(block_size); @@ -57,7 +57,7 @@ fn main() -> Result<(), Box> { unsafe { launch!( // slices are passed as two parameters, the pointer and the length. - func<<>>( + vecadd<<>>( lhs_gpu.as_device_ptr(), lhs_gpu.len(), rhs_gpu.as_device_ptr(),