diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
index 45ea2557..18c05def 100644
--- a/.github/workflows/ci_linux.yml
+++ b/.github/workflows/ci_linux.yml
@@ -73,18 +73,18 @@ jobs:
       - name: Clippy
         env:
           RUSTFLAGS: -Dwarnings
-        run: cargo clippy --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
+        run: cargo clippy --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
 
       - name: Build all bindings
         run: cargo build --all-features -p cust_raw
 
       - name: Build workspace
-        run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
+        run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
 
       - name: Check documentation
         env:
           RUSTDOCFLAGS: -Dwarnings
-        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
+        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
 
       - name: Prepare artifact details
         id: artifact_details
diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml
index f446987e..a5e1055a 100644
--- a/.github/workflows/ci_windows.yml
+++ b/.github/workflows/ci_windows.yml
@@ -66,7 +66,7 @@ jobs:
         run: cargo build --all-features -p cust_raw
 
       - name: Build
-        run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*"
+        run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*"
 
       # Don't currently test because many tests rely on the system having a CUDA GPU
       # - name: Test
@@ -75,4 +75,4 @@
       - name: Check documentation
         env:
           RUSTDOCFLAGS: -Dwarnings
-        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
+        run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
diff --git a/Cargo.toml b/Cargo.toml
index 7dd130f5..4c495879 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,9 +8,14 @@
 members = [
     "xtask",
 
+    "examples/cuda/vecadd",
+    "examples/cuda/vecadd/kernels",
+    "examples/cuda/gemm",
+    "examples/cuda/gemm/kernels",
+    "examples/cuda/path_tracer",
+    "examples/cuda/path_tracer/kernels",
+    "examples/optix/*",
-    "examples/cuda/cpu/*",
-    "examples/cuda/gpu/*",
 ]
 
 exclude = [
diff --git a/examples/cuda/cpu/add/Cargo.toml b/examples/cuda/cpu/add/Cargo.toml
deleted file mode 100644
index 523e0a75..00000000
--- a/examples/cuda/cpu/add/Cargo.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-[package]
-name = "add"
-version = "0.1.0"
-edition = "2021"
-
-[dependencies]
-cust = { version = "0.3", path = "../../../../crates/cust" }
-nanorand = "0.6.1"
-
-# We don't depend on these directly, but want to pin them to specific versions.
-# This is because we're bound to a specific rustc version but cargo will chose
-# the newest semver compatible versions anyway.
-log = "=0.4.17"
-regex-syntax = "=0.6.28"
-regex = "=1.11.1"
-thread_local = "=1.1.4"
-rayon = "=1.10"
-rayon-core = "=1.12.1"
-byteorder = "=1.4.0"
-
-[build-dependencies]
-cuda_builder = { version = "0.3", path = "../../../../crates/cuda_builder" }
diff --git a/examples/cuda/cpu/add/build.rs b/examples/cuda/cpu/add/build.rs
deleted file mode 100644
index bb96d320..00000000
--- a/examples/cuda/cpu/add/build.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-use cuda_builder::CudaBuilder;
-
-fn main() {
-    CudaBuilder::new("../../gpu/add_gpu")
-        .copy_to("../../resources/add.ptx")
-        .build()
-        .unwrap();
-}
diff --git a/examples/cuda/cpu/path_tracer/build.rs b/examples/cuda/cpu/path_tracer/build.rs
deleted file mode 100644
index ce64785d..00000000
--- a/examples/cuda/cpu/path_tracer/build.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-use cuda_builder::CudaBuilder;
-
-fn main() {
-    CudaBuilder::new("../../gpu/path_tracer_gpu")
-        .copy_to("../../resources/path_tracer.ptx")
-        .build()
-        .unwrap();
-
-    CudaBuilder::new("../../gpu/path_tracer_gpu")
-        .copy_to("../../resources/path_tracer_optix.ptx")
-        .build_args(&["--features", "optix"])
-        .build()
-        .unwrap();
-}
diff --git a/examples/cuda/gemm/Cargo.toml b/examples/cuda/gemm/Cargo.toml
new file mode 100644
index 00000000..605a3ee2
--- /dev/null
+++ b/examples/cuda/gemm/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "gemm"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+blastoff = { path = "../../../crates/blastoff" }
+cuda_std = { path = "../../../crates/cuda_std" }
+cust = { path = "../../../crates/cust" }
+cust_raw = { path = "../../../crates/cust_raw", features = ["driver"] }
+ndarray = { version = "0.16", features = ["approx"] }
+ndarray-rand = "0.15.0"
+rand = "0.9"
+
+[build-dependencies]
+cuda_builder = { path = "../../../crates/cuda_builder" }
diff --git a/examples/cuda/gemm/build.rs b/examples/cuda/gemm/build.rs
new file mode 100644
index 00000000..9dfb16af
--- /dev/null
+++ b/examples/cuda/gemm/build.rs
@@ -0,0 +1,15 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    CudaBuilder::new("kernels")
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+}
diff --git a/examples/cuda/gemm/kernels/Cargo.toml b/examples/cuda/gemm/kernels/Cargo.toml
new file mode 100644
index 00000000..b92683ea
--- /dev/null
+++ b/examples/cuda/gemm/kernels/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "gemm-kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+glam = { version = "0.30.1", default-features = false, features = ["cuda", "nostd-libm"] }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
diff --git a/examples/cuda/gemm/kernels/src/gemm_naive.rs b/examples/cuda/gemm/kernels/src/gemm_naive.rs
new file mode 100644
index 00000000..02ba504f
--- /dev/null
+++ b/examples/cuda/gemm/kernels/src/gemm_naive.rs
@@ -0,0 +1,46 @@
+use cuda_std::kernel;
+use cuda_std::thread;
+
+#[kernel]
+#[allow(improper_ctypes_definitions)]
+/// Naive GEMM kernel for C = alpha * A * B + beta * C.
+///
+/// This kernel computes each element of the output matrix C independently, without any memory coalescing or tiling optimizations.
+///
+/// # Safety
+/// CUDA kernel requires unsafe.
+///
+/// # Parameters
+/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
+/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
+/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
+/// - `m`: Number of rows in A and C.
+/// - `n`: Number of columns in B and C.
+/// - `k`: Number of columns in A and rows in B.
+/// - `alpha`: Scalar multiplier for A * B.
+/// - `beta`: Scalar multiplier for C.
+///
+/// # Thread Mapping
+/// Each thread computes one element of C at (row, col).
+pub unsafe fn gemm_naive(
+    mat_a: &[f32],
+    mat_b: &[f32],
+    mat_c: *mut f32,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) {
+    let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
+    let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;
+
+    if row < m && col < n {
+        let mut sum = 0.0f32;
+        for i in 0..k {
+            sum += mat_a[row * k + i] * mat_b[i * n + col];
+        }
+        let elem = unsafe { &mut *mat_c.add((row * n + col) as usize) };
+        *elem = alpha * sum + beta * *elem;
+    }
+}
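The naive kernel is a direct GPU transcription of the textbook triple loop: one thread per output element, `k` multiply-adds each. For validating results on the host, the same contract can be expressed as a plain CPU loop. A minimal sketch follows (`gemm_ref` is a hypothetical test helper, not part of this diff); the kernel's output should match it up to floating-point rounding:

```rust
/// CPU reference for C = alpha * A * B + beta * C, all matrices row-major.
/// Hypothetical helper for testing; not part of this diff.
fn gemm_ref(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize, alpha: f32, beta: f32) {
    assert_eq!(a.len(), m * k);
    assert_eq!(b.len(), k * n);
    assert_eq!(c.len(), m * n);
    for row in 0..m {
        for col in 0..n {
            // Same dot product the (row, col) thread computes on the GPU.
            let mut sum = 0.0f32;
            for i in 0..k {
                sum += a[row * k + i] * b[i * n + col];
            }
            c[row * n + col] = alpha * sum + beta * c[row * n + col];
        }
    }
}
```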
diff --git a/examples/cuda/gemm/kernels/src/gemm_tiled.rs b/examples/cuda/gemm/kernels/src/gemm_tiled.rs
new file mode 100644
index 00000000..6c0f00ec
--- /dev/null
+++ b/examples/cuda/gemm/kernels/src/gemm_tiled.rs
@@ -0,0 +1,83 @@
+use cuda_std::address_space;
+use cuda_std::kernel;
+use cuda_std::thread;
+
+#[kernel]
+#[allow(improper_ctypes_definitions)]
+/// Tiled GEMM kernel for C = alpha * A * B + beta * C.
+///
+/// This kernel uses shared memory tiling to improve memory access patterns and performance.
+///
+/// # Safety
+/// CUDA kernel requires unsafe.
+///
+/// # Parameters
+/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
+/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
+/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
+/// - `m`: Number of rows in A and C.
+/// - `n`: Number of columns in B and C.
+/// - `k`: Number of columns in A and rows in B.
+/// - `alpha`: Scalar multiplier for A * B.
+/// - `beta`: Scalar multiplier for C.
+///
+/// # Tiling
+/// Each block computes a TILE_SIZE x TILE_SIZE tile of C using shared memory for A and B tiles.
+/// Threads within a block collaboratively load tiles and compute partial sums.
+///
+/// # Thread Mapping
+/// Each thread computes one element of the output tile.
+pub unsafe fn gemm_tiled(
+    mat_a: &[f32],
+    mat_b: &[f32],
+    mat_c: *mut f32,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) {
+    const TILE_SIZE: usize = 16;
+
+    #[address_space(shared)]
+    static mut TILE_A: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
+    #[address_space(shared)]
+    static mut TILE_B: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
+
+    // Thread indices within the block.
+    let tx = thread::thread_idx_x() as usize;
+    let ty = thread::thread_idx_y() as usize;
+
+    // Calculate the row and column of mat_c this thread computes.
+    let row = thread::block_idx_x() as usize * TILE_SIZE + ty;
+    let col = thread::block_idx_y() as usize * TILE_SIZE + tx;
+
+    let mut sum = 0.0f32;
+    // Loop over tiles of mat_a and mat_b in the k dimension.
+    for kk in (0..k).step_by(TILE_SIZE) {
+        // Collaborative loading of tiles into shared memory.
+        if row < m && (kk + tx) < k {
+            unsafe { TILE_A[ty * TILE_SIZE + tx] = mat_a[row * k + (kk + tx)] };
+        } else {
+            unsafe { TILE_A[ty * TILE_SIZE + tx] = 0.0f32 };
+        }
+        if col < n && (kk + ty) < k {
+            unsafe { TILE_B[ty * TILE_SIZE + tx] = mat_b[(kk + ty) * n + col] };
+        } else {
+            unsafe { TILE_B[ty * TILE_SIZE + tx] = 0.0f32 };
+        }
+        thread::sync_threads();
+
+        // Perform the computation on the tile.
+        for i in 0..TILE_SIZE {
+            sum += unsafe { TILE_A[ty * TILE_SIZE + i] * TILE_B[i * TILE_SIZE + tx] };
+        }
+        thread::sync_threads();
+    }
+
+    // Write the result back to mat_c with alpha and beta scaling.
+    if row < m && col < n {
+        let c = unsafe { mat_c.add(row * n + col) };
+        unsafe { *c = alpha * sum + beta * *c };
+    }
+}
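The payoff of the tiled kernel is reuse: every f32 staged into `TILE_A` or `TILE_B` is read `TILE_SIZE` times from fast shared memory instead of global memory, cutting global-memory traffic by roughly that factor. The same blocking idea, minus the shared memory and thread cooperation, can be sketched on the CPU (a hypothetical `gemm_blocked` helper, alpha/beta omitted for brevity; not part of this diff):

```rust
const TILE: usize = 16;

/// CPU analogue of the tiled kernel: walk the shared `k` dimension in
/// TILE-sized chunks so each chunk of `a` and `b` stays hot in cache while
/// it is reused, mirroring the kernel's shared-memory tiles.
fn gemm_blocked(a: &[f32], b: &[f32], c: &mut [f32], m: usize, n: usize, k: usize) {
    c.fill(0.0);
    for kk in (0..k).step_by(TILE) {
        let kend = (kk + TILE).min(k);
        for row in 0..m {
            for col in 0..n {
                // Partial dot product over one tile of the k dimension.
                let mut sum = 0.0f32;
                for i in kk..kend {
                    sum += a[row * k + i] * b[i * n + col];
                }
                c[row * n + col] += sum;
            }
        }
    }
}
```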
diff --git a/examples/cuda/gemm/kernels/src/lib.rs b/examples/cuda/gemm/kernels/src/lib.rs
new file mode 100644
index 00000000..19fab562
--- /dev/null
+++ b/examples/cuda/gemm/kernels/src/lib.rs
@@ -0,0 +1,5 @@
+mod gemm_naive;
+mod gemm_tiled;
+
+pub use crate::gemm_naive::gemm_naive;
+pub use crate::gemm_tiled::gemm_tiled;
diff --git a/examples/cuda/gemm/src/main.rs b/examples/cuda/gemm/src/main.rs
new file mode 100644
index 00000000..ff8708c2
--- /dev/null
+++ b/examples/cuda/gemm/src/main.rs
@@ -0,0 +1,466 @@
+//! Example demonstrating GEMM (General Matrix Multiply) on CUDA using Rust-CUDA.
+//!
+//! This example benchmarks naive and tiled GEMM kernels as well as cuBLAS for various matrix sizes.
+//! It uses the `cust` crate for CUDA management and `ndarray` for host-side matrix operations.
+
+use std::cell;
+use std::error::Error;
+
+use cust::event;
+use cust::launch;
+use cust::memory;
+use cust::memory::CopyDestination as _;
+use cust::module;
+use cust::stream;
+use cust::util::SliceExt as _;
+use ndarray::Array;
+use ndarray_rand::RandomExt as _;
+use ndarray_rand::rand_distr::Uniform;
+
+const EPS: f32 = 0.01;
+const NUM_WARMUPS: usize = 2;
+const NUM_RUNS: usize = 10;
+const MAT_SIZES: [usize; 8] = [32, 64, 128, 256, 512, 1024, 2048, 4096];
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+type GemmFn = dyn Fn(
+    &stream::Stream,
+    &module::Module,
+    &memory::DeviceBuffer<f32>,
+    &memory::DeviceBuffer<f32>,
+    &mut memory::DeviceBuffer<f32>,
+    usize,
+    usize,
+    usize,
+    f32,
+    f32,
+) -> Result<(), Box<dyn Error>>;
+
+fn main() -> Result<(), Box<dyn Error>> {
+    // Initialize CUDA; this will pick the first available device and will
+    // make a CUDA context from it.
+    // We don't need the context for anything but it must be kept alive.
+    let _ctx = cust::quick_init()?;
+
+    // Make the CUDA module; modules just house the GPU code for the kernels we created.
+    // They can be made from PTX code, cubins, or fatbins.
+    let module = module::Module::from_ptx(PTX, &[])?;
+
+    // Make a CUDA stream to issue calls to. You can think of this as an OS thread but for dispatching
+    // GPU calls.
+    let stream = stream::Stream::new(stream::StreamFlags::NON_BLOCKING, None)?;
+
+    run_cublas(&stream)?;
+    run_gemm_kernel(&stream, &module, &gemm_naive, "gemm_naive")?;
+    run_gemm_kernel(&stream, &module, &gemm_tiled, "gemm_tiled")?;
+
+    Ok(())
+}
+
+/// Runs the cuBLAS GEMM for a set of matrix sizes and checks correctness.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launches and memory operations.
+///
+/// This function benchmarks cuBLAS GEMM and checks the result for small matrices.
+fn run_cublas(stream: &stream::Stream) -> Result<(), Box<dyn Error>> {
+    // Make a cuBLAS context which manages the cuBLAS internal GPU memory allocations.
+    let mut cublas_ctx = blastoff::CublasContext::new()?;
+
+    // Sanity check.
+    {
+        let mat_a = ndarray::arr2(&[[1.0, 2.0], [3.0, 4.0]]);
+        let mat_b = ndarray::arr2(&[[5.0, 6.0], [7.0, 8.0]]);
+        let mat_c_expect = ndarray::arr2(&[[19.0, 22.0], [43.0, 50.0]]);
+        let (alpha, beta) = (1.0, 0.0);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = unsafe { cust::memory::DeviceBuffer::uninitialized(2 * 2)? };
+        let alpha_gpu = cust::memory::DeviceBox::new(&alpha)?;
+        let beta_gpu = cust::memory::DeviceBox::new(&beta)?;
+
+        // ndarray uses row-major order, but cuBLAS uses column-major order.
+        // In that case, C = A x B is equivalent to C^T = B^T x A^T.
+        cublas_ctx.gemm::<f32>(
+            stream,
+            2,
+            2,
+            2,
+            &alpha_gpu,
+            &mat_b_gpu,
+            2,
+            blastoff::MatrixOp::None,
+            &beta_gpu,
+            &mat_a_gpu,
+            2,
+            blastoff::MatrixOp::None,
+            &mut mat_c_gpu,
+            2,
+        )?;
+        stream.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((2, 2));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        assert!(mat_c_expect.relative_eq(&mat_c_actual, EPS, EPS));
+    }
+
+    for sz in MAT_SIZES.iter().cloned() {
+        let mat_a = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_b = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_c = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let (alpha, beta) = (1.0, 0.0);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = mat_c.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let alpha_gpu = cust::memory::DeviceBox::new(&alpha)?;
+        let beta_gpu = cust::memory::DeviceBox::new(&beta)?;
+        stream.synchronize()?;
+
+        // Warm up before timing.
+        for _ in 0..NUM_WARMUPS {
+            cublas_ctx.gemm::<f32>(
+                stream,
+                sz,
+                sz,
+                sz,
+                &alpha_gpu,
+                &mat_b_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &beta_gpu,
+                &mat_a_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &mut mat_c_gpu,
+                sz,
+            )?;
+        }
+        stream.synchronize()?;
+
+        // Time the kernel execution.
+        let beg = event::Event::new(event::EventFlags::DEFAULT)?;
+        let end = event::Event::new(event::EventFlags::DEFAULT)?;
+        beg.record(stream)?;
+        for _ in 0..NUM_RUNS {
+            cublas_ctx.gemm::<f32>(
+                stream,
+                sz,
+                sz,
+                sz,
+                &alpha_gpu,
+                &mat_b_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &beta_gpu,
+                &mat_a_gpu,
+                sz,
+                blastoff::MatrixOp::None,
+                &mut mat_c_gpu,
+                sz,
+            )?;
+            stream.synchronize()?;
+        }
+        end.record(stream)?;
+        beg.synchronize()?;
+        end.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((sz, sz));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        let duration = end.elapsed_time_f32(&beg)? / (NUM_RUNS as f32);
+        println!("cuBLAS {}x{}: {:.4}ms", sz, sz, duration);
+        if sz < 1024 {
+            assert_gemm_eq(&mat_a, &mat_b, &mat_c, alpha, beta, &mat_c_actual);
+        }
+    }
+
+    Ok(())
+}
+
+/// Runs a GEMM kernel (naive or tiled) for a set of matrix sizes and checks correctness.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launches and memory operations.
+/// * `module` - CUDA module containing the kernel.
+/// * `gemm_fn` - Function pointer to the GEMM kernel launcher.
+/// * `kernel_name` - Name of the kernel for logging.
+///
+/// This function benchmarks the provided GEMM kernel and checks the result for small matrices.
+fn run_gemm_kernel(
+    stream: &stream::Stream,
+    module: &module::Module,
+    gemm_fn: &GemmFn,
+    kernel_name: &str,
+) -> Result<(), Box<dyn Error>> {
+    // Sanity check.
+    {
+        let mat_a = ndarray::arr2::<f32, 2>(&[[1.0, 2.0], [3.0, 4.0]]);
+        let mat_b = ndarray::arr2::<f32, 2>(&[[5.0, 6.0], [7.0, 8.0]]);
+        let mat_c_expect = ndarray::arr2::<f32, 2>(&[[19.0, 22.0], [43.0, 50.0]]);
+        let (alpha, beta) = (1.0f32, 0.0f32);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = unsafe { cust::memory::DeviceBuffer::uninitialized(2 * 2)? };
+        stream.synchronize()?;
+
+        gemm_fn(
+            stream,
+            module,
+            &mat_a_gpu,
+            &mat_b_gpu,
+            &mut mat_c_gpu,
+            2,
+            2,
+            2,
+            alpha,
+            beta,
+        )?;
+        stream.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((2, 2));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        assert!(mat_c_expect.relative_eq(&mat_c_actual, EPS, EPS));
+    }
+
+    for sz in MAT_SIZES.iter().cloned() {
+        let mat_a = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_b = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let mat_c = ndarray::Array2::<f32>::random((sz, sz), Uniform::new(-10., 10.));
+        let (alpha, beta) = (1.0f32, 0.0f32);
+
+        let mat_a_gpu = mat_a.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mat_b_gpu = mat_b.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        let mut mat_c_gpu = mat_c.as_standard_layout().as_slice().unwrap().as_dbuf()?;
+        stream.synchronize()?;
+
+        // Warm up before timing.
+        for _ in 0..NUM_WARMUPS {
+            gemm_fn(
+                stream,
+                module,
+                &mat_a_gpu,
+                &mat_b_gpu,
+                &mut mat_c_gpu,
+                sz,
+                sz,
+                sz,
+                alpha,
+                beta,
+            )?;
+        }
+        stream.synchronize()?;
+
+        // Time the kernel execution.
+        let beg = event::Event::new(event::EventFlags::DEFAULT)?;
+        let end = event::Event::new(event::EventFlags::DEFAULT)?;
+        beg.record(stream)?;
+        for _ in 0..NUM_RUNS {
+            gemm_fn(
+                stream,
+                module,
+                &mat_a_gpu,
+                &mat_b_gpu,
+                &mut mat_c_gpu,
+                sz,
+                sz,
+                sz,
+                alpha,
+                beta,
+            )?;
+            stream.synchronize()?;
+        }
+        end.record(stream)?;
+        beg.synchronize()?;
+        end.synchronize()?;
+
+        let mut mat_c_actual = Array::<f32, _>::zeros((sz, sz));
+        mat_c_gpu.copy_to(&mut mat_c_actual.as_slice_mut().unwrap())?;
+        let duration = end.elapsed_time_f32(&beg)? / (NUM_RUNS as f32);
+        println!("{} {}x{}: {:.4}ms", kernel_name, sz, sz, duration);
+        if sz < 1024 {
+            assert_gemm_eq(&mat_a, &mat_b, &mat_c, alpha, beta, &mat_c_actual);
+        }
+    }
+    Ok(())
+}
+
+/// Asserts that the GEMM result matches the expected value within a tolerance.
+///
+/// # Arguments
+/// * `mat_a` - Left matrix operand.
+/// * `mat_b` - Right matrix operand.
+/// * `mat_c` - Initial value of the output matrix.
+/// * `alpha` - Scalar multiplier for mat_a * mat_b.
+/// * `beta` - Scalar multiplier for mat_c.
+/// * `mat_c_actual` - Result from the device.
+fn assert_gemm_eq<T>(
+    mat_a: &ndarray::Array2<T>,
+    mat_b: &ndarray::Array2<T>,
+    mat_c: &ndarray::Array2<T>,
+    alpha: T,
+    beta: T,
+    mat_c_actual: &ndarray::Array2<T>,
+) where
+    T: Clone,
+    f64: From<T>,
+{
+    let mat_a = mat_a.mapv(|v| Into::<f64>::into(v));
+    let mat_b = mat_b.mapv(|v| Into::<f64>::into(v));
+    let mat_c = mat_c.mapv(|v| Into::<f64>::into(v));
+    let mat_c_actual = mat_c_actual.mapv(|v| Into::<f64>::into(v));
+    let alpha: f64 = alpha.into();
+    let beta: f64 = beta.into();
+    let mat_c_expect = alpha * mat_a.dot(&mat_b) + beta * mat_c;
+    let ok = mat_c_expect.relative_eq(&mat_c_actual, EPS.into(), EPS.into());
+    if !ok {
+        println!("Actual: {:?}", mat_c_actual);
+        println!("Expect: {:?}", mat_c_expect);
+        panic!("GEMM result mismatch");
+    }
+}
+
+/// Launches the naive GEMM kernel on the device.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launch.
+/// * `module` - CUDA module containing the kernel.
+/// * `mat_a` - Device buffer for left matrix operand (m x k).
+/// * `mat_b` - Device buffer for right matrix operand (k x n).
+/// * `mat_c` - Device buffer for output matrix (m x n).
+/// * `m` - Number of rows in mat_a and mat_c.
+/// * `n` - Number of columns in mat_b and mat_c.
+/// * `k` - Number of columns in mat_a and rows in mat_b.
+/// * `alpha` - Scalar multiplier for mat_a * mat_b.
+/// * `beta` - Scalar multiplier for mat_c.
+///
+/// This function configures the launch parameters and invokes the naive GEMM kernel.
+#[allow(clippy::too_many_arguments)]
+pub fn gemm_naive(
+    stream: &stream::Stream,
+    module: &module::Module,
+    mat_a: &memory::DeviceBuffer<f32>,
+    mat_b: &memory::DeviceBuffer<f32>,
+    mat_c: &mut memory::DeviceBuffer<f32>,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) -> Result<(), Box<dyn Error>> {
+    assert_eq!(mat_a.len(), m * k);
+    assert_eq!(mat_b.len(), k * n);
+    assert_eq!(mat_c.len(), m * n);
+
+    let kernel_cell = cell::LazyCell::new(|| {
+        module
+            .get_function("gemm_naive")
+            .expect("kernel not found.")
+    });
+    let kernel = &*kernel_cell;
+
+    // Use the CUDA occupancy API to find an optimal launch configuration for the grid and block size.
+    // This will try to maximize how much of the GPU is used by finding the best launch configuration for the
+    // current CUDA device/architecture.
+    let (_, block_size) = kernel.suggested_launch_configuration(0, 0.into())?;
+    let block_size = block_size as usize;
+    let (block_size_x, block_size_y) = if block_size > m * n {
+        (block_size.div_ceil(m) as u32, m as u32)
+    } else {
+        (1, block_size as u32)
+    };
+    let (grid_size_x, grid_size_y) = (
+        (m as u32).div_ceil(block_size_x),
+        (n as u32).div_ceil(block_size_y),
+    );
+    unsafe {
+        launch!(
+            kernel<<<
+                (grid_size_x, grid_size_y),
+                (block_size_x, block_size_y),
+                0,
+                stream
+            >>>(
+                mat_a.as_device_ptr(),
+                mat_a.len(),
+                mat_b.as_device_ptr(),
+                mat_b.len(),
+                mat_c.as_device_ptr(),
+                m,
+                n,
+                k,
+                alpha,
+                beta,
+            )
+        )?;
+    };
+    Ok(())
+}
+
+/// Launches the tiled GEMM kernel on the device.
+///
+/// # Arguments
+/// * `stream` - CUDA stream to use for kernel launch.
+/// * `module` - CUDA module containing the kernel.
+/// * `mat_a` - Device buffer for left matrix operand (m x k).
+/// * `mat_b` - Device buffer for right matrix operand (k x n).
+/// * `mat_c` - Device buffer for output matrix (m x n).
+/// * `m` - Number of rows in mat_a and mat_c.
+/// * `n` - Number of columns in mat_b and mat_c.
+/// * `k` - Number of columns in mat_a and rows in mat_b.
+/// * `alpha` - Scalar multiplier for mat_a * mat_b.
+/// * `beta` - Scalar multiplier for mat_c.
+///
+/// This function configures the launch parameters and invokes the tiled GEMM kernel.
+#[allow(clippy::too_many_arguments)]
+pub fn gemm_tiled(
+    stream: &stream::Stream,
+    module: &module::Module,
+    mat_a: &memory::DeviceBuffer<f32>,
+    mat_b: &memory::DeviceBuffer<f32>,
+    mat_c: &mut memory::DeviceBuffer<f32>,
+    m: usize,
+    n: usize,
+    k: usize,
+    alpha: f32,
+    beta: f32,
+) -> Result<(), Box<dyn Error>> {
+    assert_eq!(mat_a.len(), m * k);
+    assert_eq!(mat_b.len(), k * n);
+    assert_eq!(mat_c.len(), m * n);
+
+    // These values must be aligned with the kernel code.
+    const TILE_SIZE: usize = 16;
+
+    let kernel_cell = cell::LazyCell::new(|| {
+        module
+            .get_function("gemm_tiled")
+            .expect("kernel not found.")
+    });
+    let kernel = &*kernel_cell;
+
+    let (grid_size_x, grid_size_y) = (n.div_ceil(TILE_SIZE) as u32, m.div_ceil(TILE_SIZE) as u32);
+    unsafe {
+        launch!(
+            kernel<<<
+                (grid_size_x, grid_size_y),
+                (TILE_SIZE as u32, TILE_SIZE as u32),
+                0,
+                stream
+            >>>(
+                mat_a.as_device_ptr(),
+                mat_a.len(),
+                mat_b.as_device_ptr(),
+                mat_b.len(),
+                mat_c.as_device_ptr(),
+                m,
+                n,
+                k,
+                alpha,
+                beta,
+            )
+        )?;
+    };
+    Ok(())
+}
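The operand order in the cuBLAS calls above is the row-major/column-major trick from the inline comment: reinterpreting a row-major buffer as column-major is an implicit transpose, so asking column-major cuBLAS for B^T * A^T leaves exactly A * B in the row-major output buffer, with both `MatrixOp`s left as `None`. The identity (B^T * A^T)^T = A * B is easy to confirm on the host with the same `ndarray` API the example uses (a standalone sketch, not part of this diff):

```rust
use ndarray::arr2;

fn main() {
    let a = arr2(&[[1.0_f64, 2.0], [3.0, 4.0]]);
    let b = arr2(&[[5.0_f64, 6.0], [7.0, 8.0]]);
    // (B^T * A^T)^T == A * B: what cuBLAS effectively computes when it is
    // handed the row-major buffers of B and A (in that order) as
    // column-major data.
    let via_transpose = b.t().dot(&a.t()).t().to_owned();
    assert_eq!(via_transpose, a.dot(&b));
}
```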
diff --git a/examples/cuda/gpu/add_gpu/Cargo.toml b/examples/cuda/gpu/add_gpu/Cargo.toml
deleted file mode 100644
index 48321561..00000000
--- a/examples/cuda/gpu/add_gpu/Cargo.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[package]
-name = "add_gpu"
-version = "0.1.0"
-edition = "2021"
-
-[dependencies]
-cuda_std = { version = "0.2", path = "../../../../crates/cuda_std" }
-
-[lib]
-crate-type = ["cdylib", "rlib"]
diff --git a/examples/cuda/cpu/path_tracer/Cargo.toml b/examples/cuda/path_tracer/Cargo.toml
similarity index 50%
rename from examples/cuda/cpu/path_tracer/Cargo.toml
rename to examples/cuda/path_tracer/Cargo.toml
index 763ba0e5..46619e01 100644
--- a/examples/cuda/cpu/path_tracer/Cargo.toml
+++ b/examples/cuda/path_tracer/Cargo.toml
@@ -1,16 +1,18 @@
 [package]
-name = "path_tracer"
+name = "path-tracer"
 version = "0.1.0"
 edition = "2018"
 
 [dependencies]
 vek = { version = "0.17.1", features = ["bytemuck", "mint"] }
 bytemuck = { version = "1.21", features = ["derive"] }
-cust = { version = "0.3", path = "../../../../crates/cust", features = ["impl_vek"] }
+cust = { version = "0.3", path = "../../../crates/cust", features = [
+    "impl_vek",
+] }
 image = "0.25.5"
-path_tracer_gpu = { path = "../../gpu/path_tracer_gpu" }
-gpu_rand = { version = "0.1", path = "../../../../crates/gpu_rand" }
-optix = { version = "0.1", path = "../../../../crates/optix" }
+path-tracer-kernels = { path = "kernels" }
+gpu_rand = { version = "0.1", path = "../../../crates/gpu_rand" }
+optix = { version = "0.1", path = "../../../crates/optix" }
 glium = "0.32.0"
 glutin = "0.28.0"
 imgui = "0.9.0"
@@ -21,4 +23,4 @@ sysinfo = "0.33.1"
 anyhow = "1.0.53"
 
 [build-dependencies]
-cuda_builder = { version = "0.3", path = "../../../../crates/cuda_builder" }
+cuda_builder = { version = "0.3", path = "../../../crates/cuda_builder" }
diff --git a/examples/cuda/path_tracer/build.rs b/examples/cuda/path_tracer/build.rs
new file mode 100644
index 00000000..da3f5a50
--- /dev/null
+++ b/examples/cuda/path_tracer/build.rs
@@ -0,0 +1,17 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    CudaBuilder::new("kernels")
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+    CudaBuilder::new("kernels")
+        .copy_to(out_path.join("kernels_optix.ptx"))
+        .build_args(&["--features", "optix"])
+        .build()
+        .unwrap();
+}
diff --git a/examples/cuda/gpu/path_tracer_gpu/Cargo.toml b/examples/cuda/path_tracer/kernels/Cargo.toml
similarity index 93%
rename from examples/cuda/gpu/path_tracer_gpu/Cargo.toml
rename to examples/cuda/path_tracer/kernels/Cargo.toml
index 02cbc45f..97a92bd2 100644
--- a/examples/cuda/gpu/path_tracer_gpu/Cargo.toml
+++ b/examples/cuda/path_tracer/kernels/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "path_tracer_gpu"
+name = "path-tracer-kernels"
 version = "0.1.0"
 edition = "2018"
 
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/hittable.rs b/examples/cuda/path_tracer/kernels/src/hittable.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/hittable.rs
rename to examples/cuda/path_tracer/kernels/src/hittable.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/lib.rs b/examples/cuda/path_tracer/kernels/src/lib.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/lib.rs
rename to examples/cuda/path_tracer/kernels/src/lib.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/material.rs b/examples/cuda/path_tracer/kernels/src/material.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/material.rs
rename to examples/cuda/path_tracer/kernels/src/material.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/math.rs b/examples/cuda/path_tracer/kernels/src/math.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/math.rs
rename to examples/cuda/path_tracer/kernels/src/math.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/optix.rs b/examples/cuda/path_tracer/kernels/src/optix.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/optix.rs
rename to examples/cuda/path_tracer/kernels/src/optix.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/render.rs b/examples/cuda/path_tracer/kernels/src/render.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/render.rs
rename to examples/cuda/path_tracer/kernels/src/render.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/render_kernels.rs b/examples/cuda/path_tracer/kernels/src/render_kernels.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/render_kernels.rs
rename to examples/cuda/path_tracer/kernels/src/render_kernels.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/scene.rs b/examples/cuda/path_tracer/kernels/src/scene.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/scene.rs
rename to examples/cuda/path_tracer/kernels/src/scene.rs
diff --git a/examples/cuda/gpu/path_tracer_gpu/src/sphere.rs b/examples/cuda/path_tracer/kernels/src/sphere.rs
similarity index 100%
rename from examples/cuda/gpu/path_tracer_gpu/src/sphere.rs
rename to examples/cuda/path_tracer/kernels/src/sphere.rs
diff --git a/examples/cuda/cpu/path_tracer/shaders/image.frag b/examples/cuda/path_tracer/shaders/image.frag
similarity index 100%
rename from examples/cuda/cpu/path_tracer/shaders/image.frag
rename to examples/cuda/path_tracer/shaders/image.frag
diff --git a/examples/cuda/cpu/path_tracer/shaders/image.vert b/examples/cuda/path_tracer/shaders/image.vert
similarity index 100%
rename from examples/cuda/cpu/path_tracer/shaders/image.vert
rename to examples/cuda/path_tracer/shaders/image.vert
diff --git a/examples/cuda/cpu/path_tracer/src/common.rs b/examples/cuda/path_tracer/src/common.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/common.rs
rename to examples/cuda/path_tracer/src/common.rs
index bfe69df0..969ff946 100644
--- a/examples/cuda/cpu/path_tracer/src/common.rs
+++ b/examples/cuda/path_tracer/src/common.rs
@@ -1,7 +1,7 @@
 use glium::glutin::event::{
     ElementState, Event, MouseButton, MouseScrollDelta, VirtualKeyCode, WindowEvent,
 };
-use path_tracer_gpu::Viewport;
+use path_tracer_kernels::Viewport;
 use vek::{Vec2, Vec3};
 
 #[derive(Debug, Clone, Copy, PartialEq)]
diff --git a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs b/examples/cuda/path_tracer/src/cpu/mod.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/cpu/mod.rs
rename to examples/cuda/path_tracer/src/cpu/mod.rs
index 217bde5e..3fad1581 100644
--- a/examples/cuda/cpu/path_tracer/src/cpu/mod.rs
+++ b/examples/cuda/path_tracer/src/cpu/mod.rs
@@ -2,7 +2,7 @@ use std::time::Duration;
 
 use gpu_rand::{DefaultRand, GpuRand};
 use imgui::Ui;
-use path_tracer_gpu::{
+use path_tracer_kernels::{
     material::MaterialKind, render::generate_ray, scene::Scene, Object, Viewport,
 };
 use rayon::prelude::*;
diff --git a/examples/cuda/cpu/path_tracer/src/cuda/data.rs b/examples/cuda/path_tracer/src/cuda/data.rs
similarity index 98%
rename from examples/cuda/cpu/path_tracer/src/cuda/data.rs
rename to examples/cuda/path_tracer/src/cuda/data.rs
index 9e71cef6..d7f2d224 100644
--- a/examples/cuda/cpu/path_tracer/src/cuda/data.rs
+++ b/examples/cuda/path_tracer/src/cuda/data.rs
@@ -6,7 +6,7 @@ use cust::{
     util::SliceExt,
 };
 use gpu_rand::DefaultRand;
-use path_tracer_gpu::{material::MaterialKind, scene::Scene, Object, Viewport};
+use path_tracer_kernels::{material::MaterialKind, scene::Scene, Object, Viewport};
 use vek::{Vec2, Vec3};
 
 use super::SEED;
diff --git a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs b/examples/cuda/path_tracer/src/cuda/mod.rs
similarity index 98%
rename from examples/cuda/cpu/path_tracer/src/cuda/mod.rs
rename to examples/cuda/path_tracer/src/cuda/mod.rs
index 7d92c645..f737e6e8 100644
--- a/examples/cuda/cpu/path_tracer/src/cuda/mod.rs
+++ b/examples/cuda/path_tracer/src/cuda/mod.rs
@@ -18,7 +18,7 @@ use optix::{
     context::DeviceContext,
     denoiser::{Denoiser, DenoiserModelKind, Image, ImageFormat},
 };
-use path_tracer_gpu::scene::Scene;
+use path_tracer_kernels::scene::Scene;
 use vek::{Vec2, Vec3};
 
 /// Seed for the random states
@@ -29,7 +29,7 @@ pub const SEED: u64 = 932174513921034;
 /// This should always be a multiple of warp size (32) to maximize occupancy.
 const THREAD_BLOCK_AXIS_LENGTH: usize = 16;
 
-pub(crate) static PTX: &str = include_str!("../../../../resources/path_tracer.ptx");
+pub(crate) static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
 
 pub struct CudaRenderer {
     stream: Stream,
diff --git a/examples/cuda/cpu/path_tracer/src/main.rs b/examples/cuda/path_tracer/src/main.rs
similarity index 98%
rename from examples/cuda/cpu/path_tracer/src/main.rs
rename to examples/cuda/path_tracer/src/main.rs
index 4311c172..d4ff4470 100644
--- a/examples/cuda/cpu/path_tracer/src/main.rs
+++ b/examples/cuda/path_tracer/src/main.rs
@@ -6,7 +6,7 @@ pub mod renderer;
 pub mod viewer;
 
 use common::Camera;
-use path_tracer_gpu::{
+use path_tracer_kernels::{
     material::{DielectricMaterial, DiffuseMaterial, MaterialKind, MetallicMaterial},
     scene::Scene,
     sphere::Sphere,
diff --git a/examples/cuda/cpu/path_tracer/src/optix/mod.rs b/examples/cuda/path_tracer/src/optix/mod.rs
similarity index 97%
rename from examples/cuda/cpu/path_tracer/src/optix/mod.rs
rename to examples/cuda/path_tracer/src/optix/mod.rs
index bf8bdc09..57191234 100644
--- a/examples/cuda/cpu/path_tracer/src/optix/mod.rs
+++ b/examples/cuda/path_tracer/src/optix/mod.rs
@@ -17,13 +17,13 @@ use optix::{
     },
     shader_binding_table::{SbtRecord, ShaderBindingTable},
 };
-use path_tracer_gpu::{optix::LaunchParams, scene::Scene, sphere::Sphere, Object};
+use path_tracer_kernels::{optix::LaunchParams, scene::Scene, sphere::Sphere, Object};
 
 pub type RaygenRecord = SbtRecord;
 pub type MissRecord = SbtRecord;
 pub type SphereHitgroupRecord = SbtRecord;
 
-pub(crate) static OPTIX_PTX: &str = include_str!("../../../../resources/path_tracer_optix.ptx");
+pub(crate) static OPTIX_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels_optix.ptx"));
 
 /// A subset of the CUDA renderer that uses hardware raytracing with OptiX
 pub struct OptixRenderer {
diff --git a/examples/cuda/cpu/path_tracer/src/renderer.rs b/examples/cuda/path_tracer/src/renderer.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/renderer.rs
rename to examples/cuda/path_tracer/src/renderer.rs
index 99e368ae..9bda19d0 100644
--- a/examples/cuda/cpu/path_tracer/src/renderer.rs
+++ b/examples/cuda/path_tracer/src/renderer.rs
@@ -1,6 +1,6 @@
 use glium::glutin::{event::Event, event_loop::ControlFlow};
 use imgui::Ui;
-use path_tracer_gpu::scene::Scene;
+use path_tracer_kernels::scene::Scene;
 use sysinfo::System;
 use vek::Vec2;
 
diff --git a/examples/cuda/cpu/path_tracer/src/viewer.rs b/examples/cuda/path_tracer/src/viewer.rs
similarity index 99%
rename from examples/cuda/cpu/path_tracer/src/viewer.rs
rename to examples/cuda/path_tracer/src/viewer.rs
index 9f040987..3237f210 100644
--- a/examples/cuda/cpu/path_tracer/src/viewer.rs
+++ b/examples/cuda/path_tracer/src/viewer.rs
@@ -14,7 +14,7 @@ use glium::{
 };
 use imgui::Condition;
 use imgui_winit_support::{HiDpiMode, WinitPlatform};
-use path_tracer_gpu::scene::Scene;
+use path_tracer_kernels::scene::Scene;
 use std::time::Instant;
 use vek::Vec2;
 
diff --git a/examples/cuda/resources/.gitignore b/examples/cuda/resources/.gitignore
deleted file mode 100644
index baabe46d..00000000
--- a/examples/cuda/resources/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.ptx
\ No newline at end of file
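With every example now compiling its own kernels crate at build time, the shared `resources/` directory of pre-copied PTX disappears, and the `.gitignore` that hid those artifacts goes with it. The replacement is the build-script pattern used by gemm, the path tracer, and vecadd above; condensed to its host-side half (a sketch, names matching the diff):

```rust
// The build script drops kernels.ptx into OUT_DIR (see the build.rs files
// above); the host crate then embeds that PTX at compile time and JIT-loads
// it at startup, so no PTX files need to ship alongside the executable.
static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

fn load_module() -> cust::error::CudaResult<cust::module::Module> {
    cust::module::Module::from_ptx(PTX, &[])
}
```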
= "../../../crates/cust" } +nanorand = "0.7" + +[build-dependencies] +cuda_builder = { path = "../../../crates/cuda_builder" } diff --git a/examples/cuda/vecadd/build.rs b/examples/cuda/vecadd/build.rs new file mode 100644 index 00000000..2046e342 --- /dev/null +++ b/examples/cuda/vecadd/build.rs @@ -0,0 +1,15 @@ +use std::env; +use std::path; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo:rerun-if-changed=build.rs"); + println!("cargo:rerun-if-changed=kernels"); + + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + CudaBuilder::new("kernels") + .copy_to(out_path.join("kernels.ptx")) + .build() + .unwrap(); +} diff --git a/examples/cuda/vecadd/kernels/Cargo.toml b/examples/cuda/vecadd/kernels/Cargo.toml new file mode 100644 index 00000000..cb84009d --- /dev/null +++ b/examples/cuda/vecadd/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "vecadd-kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/examples/cuda/gpu/add_gpu/src/lib.rs b/examples/cuda/vecadd/kernels/src/lib.rs similarity index 66% rename from examples/cuda/gpu/add_gpu/src/lib.rs rename to examples/cuda/vecadd/kernels/src/lib.rs index 42f14817..ba1c3038 100644 --- a/examples/cuda/gpu/add_gpu/src/lib.rs +++ b/examples/cuda/vecadd/kernels/src/lib.rs @@ -2,10 +2,10 @@ use cuda_std::prelude::*; #[kernel] #[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] -pub unsafe fn add(a: &[f32], b: &[f32], c: *mut f32) { +pub unsafe fn vecadd(a: &[f32], b: &[f32], c: *mut f32) { let idx = thread::index_1d() as usize; if idx < a.len() { - let elem = &mut *c.add(idx); + let elem = unsafe { &mut *c.add(idx) }; *elem = a[idx] + b[idx]; } } diff --git a/examples/cuda/cpu/add/src/main.rs b/examples/cuda/vecadd/src/main.rs similarity index 87% rename from examples/cuda/cpu/add/src/main.rs rename to examples/cuda/vecadd/src/main.rs index 69c34b4c..c6263f02 100644 --- a/examples/cuda/cpu/add/src/main.rs +++ b/examples/cuda/vecadd/src/main.rs @@ -5,7 +5,7 @@ use std::error::Error; /// How many numbers to generate and add together. const NUMBERS_LEN: usize = 100_000; -static PTX: &str = include_str!("../../../resources/add.ptx"); +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); fn main() -> Result<(), Box> { // generate our random vectors. @@ -37,13 +37,13 @@ fn main() -> Result<(), Box> { let mut out = vec![0.0f32; NUMBERS_LEN]; let out_buf = out.as_slice().as_dbuf()?; - // retrieve the add kernel from the module so we can calculate the right launch config. - let func = module.get_function("add")?; + // retrieve the `vecadd` kernel from the module so we can calculate the right launch config. + let vecadd = module.get_function("vecadd")?; // use the CUDA occupancy API to find an optimal launch configuration for the grid and block size. // This will try to maximize how much of the GPU is used by finding the best launch configuration for the // current CUDA device/architecture. - let (_, block_size) = func.suggested_launch_configuration(0, 0.into())?; + let (_, block_size) = vecadd.suggested_launch_configuration(0, 0.into())?; let grid_size = (NUMBERS_LEN as u32).div_ceil(block_size); @@ -57,7 +57,7 @@ fn main() -> Result<(), Box> { unsafe { launch!( // slices are passed as two parameters, the pointer and the length. - func<<>>( + vecadd<<>>( lhs_gpu.as_device_ptr(), lhs_gpu.len(), rhs_gpu.as_device_ptr(),