Skip to content

chore(examples): restructure CUDA examples and add a GEMM example #200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,18 @@ jobs:
- name: Clippy
env:
RUSTFLAGS: -Dwarnings
run: cargo clippy --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
run: cargo clippy --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"

- name: Build all bindings
run: cargo build --all-features -p cust_raw

- name: Build workspace
run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"
run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*"

- name: Check documentation
env:
RUSTDOCFLAGS: -Dwarnings
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"

- name: Prepare artifact details
id: artifact_details
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
run: cargo build --all-features -p cust_raw

- name: Build
run: cargo build --workspace --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*"
run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*"

# Don't currently test because many tests rely on the system having a CUDA GPU
# - name: Test
Expand All @@ -75,4 +75,4 @@ jobs:
- name: Check documentation
env:
RUSTDOCFLAGS: -Dwarnings
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path_tracer" --exclude "denoiser" --exclude "add" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "cust_raw"
9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,14 @@ members = [

"xtask",

"examples/cuda/vecadd",
"examples/cuda/vecadd/kernels",
"examples/cuda/gemm",
"examples/cuda/gemm/kernels",
"examples/cuda/path_tracer",
"examples/cuda/path_tracer/kernels",

"examples/optix/*",
"examples/cuda/cpu/*",
"examples/cuda/gpu/*",
]

exclude = [
Expand Down
22 changes: 0 additions & 22 deletions examples/cuda/cpu/add/Cargo.toml

This file was deleted.

8 changes: 0 additions & 8 deletions examples/cuda/cpu/add/build.rs

This file was deleted.

14 changes: 0 additions & 14 deletions examples/cuda/cpu/path_tracer/build.rs

This file was deleted.

16 changes: 16 additions & 0 deletions examples/cuda/gemm/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[package]
name = "gemm"
version = "0.1.0"
edition = "2024"

[dependencies]
blastoff = { path = "../../../crates/blastoff" }
cuda_std = { path = "../../../crates/cuda_std" }
cust = { path = "../../../crates/cust" }
cust_raw = { path = "../../../crates/cust_raw", features = ["driver"] }
ndarray = { version = "0.16", features = ["approx"] }
ndarray-rand = "0.15.0"
rand = "0.9"

[build-dependencies]
cuda_builder = { path = "../../../crates/cuda_builder" }
15 changes: 15 additions & 0 deletions examples/cuda/gemm/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
    // Re-run this build script whenever it or the kernel sources change.
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rerun-if-changed=kernels");

    // Compile the `kernels` crate to PTX and copy the result into OUT_DIR,
    // where the host crate can embed it at compile time.
    let ptx_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()).join("kernels.ptx");
    CudaBuilder::new("kernels")
        .copy_to(ptx_path)
        .build()
        .unwrap();
}
11 changes: 11 additions & 0 deletions examples/cuda/gemm/kernels/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "gemm-kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }
glam = { version = "0.30.1", default-features = false, features = ["cuda", "nostd-libm"] }

[lib]
crate-type = ["cdylib", "rlib"]
46 changes: 46 additions & 0 deletions examples/cuda/gemm/kernels/src/gemm_naive.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use cuda_std::kernel;
use cuda_std::thread;

/// Naive GEMM kernel for C = alpha * A * B + beta * C.
///
/// This kernel computes each element of the output matrix C independently, without any memory coalescing or tiling optimizations.
///
/// # Safety
/// CUDA kernel requires unsafe. `mat_c` must be valid for reads and writes of
/// `m * n` contiguous `f32` elements; the launch configuration must ensure no
/// two threads map to the same `(row, col)` pair.
///
/// # Parameters
/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
/// - `m`: Number of rows in A and C.
/// - `n`: Number of columns in B and C.
/// - `k`: Number of columns in A and rows in B.
/// - `alpha`: Scalar multiplier for A * B.
/// - `beta`: Scalar multiplier for C.
///
/// # Thread Mapping
/// Each thread computes one element of C at (row, col).
#[kernel]
#[allow(improper_ctypes_definitions)]
pub unsafe fn gemm_naive(
    mat_a: &[f32],
    mat_b: &[f32],
    mat_c: *mut f32,
    m: usize,
    n: usize,
    k: usize,
    alpha: f32,
    beta: f32,
) {
    let row = (thread::block_dim_x() * thread::block_idx_x() + thread::thread_idx_x()) as usize;
    let col = (thread::block_dim_y() * thread::block_idx_y() + thread::thread_idx_y()) as usize;

    // Guard against threads outside the matrix bounds (grid may overshoot).
    if row < m && col < n {
        // Dot product of row `row` of A with column `col` of B.
        let mut sum = 0.0f32;
        for i in 0..k {
            sum += mat_a[row * k + i] * mat_b[i * n + col];
        }
        // SAFETY: row < m and col < n, so `row * n + col` is within the
        // m * n element buffer the caller guarantees `mat_c` points to,
        // and the one-thread-per-element mapping prevents aliased writes.
        let elem = unsafe { &mut *mat_c.add(row * n + col) };
        *elem = alpha * sum + beta * *elem;
    }
}
83 changes: 83 additions & 0 deletions examples/cuda/gemm/kernels/src/gemm_tiled.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use cuda_std::address_space;
use cuda_std::kernel;
use cuda_std::thread;

#[kernel]
#[allow(improper_ctypes_definitions)]
/// Tiled GEMM kernel for C = alpha * A * B + beta * C.
///
/// This kernel uses shared memory tiling to improve memory access patterns and performance.
///
/// # Safety
/// CUDA kernel requires unsafe.
///
/// # Parameters
/// - `mat_a`: Input matrix A, shape (m x k), row-major order.
/// - `mat_b`: Input matrix B, shape (k x n), row-major order.
/// - `mat_c`: Output matrix C, shape (m x n), row-major order. Must be valid for writes.
/// - `m`: Number of rows in A and C.
/// - `n`: Number of columns in B and C.
/// - `k`: Number of columns in A and rows in B.
/// - `alpha`: Scalar multiplier for A * B.
/// - `beta`: Scalar multiplier for C.
///
/// # Tiling
/// Each block computes a TILE_SIZE x TILE_SIZE tile of C using shared memory for A and B tiles.
/// Threads within a block collaboratively load tiles and compute partial sums.
///
/// # Thread Mapping
/// Each thread computes one element of the output tile.
pub unsafe fn gemm_tiled(
    mat_a: &[f32],
    mat_b: &[f32],
    mat_c: *mut f32,
    m: usize,
    n: usize,
    k: usize,
    alpha: f32,
    beta: f32,
) {
    // Tile edge length; each block handles a TILE_SIZE x TILE_SIZE output tile,
    // so the launch is expected to use TILE_SIZE x TILE_SIZE threads per block.
    const TILE_SIZE: usize = 16;

    // Per-block staging buffers for the current A and B tiles.
    // `address_space(shared)` places these in CUDA shared memory
    // (presumably one instance per block — cuda_std attribute semantics;
    // verify against cuda_std docs).
    #[address_space(shared)]
    static mut TILE_A: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];
    #[address_space(shared)]
    static mut TILE_B: [f32; TILE_SIZE * TILE_SIZE] = [0.; TILE_SIZE * TILE_SIZE];

    // Thread indices within the block.
    let tx = thread::thread_idx_x() as usize;
    let ty = thread::thread_idx_y() as usize;

    // Calculate row and column in the mat_c.
    let row = thread::block_idx_x() as usize * TILE_SIZE + ty;
    let col = thread::block_idx_y() as usize * TILE_SIZE + tx;

    // Accumulator for this thread's C element across all k-tiles.
    let mut sum = 0.0f32;
    // Loop over tiles of mat_a and mat_b in the k dimension.
    // NOTE: every thread in the block must execute the same number of
    // iterations, since each iteration contains block-wide barriers.
    for kk in (0..k).step_by(TILE_SIZE) {
        // Collaborative loading of tiles into shared memory.
        // Out-of-range elements are zero-filled so the inner product below
        // can run over the full tile without bounds checks.
        if row < m && (kk + tx) < k {
            // SAFETY: each thread writes a distinct (ty, tx) slot of the
            // shared tile; a barrier follows before any thread reads it.
            unsafe { TILE_A[ty * TILE_SIZE + tx] = mat_a[row * k + (kk + tx)] };
        } else {
            unsafe { TILE_A[ty * TILE_SIZE + tx] = 0.0f32 };
        }
        if col < n && (kk + ty) < k {
            // SAFETY: same disjoint-slot argument as TILE_A above.
            unsafe { TILE_B[ty * TILE_SIZE + tx] = mat_b[(kk + ty) * n + col] };
        } else {
            unsafe { TILE_B[ty * TILE_SIZE + tx] = 0.0f32 };
        }
        // Barrier: all loads must complete before any thread reads the tiles.
        thread::sync_threads();

        // Perform the computation on the tile.
        for i in 0..TILE_SIZE {
            // SAFETY: read-only access after the barrier; no thread writes
            // the tiles until the next barrier below.
            sum += unsafe { TILE_A[ty * TILE_SIZE + i] * TILE_B[i * TILE_SIZE + tx] };
        }
        // Barrier: all reads must finish before the next iteration overwrites.
        thread::sync_threads();
    }

    // Write the result back to mat_c with alpha and beta scaling.
    if row < m && col < n {
        // SAFETY: row < m and col < n bound the offset within the m * n
        // buffer; each (row, col) is owned by exactly one thread.
        let c = unsafe { mat_c.add(row * n + col) };
        unsafe { *c = alpha * sum + beta * *c };
    }
}
5 changes: 5 additions & 0 deletions examples/cuda/gemm/kernels/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Kernel entry points for the GEMM example.
mod gemm_naive;
mod gemm_tiled;

pub use gemm_naive::gemm_naive;
pub use gemm_tiled::gemm_tiled;
Loading
Loading