Add gpu performance exercises/demos
cscjlan committed Jul 1, 2024
1 parent b7ca383 commit 90b8e49
Showing 10 changed files with 223 additions and 54 deletions.
19 changes: 19 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/README.md
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the environment with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui`
7. Open Firefox
8. Go to the address `localhost:8050`
9. Analyze the results (steps 4–6 are sketched below)
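
A terminal session for steps 4–6 might look roughly like this (a sketch; the clone location is an assumption, so adjust the `cd` path to wherever this directory lives):

```bash
# Assumption: the repository is cloned under $HOME; adjust to your checkout.
cd ~/summerschool/application-performance/demos/omniperf/01_three_kernels

# Load the modules and put omniperf on PATH
. ../sourceme.sh

# Serve the collected profile as a web GUI (port 8050 by default)
omniperf analyze -p workloads/01_three_kernels/mi200/ --gui
```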
77 changes: 77 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/main.cpp
@@ -0,0 +1,77 @@
#include <cstddef>
#include <hip/hip_runtime.h>
#include <math.h>

// Compute-heavy kernel: one sin and one cos per element, written as a
// grid-stride loop so any grid size covers all n elements.
__global__ void kernel1(size_t n, float *x, float *y) {
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t stride = gridDim.x * blockDim.x;

    for (; tid < n; tid += stride) {
        x[tid] = 0.666f * sin(tid);
        y[tid] = 1.337f * cos(tid);
    }
}

// Memory-bound kernel: a single fused multiply-add per element (axpy).
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) {
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t stride = gridDim.x * blockDim.x;

    for (; tid < n; tid += stride) {
        r[tid] = a * x[tid] + y[tid];
    }
}

// Higher arithmetic intensity: a degree-four polynomial in x and y,
// evaluated from the same two loads per element as kernel2.
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) {
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t stride = gridDim.x * blockDim.x;

    for (; tid < n; tid += stride) {
        const float x1 = x[tid];
        const float x2 = x1 * x1;
        const float x3 = x1 * x2;
        const float x4 = x2 * x2;

        const float y1 = y[tid];
        const float y2 = y1 * y1;
        const float y3 = y1 * y2;
        const float y4 = y2 * y2;
        // clang-format off
        r[tid] =
            1.0f * a * x1
          - 2.0f * a * x2
          + 3.0f * a * x3
          - 4.0f * a * x4
          + 4.0f * a * y1
          - 3.0f * a * y2
          + 2.0f * a * y3
          - 1.0f * a * y4;
        // clang-format on
    }
}

// Allocate device memory; the error code is deliberately ignored to keep
// the demo short.
void *gpu_allocate(size_t bytes) {
    void *p = nullptr;
    [[maybe_unused]] const auto result = hipMalloc(&p, bytes);
    return p;
}

int main() {
    constexpr size_t n = 1 << 30;
    constexpr size_t num_bytes = sizeof(float) * n;
    constexpr float a = 3.4f;

    float *const x = static_cast<float *>(gpu_allocate(num_bytes));
    float *const y = static_cast<float *>(gpu_allocate(num_bytes));
    float *const r = static_cast<float *>(gpu_allocate(num_bytes));

    constexpr dim3 blocks(1024);
    constexpr dim3 threads(1024);
    kernel1<<<blocks, threads, 0, 0>>>(n, x, y);
    kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
    kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
    [[maybe_unused]] auto t = hipDeviceSynchronize();

    hipFree(x);
    hipFree(y);
    hipFree(r);
}
@@ -0,0 +1,23 @@
#!/bin/bash -l

#SBATCH --account=project_465001194
#SBATCH --job-name=01_three_kernels
#SBATCH --output=01_three_kernels.out%j
#SBATCH --error=01_three_kernels.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --mem=10G
#SBATCH --time=00:30:00

ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

srun omniperf profile -n 01_three_kernels -- ./a.out
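
A typical submit-and-inspect cycle, as a sketch (the job ID is made up; `%j` in the `--output` pattern expands to the real one):

```bash
sbatch profile.sbatch             # prints e.g. "Submitted batch job 1234567"
squeue --me                       # watch the job until it completes
less 01_three_kernels.out1234567  # stdout of the profiling run
```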
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the environment with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/02_row_col/mi200/ --gui`
7. Open Firefox
8. Go to address `localhost:8050`
9. Analyze the results
@@ -0,0 +1,8 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml rocm/5.4.6
ml PrgEnv-cray/8.4.0

CC -xhip -pg -O2 main.cpp
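
Here `CC` is the Cray compiler wrapper, `-xhip` makes it treat the source as HIP, and `-pg` adds gprof-style instrumentation. On a system without the Cray wrappers, a roughly equivalent build might look like the sketch below (the plain ROCm toolchain and the MI250X arch flag are assumptions, not part of this repo):

```bash
# Assumption: plain ROCm toolchain instead of the Cray CC wrapper.
hipcc --offload-arch=gfx90a -O2 main.cpp -o a.out
```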
@@ -0,0 +1,64 @@
#include <cstddef>
#include <hip/hip_runtime.h>

// Fill a and b with known values. Each block covers blockDim.x = 64
// columns of blockDim.y = 16 rows at a time.
__global__ void init(size_t num_rows, float *a, float *b, float *c) {
    const size_t col = threadIdx.x;
    size_t row = threadIdx.y + blockIdx.x * blockDim.y;
    const size_t row_stride = gridDim.x * blockDim.y;

    for (; row < num_rows; row += row_stride) {
        const size_t i = col + row * blockDim.x;
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
}

// Row-major indexing: consecutive threads (consecutive threadIdx.x)
// touch consecutive addresses, so the accesses coalesce.
__global__ void row_major(size_t num_rows, float *a, float *b, float *c) {
    const size_t col = threadIdx.x;
    size_t row = threadIdx.y + blockIdx.x * blockDim.y;
    const size_t row_stride = gridDim.x * blockDim.y;

    for (; row < num_rows; row += row_stride) {
        const size_t i = col + row * blockDim.x;
        c[i] = a[i] + b[i];
    }
}

// Column-major indexing: consecutive threads are num_rows elements
// apart, so each wavefront scatters across many cache lines.
__global__ void col_major(size_t num_rows, float *a, float *b, float *c) {
    const size_t col = threadIdx.x;
    size_t row = threadIdx.y + blockIdx.x * blockDim.y;
    const size_t row_stride = gridDim.x * blockDim.y;

    for (; row < num_rows; row += row_stride) {
        const size_t i = row + col * num_rows;
        c[i] = a[i] + b[i];
    }
}

// Allocate device memory; the error code is deliberately ignored to keep
// the demo short.
void *gpu_allocate(size_t bytes) {
    void *p = nullptr;
    [[maybe_unused]] const auto result = hipMalloc(&p, bytes);
    return p;
}

int main() {
    constexpr size_t num_rows = 1 << 24;
    constexpr size_t num_cols = 64;
    constexpr size_t n = num_rows * num_cols;
    constexpr size_t num_bytes = sizeof(float) * n;

    float *const a = static_cast<float *>(gpu_allocate(num_bytes));
    float *const b = static_cast<float *>(gpu_allocate(num_bytes));
    float *const c = static_cast<float *>(gpu_allocate(num_bytes));

    constexpr dim3 blocks(1024);
    constexpr dim3 threads(64, 16); // 64 columns wide, 16 rows deep

    init<<<blocks, threads, 0, 0>>>(num_rows, a, b, c);
    row_major<<<blocks, threads, 0, 0>>>(num_rows, a, b, c);
    col_major<<<blocks, threads, 0, 0>>>(num_rows, a, b, c);

    [[maybe_unused]] auto t = hipDeviceSynchronize();

    hipFree(a);
    hipFree(b);
    hipFree(c);
}
@@ -1,9 +1,9 @@
#!/bin/bash -l

#SBATCH --account=project_465001194
-#SBATCH --job-name=cpu_gpu_sync
-#SBATCH --output=cpu_gpu_sync.out%j
-#SBATCH --error=cpu_gpu_sync.err%j
+#SBATCH --job-name=02_row_col
+#SBATCH --output=02_row_col.out%j
+#SBATCH --error=02_row_col.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
@@ -20,4 +20,4 @@ ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

-srun omniperf profile -n cpu_gpu_sync -- ./a.out
+srun omniperf profile -n 02_row_col -- ./a.out

This file was deleted.

9 changes: 9 additions & 0 deletions application-performance/demos/omniperf/sourceme.sh
@@ -0,0 +1,9 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH
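
Note that this file is meant to be sourced (`. ../sourceme.sh` in the READMEs), not executed, so the module and `PATH` changes persist in the calling shell. A quick check:

```bash
. ./sourceme.sh   # or: source sourceme.sh
which omniperf    # should now point at the project installation
```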
