Add gpu performance exercises/demos
cscjlan committed Jul 1, 2024
1 parent b7ca383 commit 90b8e49
Showing 10 changed files with 223 additions and 54 deletions.
19 changes: 19 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/README.md
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the environment with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui`
7. Open Firefox
8. Go to the address `localhost:8050`
9. Analyze the results (steps 4–6 are sketched below)
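
A terminal session for steps 4–6 might look roughly like this (a sketch; the clone location is an assumption, so adjust the `cd` path to wherever this directory lives):

```bash
# Assumption: the repository is cloned under $HOME; adjust to your checkout.
cd ~/summerschool/application-performance/demos/omniperf/01_three_kernels

# Load the modules and put omniperf on PATH
. ../sourceme.sh

# Serve the collected profile as a web GUI (port 8050 by default)
omniperf analyze -p workloads/01_three_kernels/mi200/ --gui
```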
77 changes: 77 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/main.cpp
@@ -0,0 +1,77 @@
#include <cstddef>
#include <hip/hip_runtime.h>
#include <math.h>

// Compute-heavy kernel: one sin and one cos per element, written as a
// grid-stride loop so any grid size covers all n elements.
__global__ void kernel1(size_t n, float *x, float *y) {
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t stride = gridDim.x * blockDim.x;

    for (; tid < n; tid += stride) {
        x[tid] = 0.666f * sin(tid);
        y[tid] = 1.337f * cos(tid);
    }
}

// Memory-bound kernel: a single fused multiply-add per element (axpy).
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) {
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t stride = gridDim.x * blockDim.x;

    for (; tid < n; tid += stride) {
        r[tid] = a * x[tid] + y[tid];
    }
}

// Higher arithmetic intensity: a degree-four polynomial in x and y,
// evaluated from the same two loads per element as kernel2.
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) {
    size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t stride = gridDim.x * blockDim.x;

    for (; tid < n; tid += stride) {
        const float x1 = x[tid];
        const float x2 = x1 * x1;
        const float x3 = x1 * x2;
        const float x4 = x2 * x2;

        const float y1 = y[tid];
        const float y2 = y1 * y1;
        const float y3 = y1 * y2;
        const float y4 = y2 * y2;
        // clang-format off
        r[tid] =
            1.0f * a * x1
          - 2.0f * a * x2
          + 3.0f * a * x3
          - 4.0f * a * x4
          + 4.0f * a * y1
          - 3.0f * a * y2
          + 2.0f * a * y3
          - 1.0f * a * y4;
        // clang-format on
    }
}

// Allocate device memory; the error code is deliberately ignored to keep
// the demo short.
void *gpu_allocate(size_t bytes) {
    void *p = nullptr;
    [[maybe_unused]] const auto result = hipMalloc(&p, bytes);
    return p;
}

int main() {
    constexpr size_t n = 1 << 30;
    constexpr size_t num_bytes = sizeof(float) * n;
    constexpr float a = 3.4f;

    float *const x = static_cast<float *>(gpu_allocate(num_bytes));
    float *const y = static_cast<float *>(gpu_allocate(num_bytes));
    float *const r = static_cast<float *>(gpu_allocate(num_bytes));

    constexpr dim3 blocks(1024);
    constexpr dim3 threads(1024);
    kernel1<<<blocks, threads, 0, 0>>>(n, x, y);
    kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
    kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
    [[maybe_unused]] auto t = hipDeviceSynchronize();

    hipFree(x);
    hipFree(y);
    hipFree(r);
}
@@ -0,0 +1,23 @@
#!/bin/bash -l

#SBATCH --account=project_465001194
#SBATCH --job-name=01_three_kernels
#SBATCH --output=01_three_kernels.out%j
#SBATCH --error=01_three_kernels.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --mem=10G
#SBATCH --time=00:30:00

ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

srun omniperf profile -n 01_three_kernels -- ./a.out
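
A typical submit-and-inspect cycle, as a sketch (the job ID is made up; `%j` in the `--output` pattern expands to the real one):

```bash
sbatch profile.sbatch             # prints e.g. "Submitted batch job 1234567"
squeue --me                       # watch the job until it completes
less 01_three_kernels.out1234567  # stdout of the profiling run
```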
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the environment with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/02_row_col/mi200/ --gui`
7. Open Firefox
8. Go to address `localhost:8050`
9. Analyze the results
@@ -0,0 +1,8 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml rocm/5.4.6
ml PrgEnv-cray/8.4.0

CC -xhip -pg -O2 main.cpp
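
Here `CC` is the Cray compiler wrapper, `-xhip` makes it treat the source as HIP, and `-pg` adds gprof-style instrumentation. On a system without the Cray wrappers, a roughly equivalent build might look like the sketch below (the plain ROCm toolchain and the MI250X arch flag are assumptions, not part of this repo):

```bash
# Assumption: plain ROCm toolchain instead of the Cray CC wrapper.
hipcc --offload-arch=gfx90a -O2 main.cpp -o a.out
```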
@@ -0,0 +1,64 @@
#include <cstddef>
#include <hip/hip_runtime.h>

// Fill a and b with known values. Each block covers blockDim.x = 64
// columns of blockDim.y = 16 rows at a time.
__global__ void init(size_t num_rows, float *a, float *b, float *c) {
    const size_t col = threadIdx.x;
    size_t row = threadIdx.y + blockIdx.x * blockDim.y;
    const size_t row_stride = gridDim.x * blockDim.y;

    for (; row < num_rows; row += row_stride) {
        const size_t i = col + row * blockDim.x;
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
}

// Row-major indexing: consecutive threads (consecutive threadIdx.x)
// touch consecutive addresses, so the accesses coalesce.
__global__ void row_major(size_t num_rows, float *a, float *b, float *c) {
    const size_t col = threadIdx.x;
    size_t row = threadIdx.y + blockIdx.x * blockDim.y;
    const size_t row_stride = gridDim.x * blockDim.y;

    for (; row < num_rows; row += row_stride) {
        const size_t i = col + row * blockDim.x;
        c[i] = a[i] + b[i];
    }
}

// Column-major indexing: consecutive threads are num_rows elements
// apart, so each wavefront scatters across many cache lines.
__global__ void col_major(size_t num_rows, float *a, float *b, float *c) {
    const size_t col = threadIdx.x;
    size_t row = threadIdx.y + blockIdx.x * blockDim.y;
    const size_t row_stride = gridDim.x * blockDim.y;

    for (; row < num_rows; row += row_stride) {
        const size_t i = row + col * num_rows;
        c[i] = a[i] + b[i];
    }
}

// Allocate device memory; the error code is deliberately ignored to keep
// the demo short.
void *gpu_allocate(size_t bytes) {
    void *p = nullptr;
    [[maybe_unused]] const auto result = hipMalloc(&p, bytes);
    return p;
}

int main() {
    constexpr size_t num_rows = 1 << 24;
    constexpr size_t num_cols = 64;
    constexpr size_t n = num_rows * num_cols;
    constexpr size_t num_bytes = sizeof(float) * n;

    float *const a = static_cast<float *>(gpu_allocate(num_bytes));
    float *const b = static_cast<float *>(gpu_allocate(num_bytes));
    float *const c = static_cast<float *>(gpu_allocate(num_bytes));

    constexpr dim3 blocks(1024);
    constexpr dim3 threads(64, 16); // 64 columns wide, 16 rows deep

    init<<<blocks, threads, 0, 0>>>(num_rows, a, b, c);
    row_major<<<blocks, threads, 0, 0>>>(num_rows, a, b, c);
    col_major<<<blocks, threads, 0, 0>>>(num_rows, a, b, c);

    [[maybe_unused]] auto t = hipDeviceSynchronize();

    hipFree(a);
    hipFree(b);
    hipFree(c);
}
@@ -1,9 +1,9 @@
#!/bin/bash -l

#SBATCH --account=project_465001194
-#SBATCH --job-name=cpu_gpu_sync
-#SBATCH --output=cpu_gpu_sync.out%j
-#SBATCH --error=cpu_gpu_sync.err%j
+#SBATCH --job-name=02_row_col
+#SBATCH --output=02_row_col.out%j
+#SBATCH --error=02_row_col.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
@@ -20,4 +20,4 @@ ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

-srun omniperf profile -n cpu_gpu_sync -- ./a.out
+srun omniperf profile -n 02_row_col -- ./a.out

This file was deleted.

9 changes: 9 additions & 0 deletions application-performance/demos/omniperf/sourceme.sh
@@ -0,0 +1,9 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH
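
Note that this file is meant to be sourced (`. ../sourceme.sh` in the READMEs), not executed, so the module and `PATH` changes persist in the calling shell. A quick check:

```bash
. ./sourceme.sh   # or: source sourceme.sh
which omniperf    # should now point at the project installation
```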
