
Commit

Merge branch 'gpu_performance_examples'

cscjlan committed Jul 1, 2024
2 parents c13615b + 90b8e49 commit 13920f4
Showing 57 changed files with 22,601 additions and 42 deletions.
62 changes: 62 additions & 0 deletions application-performance/demos/idle_resources/common.h
@@ -0,0 +1,62 @@
#pragma once

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <ratio>
#include <vector>

// Compute one element of r = a * x + y.
template <typename T> constexpr void saxpy(size_t i, T a, T *x, T *y, T *r) {
  r[i] = a * x[i] + y[i];
}

// Fill x and y with deterministic, non-trivial data.
template <typename T> constexpr void init_x(size_t i, T *x) {
  x[i] = (T)2.3 * sin(i);
}

template <typename T> constexpr void init_y(size_t i, T *y) {
  y[i] = (T)1.1 * cos(i);
}

template <typename T> void init(size_t n, T *x, T *y) {
  for (size_t i = 0; i < n; i++) {
    init_x(i, x);
    init_y(i, y);
  }
}

// Generic benchmark harness: allocate three arrays, initialize them, then time
// `func` (the saxpy implementation under test) over a range of problem sizes.
template <typename Allocate, typename Deallocate, typename Init, typename Func>
void run(Allocate allocate, Deallocate deallocate, Init init, Func func) {
  constexpr std::array ns{1 << 6,  1 << 9,  1 << 12, 1 << 15, 1 << 18,
                          1 << 21, 1 << 24, 1 << 27, 1 << 30};
  constexpr size_t max_n = *std::max_element(ns.begin(), ns.end());
  constexpr size_t num_bytes = sizeof(float) * max_n;

  float *const x = static_cast<float *>(allocate(num_bytes));
  float *const y = static_cast<float *>(allocate(num_bytes));
  float *const r = static_cast<float *>(allocate(num_bytes));
  init(max_n, x, y);

  for (size_t n : ns) {
    constexpr auto n_iter = 20;
    size_t sum_ns = 0;
    for (auto iteration = 0; iteration < n_iter; iteration++) {
      constexpr float a = 3.4f;
      const auto start = std::chrono::high_resolution_clock::now();
      func(n, a, x, y, r);
      const auto end = std::chrono::high_resolution_clock::now();
      const std::chrono::duration<double, std::nano> dur = end - start;
      // The first iteration is discarded as a warm-up round.
      sum_ns += iteration == 0 ? 0 : dur.count();
    }

    // Read one element so the compiler cannot optimize the computation away;
    // r must be host accessible for this to be valid.
    std::fprintf(stderr, "%f\n", r[n - 1]);
    std::printf("%zu, %zu\n", n, sum_ns / (n_iter - 1));
  }

  deallocate(x);
  deallocate(y);
  deallocate(r);
}
47 changes: 47 additions & 0 deletions application-performance/demos/idle_resources/hip_saxpy.cpp
@@ -0,0 +1,47 @@
#include "common.h"
#include <cstddef>
#include <hip/hip_runtime.h>

__global__ void saxpy_(size_t n, float a, float *x, float *y, float *r) {
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t stride = gridDim.x * blockDim.x;

for (; tid < n; tid += stride) {
saxpy(tid, a, x, y, r);
}
}

__global__ void init_data(size_t n, float *x, float *y) {
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t stride = gridDim.x * blockDim.x;

for (; tid < n; tid += stride) {
init_x(tid, x);
init_y(tid, y);
}
}

void *gpu_allocate(size_t bytes) {
void *p = nullptr;
[[maybe_unused]] const auto result = hipMalloc(&p, bytes);
return p;
}

void gpu_free(void *p) { [[maybe_unused]] const auto result = hipFree(p); }

void gpu_init(size_t n, float *x, float *y) {
constexpr dim3 blocks(32);
constexpr dim3 threads(256);
init_data<<<blocks, threads, 0, 0>>>(n, x, y);
}

int main() {
run(gpu_allocate, gpu_free, gpu_init,
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
constexpr dim3 blocks(32);
constexpr dim3 threads(256);

saxpy_<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
[[maybe_unused]] const auto result = hipDeviceSynchronize();
});
}
12 changes: 12 additions & 0 deletions application-performance/demos/idle_resources/omp_saxpy.cpp
@@ -0,0 +1,12 @@
#include "common.h"
#include <cstddef>

int main() {
run(malloc, free, init<float>,
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
#pragma omp parallel for
for (size_t i = 0; i < n; i++) {
saxpy(i, a, x, y, r);
}
});
}
99 changes: 99 additions & 0 deletions application-performance/demos/idle_resources/run.sh
@@ -0,0 +1,99 @@
#!/bin/bash

# Submit a batch job and echo its id, or exit if submission failed.
submit_job() {
    sub="$(sbatch "$@")"

    if [[ "$sub" =~ Submitted\ batch\ job\ ([0-9]+) ]]; then
        echo "${BASH_REMATCH[1]}"
    else
        exit 1
    fi
}

echo "Submitting cpu job"
cpujobid=$(submit_job << "EOF"
#!/bin/bash
#SBATCH --account=project_465001194
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --mem=13G
#SBATCH --time=00:30:00
#SBATCH --partition=debug
#SBATCH --exclusive
ml PrgEnv-cray
(srun CC -std=c++17 -O3 -fopenmp -Wall -Wextra -Wpedantic -pedantic-errors -o omp omp_saxpy.cpp) || { echo "Failed to build OpenMP code"; exit 1; }
(srun CC -std=c++17 -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o serial serial_saxpy.cpp) || { echo "Failed to build serial code"; exit 1; }
srun ./serial > "serial.dat"
export OMP_PROC_BIND=close
export OMP_PLACES=cores
for nthreads in 2 64
do
    OMP_NUM_THREADS=$nthreads srun ./omp > "omp$nthreads.dat"
done
EOF
)

echo "Submitting gpu job"
gpujobid=$(submit_job << EOF
#!/bin/bash
#SBATCH --account=project_465001194
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-task=1
#SBATCH --mem=1G
#SBATCH --time=00:01:00
#SBATCH --partition=dev-g
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm
(srun CC -std=c++17 -xhip -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o hip hip_saxpy.cpp) || { echo "Failed to build hip code"; exit 1; }
srun ./hip > "hip.dat"
EOF
)

echo "Submitting gnuplot job with dependency on jobs $cpujobid and $gpujobid"
sbatch --dependency afterok:$cpujobid:$gpujobid << EOF
#!/bin/bash
#SBATCH --account=project_465001194
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:01:00
#SBATCH --partition=debug
echo "Loading modules"
ml LUMI/23.09
ml partition/C
ml gnuplot/5.4.8-cpeGNU-23.09
echo "Plotting problem size vs runtime"
gnuplot -e "\
set terminal png size 1000,1000; \
set output \"runtimes.png\"; \
set style data linespoints; \
set key left top; \
set logscale x; \
set logscale y; \
set title \"Runtime of a*x + y with different implementation strategies\"; \
set xlabel \"problem size\"; \
set ylabel \"time [ns]\"; \
set grid; \
set xrange [10:10000000000]; \
plot \"serial.dat\" title \"serial\" lw 2.5, \
     \"omp2.dat\" title \"OpenMP 2 threads\" lw 2.5, \
     \"omp64.dat\" title \"OpenMP 64 threads\" lw 2.5, \
     \"hip.dat\" title \"gpu\" lw 2.5; \
"
EOF
(Two further files in this commit cannot be displayed here.)
11 changes: 11 additions & 0 deletions application-performance/demos/idle_resources/serial_saxpy.cpp
@@ -0,0 +1,11 @@
#include "common.h"
#include <cstddef>

int main() {
run(malloc, free, init<float>,
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
for (size_t i = 0; i < n; i++) {
saxpy(i, a, x, y, r);
}
});
}
19 changes: 19 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/README.md
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the setup script with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` (see the sketch below)
7. Open Firefox
8. Go to the address `localhost:8050`
9. Analyze
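
A terminal session for steps 4–6 might look roughly like this (the clone location is an illustrative assumption; only the relative paths come from this repository):

```bash
# Illustrative location: cd to wherever this demo lives in your clone.
cd ~/summerschool/application-performance/demos/omniperf/01_three_kernels

# Set up the environment that ../sourceme.sh configures for omniperf.
. ../sourceme.sh

# Serve the analysis GUI from the profiling results; leave it running and
# browse to localhost:8050, e.g. with Firefox inside the desktop session.
omniperf analyze -p workloads/01_three_kernels/mi200/ --gui
```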
@@ -0,0 +1,8 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml rocm/5.4.6
ml PrgEnv-cray/8.4.0

CC -xhip -pg -O2 main.cpp
77 changes: 77 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/main.cpp
@@ -0,0 +1,77 @@
#include <cstddef>
#include <hip/hip_runtime.h>
#include <math.h>

// kernel1: fill x and y with deterministic sin/cos data.
__global__ void kernel1(size_t n, float *x, float *y) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t stride = gridDim.x * blockDim.x;

  for (; tid < n; tid += stride) {
    x[tid] = 0.666f * sin(tid);
    y[tid] = 1.337f * cos(tid);
  }
}

// kernel2: saxpy, r = a * x + y.
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t stride = gridDim.x * blockDim.x;

  for (; tid < n; tid += stride) {
    r[tid] = a * x[tid] + y[tid];
  }
}

// kernel3: a degree-4 polynomial in x and y, i.e. more arithmetic per byte
// than kernel2.
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t stride = gridDim.x * blockDim.x;

  for (; tid < n; tid += stride) {
    const float x1 = x[tid];
    const float x2 = x1 * x1;
    const float x3 = x1 * x2;
    const float x4 = x2 * x2;

    const float y1 = y[tid];
    const float y2 = y1 * y1;
    const float y3 = y1 * y2;
    const float y4 = y2 * y2;
    // clang-format off
    r[tid] =
        1.0f * a * x1
      - 2.0f * a * x2
      + 3.0f * a * x3
      - 4.0f * a * x4
      + 4.0f * a * y1
      - 3.0f * a * y2
      + 2.0f * a * y3
      - 1.0f * a * y4;
    // clang-format on
  }
}

void *gpu_allocate(size_t bytes) {
  void *p = nullptr;
  [[maybe_unused]] const auto result = hipMalloc(&p, bytes);
  return p;
}

int main() {
  constexpr size_t n = 1 << 30;
  constexpr size_t num_bytes = sizeof(float) * n;
  constexpr float a = 3.4f;

  float *const x = static_cast<float *>(gpu_allocate(num_bytes));
  float *const y = static_cast<float *>(gpu_allocate(num_bytes));
  float *const r = static_cast<float *>(gpu_allocate(num_bytes));

  constexpr dim3 blocks(1024);
  constexpr dim3 threads(1024);
  kernel1<<<blocks, threads, 0, 0>>>(n, x, y);
  kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
  kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
  // Wait for all three kernels to finish before freeing memory.
  [[maybe_unused]] auto t = hipDeviceSynchronize();

  hipFree(x);
  hipFree(y);
  hipFree(r);
}
@@ -0,0 +1,23 @@
#!/bin/bash -l

#SBATCH --account=project_465001194
#SBATCH --job-name=01_three_kernels
#SBATCH --output=01_three_kernels.out%j
#SBATCH --error=01_three_kernels.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --mem=10G
#SBATCH --time=00:30:00

ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

srun omniperf profile -n 01_three_kernels -- ./a.out
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the setup script with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` (see the sketch below)
7. Open Firefox
8. Go to the address `localhost:8050`
9. Analyze
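
A terminal session for steps 4–6 might look roughly like this (the clone location is an illustrative assumption; only the relative paths come from this repository):

```bash
# Illustrative location: cd to wherever this demo lives in your clone.
cd ~/summerschool/application-performance/demos/omniperf/01_three_kernels

# Set up the environment that ../sourceme.sh configures for omniperf.
. ../sourceme.sh

# Serve the analysis GUI from the profiling results; leave it running and
# browse to localhost:8050, e.g. with Firefox inside the desktop session.
omniperf analyze -p workloads/01_three_kernels/mi200/ --gui
```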
@@ -0,0 +1,8 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml rocm/5.4.6
ml PrgEnv-cray/8.4.0

CC -xhip -pg -O2 main.cpp