
Commit

Merge branch 'gpu_performance_examples'

cscjlan committed Jul 1, 2024
2 parents c13615b + 90b8e49 commit 13920f4
Showing 57 changed files with 22,601 additions and 42 deletions.
62 changes: 62 additions & 0 deletions application-performance/demos/idle_resources/common.h
@@ -0,0 +1,62 @@
#pragma once

#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <ratio>
#include <vector>

// Compute one element of r = a * x + y.
template <typename T> constexpr void saxpy(size_t i, T a, T *x, T *y, T *r) {
  r[i] = a * x[i] + y[i];
}

// Fill x and y with deterministic, non-trivial data.
template <typename T> constexpr void init_x(size_t i, T *x) {
  x[i] = (T)2.3 * sin(i);
}

template <typename T> constexpr void init_y(size_t i, T *y) {
  y[i] = (T)1.1 * cos(i);
}

template <typename T> void init(size_t n, T *x, T *y) {
  for (size_t i = 0; i < n; i++) {
    init_x(i, x);
    init_y(i, y);
  }
}

// Generic benchmark harness: allocate three arrays, initialize them, then time
// `func` (the saxpy implementation under test) over a range of problem sizes.
template <typename Allocate, typename Deallocate, typename Init, typename Func>
void run(Allocate allocate, Deallocate deallocate, Init init, Func func) {
  constexpr std::array ns{1 << 6,  1 << 9,  1 << 12, 1 << 15, 1 << 18,
                          1 << 21, 1 << 24, 1 << 27, 1 << 30};
  constexpr size_t max_n = *std::max_element(ns.begin(), ns.end());
  constexpr size_t num_bytes = sizeof(float) * max_n;

  float *const x = static_cast<float *>(allocate(num_bytes));
  float *const y = static_cast<float *>(allocate(num_bytes));
  float *const r = static_cast<float *>(allocate(num_bytes));
  init(max_n, x, y);

  for (size_t n : ns) {
    constexpr auto n_iter = 20;
    size_t sum_ns = 0;
    for (auto iteration = 0; iteration < n_iter; iteration++) {
      constexpr float a = 3.4f;
      const auto start = std::chrono::high_resolution_clock::now();
      func(n, a, x, y, r);
      const auto end = std::chrono::high_resolution_clock::now();
      const std::chrono::duration<double, std::nano> dur = end - start;
      // The first iteration is discarded as a warm-up round.
      sum_ns += iteration == 0 ? 0 : dur.count();
    }

    // Read one element so the compiler cannot optimize the computation away;
    // r must be host accessible for this to be valid.
    std::fprintf(stderr, "%f\n", r[n - 1]);
    std::printf("%zu, %zu\n", n, sum_ns / (n_iter - 1));
  }

  deallocate(x);
  deallocate(y);
  deallocate(r);
}
47 changes: 47 additions & 0 deletions application-performance/demos/idle_resources/hip_saxpy.cpp
@@ -0,0 +1,47 @@
#include "common.h"
#include <cstddef>
#include <hip/hip_runtime.h>

__global__ void saxpy_(size_t n, float a, float *x, float *y, float *r) {
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t stride = gridDim.x * blockDim.x;

for (; tid < n; tid += stride) {
saxpy(tid, a, x, y, r);
}
}

__global__ void init_data(size_t n, float *x, float *y) {
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const size_t stride = gridDim.x * blockDim.x;

for (; tid < n; tid += stride) {
init_x(tid, x);
init_y(tid, y);
}
}

void *gpu_allocate(size_t bytes) {
void *p = nullptr;
[[maybe_unused]] const auto result = hipMalloc(&p, bytes);
return p;
}

void gpu_free(void *p) { [[maybe_unused]] const auto result = hipFree(p); }

void gpu_init(size_t n, float *x, float *y) {
constexpr dim3 blocks(32);
constexpr dim3 threads(256);
init_data<<<blocks, threads, 0, 0>>>(n, x, y);
}

int main() {
run(gpu_allocate, gpu_free, gpu_init,
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
constexpr dim3 blocks(32);
constexpr dim3 threads(256);

saxpy_<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
[[maybe_unused]] const auto result = hipDeviceSynchronize();
});
}
12 changes: 12 additions & 0 deletions application-performance/demos/idle_resources/omp_saxpy.cpp
@@ -0,0 +1,12 @@
#include "common.h"
#include <cstddef>

int main() {
run(malloc, free, init<float>,
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
#pragma omp parallel for
for (size_t i = 0; i < n; i++) {
saxpy(i, a, x, y, r);
}
});
}
99 changes: 99 additions & 0 deletions application-performance/demos/idle_resources/run.sh
@@ -0,0 +1,99 @@
#!/bin/bash

# Submit a batch job and echo its id, or exit if submission failed.
submit_job() {
    sub="$(sbatch "$@")"

    if [[ "$sub" =~ Submitted\ batch\ job\ ([0-9]+) ]]; then
        echo "${BASH_REMATCH[1]}"
    else
        exit 1
    fi
}

echo "Submitting cpu job"
cpujobid=$(submit_job << "EOF"
#!/bin/bash
#SBATCH --account=project_465001194
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --mem=13G
#SBATCH --time=00:30:00
#SBATCH --partition=debug
#SBATCH --exclusive
ml PrgEnv-cray
(srun CC -std=c++17 -O3 -fopenmp -Wall -Wextra -Wpedantic -pedantic-errors -o omp omp_saxpy.cpp) || { echo "Failed to build OpenMP code"; exit 1; }
(srun CC -std=c++17 -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o serial serial_saxpy.cpp) || { echo "Failed to build serial code"; exit 1; }
srun ./serial > "serial.dat"
export OMP_PROC_BIND=close
export OMP_PLACES=cores
for nthreads in 2 64
do
    OMP_NUM_THREADS=$nthreads srun ./omp > "omp$nthreads.dat"
done
EOF
)

echo "Submitting gpu job"
gpujobid=$(submit_job << EOF
#!/bin/bash
#SBATCH --account=project_465001194
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-task=1
#SBATCH --mem=1G
#SBATCH --time=00:01:00
#SBATCH --partition=dev-g
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm
(srun CC -std=c++17 -xhip -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o hip hip_saxpy.cpp) || { echo "Failed to build hip code"; exit 1; }
srun ./hip > "hip.dat"
EOF
)

echo "Submitting gnuplot job with dependency on jobs $cpujobid and $gpujobid"
sbatch --dependency afterok:$cpujobid:$gpujobid << EOF
#!/bin/bash
#SBATCH --account=project_465001194
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --time=00:01:00
#SBATCH --partition=debug
echo "Loading modules"
ml LUMI/23.09
ml partition/C
ml gnuplot/5.4.8-cpeGNU-23.09
echo "Plotting problem size vs runtime"
gnuplot -e "\
set terminal png size 1000,1000; \
set output \"runtimes.png\"; \
set style data linespoints; \
set key left top; \
set logscale x; \
set logscale y; \
set title \"Runtime of a*x + y with different implementation strategies\"; \
set xlabel \"problem size\"; \
set ylabel \"time [ns]\"; \
set grid; \
set xrange [10:10000000000]; \
plot \"serial.dat\" title \"serial\" lw 2.5, \
     \"omp2.dat\" title \"OpenMP 2 threads\" lw 2.5, \
     \"omp64.dat\" title \"OpenMP 64 threads\" lw 2.5, \
     \"hip.dat\" title \"gpu\" lw 2.5; \
"
EOF
(Two further files in this commit cannot be displayed here.)
11 changes: 11 additions & 0 deletions application-performance/demos/idle_resources/serial_saxpy.cpp
@@ -0,0 +1,11 @@
#include "common.h"
#include <cstddef>

int main() {
run(malloc, free, init<float>,
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
for (size_t i = 0; i < n; i++) {
saxpy(i, a, x, y, r);
}
});
}
19 changes: 19 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/README.md
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the setup script with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` (see the sketch below)
7. Open Firefox
8. Go to the address `localhost:8050`
9. Analyze
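
A terminal session for steps 4–6 might look roughly like this (the clone location is an illustrative assumption; only the relative paths come from this repository):

```bash
# Illustrative location: cd to wherever this demo lives in your clone.
cd ~/summerschool/application-performance/demos/omniperf/01_three_kernels

# Set up the environment that ../sourceme.sh configures for omniperf.
. ../sourceme.sh

# Serve the analysis GUI from the profiling results; leave it running and
# browse to localhost:8050, e.g. with Firefox inside the desktop session.
omniperf analyze -p workloads/01_three_kernels/mi200/ --gui
```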
@@ -0,0 +1,8 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml rocm/5.4.6
ml PrgEnv-cray/8.4.0

CC -xhip -pg -O2 main.cpp
77 changes: 77 additions & 0 deletions application-performance/demos/omniperf/01_three_kernels/main.cpp
@@ -0,0 +1,77 @@
#include <cstddef>
#include <hip/hip_runtime.h>
#include <math.h>

// kernel1: fill x and y with deterministic sin/cos data.
__global__ void kernel1(size_t n, float *x, float *y) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t stride = gridDim.x * blockDim.x;

  for (; tid < n; tid += stride) {
    x[tid] = 0.666f * sin(tid);
    y[tid] = 1.337f * cos(tid);
  }
}

// kernel2: saxpy, r = a * x + y.
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t stride = gridDim.x * blockDim.x;

  for (; tid < n; tid += stride) {
    r[tid] = a * x[tid] + y[tid];
  }
}

// kernel3: a degree-4 polynomial in x and y, i.e. more arithmetic per byte
// than kernel2.
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) {
  size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  const size_t stride = gridDim.x * blockDim.x;

  for (; tid < n; tid += stride) {
    const float x1 = x[tid];
    const float x2 = x1 * x1;
    const float x3 = x1 * x2;
    const float x4 = x2 * x2;

    const float y1 = y[tid];
    const float y2 = y1 * y1;
    const float y3 = y1 * y2;
    const float y4 = y2 * y2;
    // clang-format off
    r[tid] =
        1.0f * a * x1
      - 2.0f * a * x2
      + 3.0f * a * x3
      - 4.0f * a * x4
      + 4.0f * a * y1
      - 3.0f * a * y2
      + 2.0f * a * y3
      - 1.0f * a * y4;
    // clang-format on
  }
}

void *gpu_allocate(size_t bytes) {
  void *p = nullptr;
  [[maybe_unused]] const auto result = hipMalloc(&p, bytes);
  return p;
}

int main() {
  constexpr size_t n = 1 << 30;
  constexpr size_t num_bytes = sizeof(float) * n;
  constexpr float a = 3.4f;

  float *const x = static_cast<float *>(gpu_allocate(num_bytes));
  float *const y = static_cast<float *>(gpu_allocate(num_bytes));
  float *const r = static_cast<float *>(gpu_allocate(num_bytes));

  constexpr dim3 blocks(1024);
  constexpr dim3 threads(1024);
  kernel1<<<blocks, threads, 0, 0>>>(n, x, y);
  kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
  kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
  // Wait for all three kernels to finish before freeing memory.
  [[maybe_unused]] auto t = hipDeviceSynchronize();

  hipFree(x);
  hipFree(y);
  hipFree(r);
}
@@ -0,0 +1,23 @@
#!/bin/bash -l

#SBATCH --account=project_465001194
#SBATCH --job-name=01_three_kernels
#SBATCH --output=01_three_kernels.out%j
#SBATCH --error=01_three_kernels.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --mem=10G
#SBATCH --time=00:30:00

ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

srun omniperf profile -n 01_three_kernels -- ./a.out
@@ -0,0 +1,19 @@
# Build

Build on the login node with `./build.sh`

# Run

Run with `sbatch profile.sbatch`

# Analyze

1. Go to www.lumi.csc.fi
2. Start a desktop session
3. Launch a terminal in the desktop session
4. `cd` to this directory
5. Source the setup script with `. ../sourceme.sh`
6. Run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` (see the sketch below)
7. Open Firefox
8. Go to the address `localhost:8050`
9. Analyze
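
A terminal session for steps 4–6 might look roughly like this (the clone location is an illustrative assumption; only the relative paths come from this repository):

```bash
# Illustrative location: cd to wherever this demo lives in your clone.
cd ~/summerschool/application-performance/demos/omniperf/01_three_kernels

# Set up the environment that ../sourceme.sh configures for omniperf.
. ../sourceme.sh

# Serve the analysis GUI from the profiling results; leave it running and
# browse to localhost:8050, e.g. with Firefox inside the desktop session.
omniperf analyze -p workloads/01_three_kernels/mi200/ --gui
```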
@@ -0,0 +1,8 @@
#!/bin/bash

ml LUMI/23.09
ml partition/G
ml rocm/5.4.6
ml PrgEnv-cray/8.4.0

CC -xhip -pg -O2 main.cpp