-
Notifications
You must be signed in to change notification settings - Fork 182
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'gpu_performance_examples'
- Loading branch information
Showing
57 changed files
with
22,601 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#pragma once | ||
|
||
#include <algorithm> | ||
#include <array> | ||
#include <chrono> | ||
#include <cmath> | ||
#include <cstddef> | ||
#include <cstdio> | ||
#include <ratio> | ||
#include <vector> | ||
|
||
template <typename T> constexpr void saxpy(size_t i, T a, T *x, T *y, T *r) { | ||
r[i] = a * x[i] + y[i]; | ||
} | ||
|
||
template <typename T> constexpr void init_x(size_t i, T *x) { | ||
x[i] = (T)2.3 * sin(i); | ||
} | ||
|
||
template <typename T> constexpr void init_y(size_t i, T *y) { | ||
y[i] = (T)1.1 * cos(i); | ||
} | ||
|
||
template <typename T> void init(size_t n, T *x, T *y) { | ||
for (size_t i = 0; i < n; i++) { | ||
init_x(i, x); | ||
init_y(i, y); | ||
} | ||
} | ||
|
||
template <typename Allocate, typename Deallocate, typename Init, typename Func> | ||
void run(Allocate allocate, Deallocate deallocate, Init init, Func func) { | ||
constexpr std::array ns{1 << 6, 1 << 9, 1 << 12, 1 << 15, 1 << 18, | ||
1 << 21, 1 << 24, 1 << 27, 1 << 30}; | ||
constexpr size_t max_n = *std::max_element(ns.begin(), ns.end()); | ||
constexpr size_t num_bytes = sizeof(float) * max_n; | ||
|
||
float *const x = static_cast<float *>(allocate(num_bytes)); | ||
float *const y = static_cast<float *>(allocate(num_bytes)); | ||
float *const r = static_cast<float *>(allocate(num_bytes)); | ||
init(max_n, x, y); | ||
|
||
for (size_t n : ns) { | ||
constexpr auto n_iter = 20; | ||
size_t avg = 0; | ||
for (auto iteration = 0; iteration < n_iter; iteration++) { | ||
constexpr float a = 3.4f; | ||
const auto start = std::chrono::high_resolution_clock::now(); | ||
func(n, a, x, y, r); | ||
const auto end = std::chrono::high_resolution_clock::now(); | ||
const std::chrono::duration<double, std::nano> dur = end - start; | ||
avg += iteration == 0 ? 0 : dur.count(); | ||
} | ||
|
||
std::fprintf(stderr, "%f\n", r[n - 1]); | ||
std::printf("%ld, %ld\n", n, avg / (n_iter - 1)); | ||
} | ||
|
||
deallocate(x); | ||
deallocate(y); | ||
deallocate(r); | ||
} |
47 changes: 47 additions & 0 deletions
47
application-performance/demos/idle_resources/hip_saxpy.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#include "common.h" | ||
#include <cstddef> | ||
#include <hip/hip_runtime.h> | ||
|
||
__global__ void saxpy_(size_t n, float a, float *x, float *y, float *r) { | ||
size_t tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
const size_t stride = gridDim.x * blockDim.x; | ||
|
||
for (; tid < n; tid += stride) { | ||
saxpy(tid, a, x, y, r); | ||
} | ||
} | ||
|
||
__global__ void init_data(size_t n, float *x, float *y) { | ||
size_t tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
const size_t stride = gridDim.x * blockDim.x; | ||
|
||
for (; tid < n; tid += stride) { | ||
init_x(tid, x); | ||
init_y(tid, y); | ||
} | ||
} | ||
|
||
void *gpu_allocate(size_t bytes) { | ||
void *p = nullptr; | ||
[[maybe_unused]] const auto result = hipMalloc(&p, bytes); | ||
return p; | ||
} | ||
|
||
void gpu_free(void *p) { [[maybe_unused]] const auto result = hipFree(p); } | ||
|
||
void gpu_init(size_t n, float *x, float *y) { | ||
constexpr dim3 blocks(32); | ||
constexpr dim3 threads(256); | ||
init_data<<<blocks, threads, 0, 0>>>(n, x, y); | ||
} | ||
|
||
int main() { | ||
run(gpu_allocate, gpu_free, gpu_init, | ||
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto { | ||
constexpr dim3 blocks(32); | ||
constexpr dim3 threads(256); | ||
|
||
saxpy_<<<blocks, threads, 0, 0>>>(n, a, x, y, r); | ||
[[maybe_unused]] const auto result = hipDeviceSynchronize(); | ||
}); | ||
} |
12 changes: 12 additions & 0 deletions
12
application-performance/demos/idle_resources/omp_saxpy.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#include "common.h" | ||
#include <cstddef> | ||
|
||
int main() { | ||
run(malloc, free, init<float>, | ||
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto { | ||
#pragma omp parallel for | ||
for (size_t i = 0; i < n; i++) { | ||
saxpy(i, a, x, y, r); | ||
} | ||
}); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#!/bin/bash | ||
|
||
submit_job() { | ||
sub="$(sbatch "$@")" | ||
|
||
if [[ "$sub" =~ Submitted\ batch\ job\ ([0-9]+) ]]; then | ||
echo "${BASH_REMATCH[1]}" | ||
else | ||
exit 1 | ||
fi | ||
} | ||
|
||
echo "Submitting cpu job" | ||
cpujobid=$(submit_job << "EOF" | ||
#!/bin/bash | ||
#SBATCH --account=project_465001194 | ||
#SBATCH --nodes=1 | ||
#SBATCH --ntasks=1 | ||
#SBATCH --cpus-per-task=64 | ||
#SBATCH --mem=13G | ||
#SBATCH --time=00:30:00 | ||
#SBATCH --partition=debug | ||
#SBATCH --exclusive | ||
ml PrgEnv-cray | ||
(srun CC -std=c++17 -O3 -fopenmp -Wall -Wextra -Wpedantic -pedantic-errors -o omp omp_saxpy.cpp) || { echo "Failed to build openMP code"; exit 1; } | ||
(srun CC -std=c++17 -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o serial serial_saxpy.cpp) || { echo "Failed to build serial code"; exit 1; } | ||
srun ./serial > "serial.dat" | ||
export OMP_PROC_BIND=close | ||
export OMP_PLACES=cores | ||
for nthreads in 2 64 | ||
do | ||
OMP_NUM_THREADS=$nthreads srun ./omp > "omp$nthreads.dat" | ||
done | ||
EOF | ||
) | ||
|
||
echo "Submitting gpu job" | ||
gpujobid=$(submit_job << EOF | ||
#!/bin/bash | ||
#SBATCH --account=project_465001194 | ||
#SBATCH --nodes=1 | ||
#SBATCH --ntasks=1 | ||
#SBATCH --cpus-per-task=1 | ||
#SBATCH --gpus-per-task=1 | ||
#SBATCH --mem=1G | ||
#SBATCH --time=00:01:00 | ||
#SBATCH --partition=dev-g | ||
ml PrgEnv-cray | ||
ml craype-accel-amd-gfx90a | ||
ml rocm | ||
(srun CC -std=c++17 -xhip -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o hip hip_saxpy.cpp) || { echo "Failed to build hip code"; exit 1; } | ||
srun ./hip > "hip.dat" | ||
EOF | ||
) | ||
|
||
echo "Submitting gnuplot job with dependency on jobs $cpujobid and $gpujobid" | ||
sbatch --dependency afterok:$cpujobid:$gpujobid << EOF | ||
#!/bin/bash | ||
#SBATCH --account=project_465001194 | ||
#SBATCH --nodes=1 | ||
#SBATCH --ntasks=1 | ||
#SBATCH --cpus-per-task=1 | ||
#SBATCH --time=00:01:00 | ||
#SBATCH --partition=debug | ||
echo "Loading modules" | ||
ml LUMI/23.09 | ||
ml partition/C | ||
ml gnuplot/5.4.8-cpeGNU-23.09 | ||
echo "Plotting problem size vs runtimes " | ||
gnuplot -e "\ | ||
set terminal png size 1000,1000; \ | ||
set output \"runtimes.png\"; \ | ||
set style data linespoints; \ | ||
set key left top; \ | ||
set logscale x; \ | ||
set logscale y; \ | ||
set title \"Runtime of Ax + y with different implementation strategies\"; \ | ||
set xlabel \"problem size\"; \ | ||
set ylabel \"time [ns]\"; \ | ||
set grid; \ | ||
set xrange [10:10000000000]; \ | ||
plot \"serial.dat\" title \"serial\" lw 2.5, \ | ||
\"omp2.dat\" title \"OpenMP 2 threads\" lw 2.5, \ | ||
\"omp64.dat\" title \"OpenMP 64 threads\" lw 2.5, \ | ||
\"hip.dat\" title \"gpu\" lw 2.5; \ | ||
" | ||
EOF |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
11 changes: 11 additions & 0 deletions
11
application-performance/demos/idle_resources/serial_saxpy.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#include "common.h" | ||
#include <cstddef> | ||
|
||
int main() { | ||
run(malloc, free, init<float>, | ||
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto { | ||
for (size_t i = 0; i < n; i++) { | ||
saxpy(i, a, x, y, r); | ||
} | ||
}); | ||
} |
19 changes: 19 additions & 0 deletions
19
application-performance/demos/omniperf/01_three_kernels/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Build | ||
|
||
Build on the login node with `./build.sh` | ||
|
||
# Run | ||
|
||
Run with `sbatch profile.sbatch` | ||
|
||
# Analyze | ||
|
||
1. Go to www.lumi.csc.fi | ||
2. Start a desktop session | ||
3. Launch a terminal on the desktop session | ||
4. cd to this directory | ||
5. Do `. ../sourceme.sh` | ||
6. run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` | ||
7. Open Firefox | ||
8. Go to address `localhost:8050` | ||
9. Analyze |
8 changes: 8 additions & 0 deletions
8
application-performance/demos/omniperf/01_three_kernels/build.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
|
||
ml LUMI/23.09 | ||
ml partition/G | ||
ml rocm/5.4.6 | ||
ml PrgEnv-cray/8.4.0 | ||
|
||
CC -xhip -pg -O2 main.cpp |
77 changes: 77 additions & 0 deletions
77
application-performance/demos/omniperf/01_three_kernels/main.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#include <cstddef> | ||
#include <hip/hip_runtime.h> | ||
#include <math.h> | ||
|
||
__global__ void kernel1(size_t n, float *x, float *y) { | ||
size_t tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
const size_t stride = gridDim.x * blockDim.x; | ||
|
||
for (; tid < n; tid += stride) { | ||
x[tid] = 0.666f * sin(tid); | ||
y[tid] = 1.337f * cos(tid); | ||
} | ||
} | ||
|
||
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) { | ||
size_t tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
const size_t stride = gridDim.x * blockDim.x; | ||
|
||
for (; tid < n; tid += stride) { | ||
r[tid] = a * x[tid] + y[tid]; | ||
} | ||
} | ||
|
||
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) { | ||
size_t tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
const size_t stride = gridDim.x * blockDim.x; | ||
|
||
for (; tid < n; tid += stride) { | ||
const float x1 = x[tid]; | ||
const float x2 = x1 * x1; | ||
const float x3 = x1 * x2; | ||
const float x4 = x2 * x2; | ||
|
||
const float y1 = y[tid]; | ||
const float y2 = y1 * y1; | ||
const float y3 = y1 * y2; | ||
const float y4 = y2 * y2; | ||
// clang-format off | ||
r[tid] = | ||
1.0f * a * x1 | ||
- 2.0f * a * x2 | ||
+ 3.0f * a * x3 | ||
- 4.0f * a * x4 | ||
+ 4.0f * a * y1 | ||
- 3.0f * a * y2 | ||
+ 2.0f * a * y3 | ||
- 1.0f * a * y4; | ||
// clang-format on | ||
} | ||
} | ||
|
||
void *gpu_allocate(size_t bytes) { | ||
void *p = nullptr; | ||
[[maybe_unused]] const auto result = hipMalloc(&p, bytes); | ||
return p; | ||
} | ||
|
||
int main() { | ||
constexpr size_t n = 1 << 30; | ||
constexpr size_t num_bytes = sizeof(float) * n; | ||
constexpr float a = 3.4f; | ||
|
||
float *const x = static_cast<float *>(gpu_allocate(num_bytes)); | ||
float *const y = static_cast<float *>(gpu_allocate(num_bytes)); | ||
float *const r = static_cast<float *>(gpu_allocate(num_bytes)); | ||
|
||
constexpr dim3 blocks(1024); | ||
constexpr dim3 threads(1024); | ||
kernel1<<<blocks, threads, 0, 0>>>(n, x, y); | ||
kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r); | ||
kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r); | ||
[[maybe_unused]] auto t = hipDeviceSynchronize(); | ||
|
||
hipFree(x); | ||
hipFree(y); | ||
hipFree(r); | ||
} |
23 changes: 23 additions & 0 deletions
23
application-performance/demos/omniperf/01_three_kernels/profile.sbatch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash -l | ||
|
||
#SBATCH --account=project_465001194 | ||
#SBATCH --job-name=01_three_kernels | ||
#SBATCH --output=01_three_kernels.out%j | ||
#SBATCH --error=01_three_kernelsname.err%j | ||
#SBATCH --partition=small-g | ||
#SBATCH --reservation=CSC_summer_school_gpu | ||
#SBATCH --nodes=1 | ||
#SBATCH --ntasks-per-node=1 | ||
#SBATCH --gpus-per-node=1 | ||
#SBATCH --mem=10G | ||
#SBATCH --time=00:30:00 | ||
|
||
ml LUMI/23.09 | ||
ml partition/G | ||
ml PrgEnv-cray | ||
ml craype-accel-amd-gfx90a | ||
ml rocm/5.4.6 | ||
|
||
export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH | ||
|
||
srun omniperf profile -n 01_three_kernels -- ./a.out |
19 changes: 19 additions & 0 deletions
19
application-performance/demos/omniperf/02_strided_data_access/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Build | ||
|
||
Build on the login node with `./build.sh` | ||
|
||
# Run | ||
|
||
Run with `sbatch profile.sbatch` | ||
|
||
# Analyze | ||
|
||
1. Go to www.lumi.csc.fi | ||
2. Start a desktop session | ||
3. Launch a terminal on the desktop session | ||
4. cd to this directory | ||
5. Do `. ../sourceme.sh` | ||
6. run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` | ||
7. Open Firefox | ||
8. Go to address `localhost:8050` | ||
9. Analyze |
8 changes: 8 additions & 0 deletions
8
application-performance/demos/omniperf/02_strided_data_access/build.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
|
||
ml LUMI/23.09 | ||
ml partition/G | ||
ml rocm/5.4.6 | ||
ml PrgEnv-cray/8.4.0 | ||
|
||
CC -xhip -pg -O2 main.cpp |
Oops, something went wrong.