-
Notifications
You must be signed in to change notification settings - Fork 182
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
223 additions
and
54 deletions.
There are no files selected for viewing
19 changes: 19 additions & 0 deletions
19
application-performance/demos/omniperf/01_three_kernels/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Build | ||
|
||
Build on the login node with `./build.sh` | ||
|
||
# Run | ||
|
||
Run with `sbatch profile.sbatch` | ||
|
||
# Analyze | ||
|
||
1. Go to www.lumi.csc.fi | ||
2. Start a desktop session | ||
3. Launch a terminal on the desktop session | ||
4. cd to this directory | ||
5. Do `. ../sourceme.sh` | ||
6. run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui` | ||
7. Open Firefox | ||
8. Go to address `localhost:8050` | ||
9. Analyze |
File renamed without changes.
77 changes: 77 additions & 0 deletions
77
application-performance/demos/omniperf/01_three_kernels/main.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#include <cstddef> | ||
#include <hip/hip_runtime.h> | ||
#include <math.h> | ||
|
||
// Fills x and y with scaled sine / cosine values of the element index.
//
// Every thread starts at its global 1-D index and advances by the total
// number of launched threads until n is reached, so the kernel covers all n
// elements for any 1-D launch configuration.
//
//   n: number of elements in x and in y
//   x: output, x[i] = 0.666f * sin(i)
//   y: output, y[i] = 1.337f * cos(i)
__global__ void kernel1(size_t n, float *x, float *y) {
  // Widen to size_t BEFORE multiplying: the built-in index/dimension values
  // are 32-bit, so gridDim.x * blockDim.x would wrap for very large grids.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (; tid < n; tid += stride) {
    // NOTE(review): sin/cos are called with an integer argument, which
    // presumably selects the double-precision overload; the product is
    // narrowed to float on the store. Kept as-is to preserve the results.
    x[tid] = 0.666f * sin(tid);
    y[tid] = 1.337f * cos(tid);
  }
}
|
||
// Computes r[i] = a * x[i] + y[i] for every i in [0, n).
//
// Every thread starts at its global 1-D index and advances by the total
// number of launched threads until n is reached, so the kernel covers all n
// elements for any 1-D launch configuration.
//
//   n: number of elements in x, y and r
//   a: scalar multiplier applied to x
//   x: input
//   y: input
//   r: output
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) {
  // Widen to size_t BEFORE multiplying: the built-in index/dimension values
  // are 32-bit, so gridDim.x * blockDim.x would wrap for very large grids.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (; tid < n; tid += stride) {
    r[tid] = a * x[tid] + y[tid];
  }
}
|
||
// Evaluates a degree-4 polynomial in x[i] and y[i], scaled by a, into r[i]:
//
//   r[i] = a*x - 2a*x^2 + 3a*x^3 - 4a*x^4 + 4a*y - 3a*y^2 + 2a*y^3 - a*y^4
//
// Every thread starts at its global 1-D index and advances by the total
// number of launched threads until n is reached, so the kernel covers all n
// elements for any 1-D launch configuration.
//
//   n: number of elements in x, y and r
//   a: scalar factor applied to every term
//   x: input
//   y: input
//   r: output
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) {
  // Widen to size_t BEFORE multiplying: the built-in index/dimension values
  // are 32-bit, so gridDim.x * blockDim.x would wrap for very large grids.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (; tid < n; tid += stride) {
    // Powers of x, built from one load.
    const float x1 = x[tid];
    const float x2 = x1 * x1;
    const float x3 = x1 * x2;
    const float x4 = x2 * x2;

    // Powers of y, built from one load.
    const float y1 = y[tid];
    const float y2 = y1 * y1;
    const float y3 = y1 * y2;
    const float y4 = y2 * y2;

    // The term order is kept exactly as written so the floating-point
    // result does not change.
    // clang-format off
    r[tid] =
        1.0f * a * x1
      - 2.0f * a * x2
      + 3.0f * a * x3
      - 4.0f * a * x4
      + 4.0f * a * y1
      - 3.0f * a * y2
      + 2.0f * a * y3
      - 1.0f * a * y4;
    // clang-format on
  }
}
|
||
void *gpu_allocate(size_t bytes) { | ||
void *p = nullptr; | ||
[[maybe_unused]] const auto result = hipMalloc(&p, bytes); | ||
return p; | ||
} | ||
|
||
int main() { | ||
constexpr size_t n = 1 << 30; | ||
constexpr size_t num_bytes = sizeof(float) * n; | ||
constexpr float a = 3.4f; | ||
|
||
float *const x = static_cast<float *>(gpu_allocate(num_bytes)); | ||
float *const y = static_cast<float *>(gpu_allocate(num_bytes)); | ||
float *const r = static_cast<float *>(gpu_allocate(num_bytes)); | ||
|
||
constexpr dim3 blocks(1024); | ||
constexpr dim3 threads(1024); | ||
kernel1<<<blocks, threads, 0, 0>>>(n, x, y); | ||
kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r); | ||
kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r); | ||
[[maybe_unused]] auto t = hipDeviceSynchronize(); | ||
|
||
hipFree(x); | ||
hipFree(y); | ||
hipFree(r); | ||
} |
23 changes: 23 additions & 0 deletions
23
application-performance/demos/omniperf/01_three_kernels/profile.sbatch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash -l
# Slurm batch job: profiles ./a.out with omniperf under the workload name
# 01_three_kernels on a single GPU of the small-g partition.

#SBATCH --account=project_465001194
#SBATCH --job-name=01_three_kernels
#SBATCH --output=01_three_kernels.out%j
#SBATCH --error=01_three_kernels.err%j
#SBATCH --partition=small-g
#SBATCH --reservation=CSC_summer_school_gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=1
#SBATCH --mem=10G
#SBATCH --time=00:30:00

# Software environment used for the profiling run.
ml LUMI/23.09
ml partition/G
ml PrgEnv-cray
ml craype-accel-amd-gfx90a
ml rocm/5.4.6

# Make the project-local omniperf installation visible.
export PATH=/projappl/project_465001194/apps/omniperf/bin:$PATH

# -n sets the workload name used by omniperf for its output.
srun omniperf profile -n 01_three_kernels -- ./a.out
19 changes: 19 additions & 0 deletions
19
application-performance/demos/omniperf/02_strided_data_access/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Build | ||
|
||
Build on the login node with `./build.sh` | ||
|
||
# Run | ||
|
||
Run with `sbatch profile.sbatch` | ||
|
||
# Analyze | ||
|
||
1. Go to www.lumi.csc.fi | ||
2. Start a desktop session | ||
3. Launch a terminal on the desktop session | ||
4. cd to this directory | ||
5. Do `. ../sourceme.sh` | ||
6. run `omniperf analyze -p workloads/02_strided_data_access/mi200/ --gui` | ||
7. Open Firefox | ||
8. Go to address `localhost:8050` | ||
9. Analyze |
8 changes: 8 additions & 0 deletions
8
application-performance/demos/omniperf/02_strided_data_access/build.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash
# Builds main.cpp as HIP code with the Cray compiler wrapper; the resulting
# executable gets the compiler's default name.

# Modules are loaded one at a time, in this exact order.
for module_name in LUMI/23.09 partition/G rocm/5.4.6 PrgEnv-cray/8.4.0; do
    ml "${module_name}"
done

CC -xhip -pg -O2 main.cpp
64 changes: 64 additions & 0 deletions
64
application-performance/demos/omniperf/02_strided_data_access/main.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#include <cstddef> | ||
#include <hip/hip_runtime.h> | ||
|
||
// Sets every element of a to 1.0f and every element of b to 2.0f.
//
// Indexing: threadIdx.x selects the column, blockDim.x is used as the row
// length, and rows are distributed over threadIdx.y and blockIdx.x; each
// thread advances by gridDim.x * blockDim.y rows until num_rows is reached.
//
//   num_rows: number of rows to initialize
//   a, b:     outputs, each holding num_rows * blockDim.x elements
//   c:        not accessed by this kernel
__global__ void init(size_t num_rows, float *a, float *b, float *c) {
  const size_t col = threadIdx.x;
  // Widen to size_t BEFORE multiplying: the built-in index/dimension values
  // are 32-bit, so the products would wrap for very large grids.
  const size_t row_stride = static_cast<size_t>(gridDim.x) * blockDim.y;
  size_t row = static_cast<size_t>(blockIdx.x) * blockDim.y + threadIdx.y;

  for (; row < num_rows; row += row_stride) {
    const size_t i = col + row * blockDim.x;
    a[i] = 1.0f;
    b[i] = 2.0f;
  }
}
|
||
// Computes c[i] = a[i] + b[i] with i = col + row * blockDim.x, i.e. threads
// that differ only in threadIdx.x touch adjacent elements.
//
// Indexing: threadIdx.x selects the column, blockDim.x is used as the row
// length, and rows are distributed over threadIdx.y and blockIdx.x; each
// thread advances by gridDim.x * blockDim.y rows until num_rows is reached.
//
//   num_rows: number of rows to process
//   a, b:     inputs, each holding num_rows * blockDim.x elements
//   c:        output of the same size
__global__ void row_major(size_t num_rows, float *a, float *b, float *c) {
  const size_t col = threadIdx.x;
  // Widen to size_t BEFORE multiplying: the built-in index/dimension values
  // are 32-bit, so the products would wrap for very large grids.
  const size_t row_stride = static_cast<size_t>(gridDim.x) * blockDim.y;
  size_t row = static_cast<size_t>(blockIdx.x) * blockDim.y + threadIdx.y;

  for (; row < num_rows; row += row_stride) {
    const size_t i = col + row * blockDim.x;
    c[i] = a[i] + b[i];
  }
}
|
||
// Computes c[i] = a[i] + b[i] with i = row + col * num_rows, i.e. threads
// that differ only in threadIdx.x touch elements num_rows apart.
//
// Indexing: threadIdx.x selects the column, and rows are distributed over
// threadIdx.y and blockIdx.x; each thread advances by gridDim.x * blockDim.y
// rows until num_rows is reached.
//
//   num_rows: number of rows to process
//   a, b:     inputs, each holding num_rows * blockDim.x elements
//   c:        output of the same size
__global__ void col_major(size_t num_rows, float *a, float *b, float *c) {
  const size_t col = threadIdx.x;
  // Widen to size_t BEFORE multiplying: the built-in index/dimension values
  // are 32-bit, so the products would wrap for very large grids.
  const size_t row_stride = static_cast<size_t>(gridDim.x) * blockDim.y;
  size_t row = static_cast<size_t>(blockIdx.x) * blockDim.y + threadIdx.y;

  for (; row < num_rows; row += row_stride) {
    const size_t i = row + col * num_rows;
    c[i] = a[i] + b[i];
  }
}
|
||
void *gpu_allocate(size_t bytes) { | ||
void *p = nullptr; | ||
[[maybe_unused]] const auto result = hipMalloc(&p, bytes); | ||
return p; | ||
} | ||
|
||
int main() { | ||
constexpr size_t num_rows = 1 << 24; | ||
constexpr size_t num_cols = 64; | ||
constexpr size_t n = num_rows * num_cols; | ||
constexpr size_t num_bytes = sizeof(float) * n; | ||
|
||
float *const a = static_cast<float *>(gpu_allocate(num_bytes)); | ||
float *const b = static_cast<float *>(gpu_allocate(num_bytes)); | ||
float *const c = static_cast<float *>(gpu_allocate(num_bytes)); | ||
|
||
constexpr dim3 blocks(1024); | ||
constexpr dim3 threads(64, 16); | ||
row_major<<<blocks, threads, 0, 0>>>(num_rows, a, b, c); | ||
col_major<<<blocks, threads, 0, 0>>>(num_rows, a, b, c); | ||
|
||
[[maybe_unused]] auto t = hipDeviceSynchronize(); | ||
|
||
hipFree(a); | ||
hipFree(b); | ||
hipFree(c); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 0 additions & 50 deletions
50
application-performance/demos/omniperf/cpu_gpu_synchronization/main.cpp
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash
# Environment setup: loads the module stack and puts the project-local
# omniperf installation first on PATH.
# NOTE(review): presumably this is the sourceme.sh the READMEs tell users to
# source — confirm the file name.

# Modules are loaded one at a time, in this exact order.
for _omniperf_module in LUMI/23.09 partition/G PrgEnv-cray craype-accel-amd-gfx90a rocm/5.4.6; do
    ml "${_omniperf_module}"
done
# Do not leave the loop helper behind in the caller's shell.
unset _omniperf_module

export PATH="/projappl/project_465001194/apps/omniperf/bin:${PATH}"