Skip to content

Commit

Permalink
[v1.17] Improved compilation and perfomance for newer uarch
Browse files Browse the repository at this point in the history
  • Loading branch information
Dasor committed Jun 24, 2024
1 parent 9006e9b commit f5633bc
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 10 deletions.
16 changes: 13 additions & 3 deletions src/gpu/arch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ enum {
ARCH_VOLTA,
ARCH_TURING,
ARCH_AMPERE,
ARCH_ADA,
ARCH_UNKNOWN
};

Expand All @@ -27,6 +28,7 @@ static const char *uarch_str[] = {
/*[ARCH_VOLTA] = */ "Volta",
/*[ARCH_TURING] = */ "Turing",
/*[ARCH_AMPERE] = */ "Ampere",
/*[ARCH_ADA] = */ "Ada",
};

struct benchmark_gpu {
Expand Down Expand Up @@ -143,8 +145,12 @@ struct gpu* get_gpu_info(int gpu_idx) {
break;
case 80:
case 86:
case 87:
gpu->uarch = ARCH_AMPERE;
break;
case 89:
gpu->uarch = ARCH_ADA;
break;
default:
printf("GPU: %s\n", gpu->name);
printErr("Invalid uarch: %d.%d\n", deviceProp.major, deviceProp.minor);
Expand All @@ -162,6 +168,7 @@ struct gpu* get_gpu_info(int gpu_idx) {
break;
case ARCH_TURING:
case ARCH_AMPERE: // UNTESTED
case ARCH_ADA: // UNTESTED
gpu->latency = 4;
break;
default:
Expand All @@ -185,21 +192,23 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) {
bench->nbk = (nbk == INVALID_CFG) ? (gpu->latency * gpu->sm_count) : nbk;
bench->tpb = (tpb == INVALID_CFG) ? _ConvertSMVer2Cores(gpu->cc_major, gpu->cc_minor) : tpb;
}
bench->n = bench->nbk * bench->tpb;
bench->n = 16 * bench->nbk * bench->tpb;
bench->gflops = (double)(BENCHMARK_GPU_ITERS * 2 * (long)bench->n)/(long)1000000000;

cudaError_t err = cudaSuccess;
float *h_A;
float *h_B;
int size = bench->n * sizeof(float);

cudaSetDevice(0);

if ((err = cudaMallocHost((void **)&h_A, size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
}

if ((err = cudaMallocHost((void **)&h_B, size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
}

Expand All @@ -208,6 +217,7 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) {
h_B[i] = rand()/(float)RAND_MAX;
}


if ((err = cudaMalloc((void **) &(bench->d_A), size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "../getarg.hpp"

#define BENCHMARK_GPU_ITERS 400000000
#define BENCHMARK_GPU_ITERS 40000000

struct benchmark_gpu;

Expand Down
35 changes: 29 additions & 6 deletions src/gpu/kernel.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,38 @@
#include "kernel.hpp"
#include <stdio.h>
#include <stdint.h>
#define N 16
#define gid threadIdx.x + blockIdx.x * blockDim.x
#define off gid*N


__global__
void compute_kernel(float *vec_a, float *vec_b, float *vec_c, int n) {
float a = vec_a[0];
float b = vec_b[0];
float c = 0.0;
__shared__ float myblockA[N];
__shared__ float myblockB[N];
__shared__ float myblockC[N];

#pragma unroll
for(int i = 0; i < N; i++){
myblockA[i] = vec_a[off+i];
myblockB[i] = vec_b[off+i];
myblockC[i] = vec_a[off+i];
}

__syncthreads();

#pragma unroll 2000
#pragma unroll 32
for(long i=0; i < BENCHMARK_GPU_ITERS; i++) {
c = (c * a) + b;
#pragma unroll
for(int j = 0; j < N; j++){
myblockC[j] = (myblockC[j] * myblockA[j]) + myblockB[j];
}
}

#pragma unroll
for(int i = 0; i < N; i++){
vec_c[off+i] = myblockC[i];
}

vec_c[0] = c;
}

0 comments on commit f5633bc

Please sign in to comment.