-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added FP16 support to HIP/alt variant
- Loading branch information
Showing
1 changed file
with
53 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,8 @@ | |
* Contact: Elias Konstantinidis <[email protected]> | ||
**/ | ||
|
||
#include "hip/hip_runtime.h" | ||
#include <hip/hip_runtime.h> | ||
#include <hip/hip_fp16.h> | ||
#include <stdio.h> | ||
|
||
#ifdef __CUDACC__ | ||
|
@@ -15,6 +16,7 @@ | |
#define GPU_INF(_T) std::numeric_limits<_T>::infinity() | ||
#endif | ||
|
||
typedef __half2 half2; | ||
|
||
#include "lhiputil.h" | ||
|
||
|
@@ -24,56 +26,53 @@ | |
|
||
#define UNROLLED_MEMORY_ACCESSES (UNROLL_ITERATIONS/2) | ||
|
||
template <class T> | ||
class functor_mad{ | ||
public: | ||
__device__ T operator()(T a, T b, T c){ | ||
return a * b + c; | ||
} | ||
}; | ||
template<class T> | ||
inline __device__ T mad(const T& a, const T& b, const T& c){ return a*b+c; } | ||
|
||
template<> | ||
class functor_mad<double>{ | ||
public: | ||
__device__ double operator()(double a, double b, double c){ | ||
return fma(a, b, c); | ||
} | ||
}; | ||
inline __device__ double mad(const double& a, const double& b, const double& c){ return fma(a, b, c); } | ||
|
||
template<> | ||
inline __device__ half2 mad(const half2& a, const half2& b, const half2& c){ return __hfma2(a, b, c); } | ||
|
||
template<class T> | ||
inline __device__ bool is_equal(const T& a, const T& b){ return a == b; } | ||
|
||
template <class T, int blockdim, int memory_ratio> | ||
template<> | ||
inline __device__ bool is_equal(const half2& a, const half2& b){ return __hbeq2(a, b); } | ||
|
||
template <class T, int blockSize, int memory_ratio> | ||
__global__ void | ||
benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){ | ||
functor_mad<T> mad_op; | ||
const int index_stride = blockdim; | ||
const int index_base = hipBlockIdx_x*blockdim*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x; | ||
const int halfarraysize = hipGridDim_x*blockdim*UNROLLED_MEMORY_ACCESSES; | ||
const int index_base = hipBlockIdx_x*blockSize*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x; | ||
const int halfarraysize = hipGridDim_x*blockSize*UNROLLED_MEMORY_ACCESSES; | ||
const int offset_slips = 1+UNROLLED_MEMORY_ACCESSES-((memory_ratio+1)/2); | ||
const int array_index_bound = index_base+offset_slips*index_stride; | ||
const int array_index_bound = index_base+offset_slips*blockSize; | ||
const int initial_index_range = memory_ratio>0 ? UNROLLED_MEMORY_ACCESSES % ((memory_ratio+1)/2) : 1; | ||
int initial_index_factor = 0; | ||
volatile T *data = g_data; | ||
|
||
int array_index = index_base; | ||
T r0 = seed + hipBlockIdx_x * blockdim + hipThreadIdx_x, | ||
r1 = r0+(T)(2), | ||
r2 = r0+(T)(3), | ||
r3 = r0+(T)(5), | ||
r4 = r0+(T)(7), | ||
r5 = r0+(T)(11), | ||
r6 = r0+(T)(13), | ||
r7 = r0+(T)(17); | ||
T r0 = seed + hipBlockIdx_x * blockSize + hipThreadIdx_x, | ||
r1 = r0+static_cast<T>(2), | ||
r2 = r0+static_cast<T>(3), | ||
r3 = r0+static_cast<T>(5), | ||
r4 = r0+static_cast<T>(7), | ||
r5 = r0+static_cast<T>(11), | ||
r6 = r0+static_cast<T>(13), | ||
r7 = r0+static_cast<T>(17); | ||
|
||
for(int j=0; j<COMP_ITERATIONS; j+=UNROLL_ITERATIONS){ | ||
#pragma unroll | ||
for(int i=0; i<UNROLL_ITERATIONS-memory_ratio; i++){ | ||
r0 = mad_op(r0, r0, r4); | ||
r1 = mad_op(r1, r1, r5); | ||
r2 = mad_op(r2, r2, r6); | ||
r3 = mad_op(r3, r3, r7); | ||
r4 = mad_op(r4, r4, r0); | ||
r5 = mad_op(r5, r5, r1); | ||
r6 = mad_op(r6, r6, r2); | ||
r7 = mad_op(r7, r7, r3); | ||
r0 = mad<T>(r0, r0, r4); | ||
r1 = mad<T>(r1, r1, r5); | ||
r2 = mad<T>(r2, r2, r6); | ||
r3 = mad<T>(r3, r3, r7); | ||
r4 = mad<T>(r4, r4, r0); | ||
r5 = mad<T>(r5, r5, r1); | ||
r6 = mad<T>(r6, r6, r2); | ||
r7 = mad<T>(r7, r7, r3); | ||
} | ||
bool do_write = true; | ||
int reg_idx = 0; | ||
|
@@ -87,18 +86,18 @@ benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){ | |
r = data[ array_index ]; | ||
if( ++reg_idx>=REGBLOCK_SIZE ) | ||
reg_idx = 0; | ||
array_index += index_stride; | ||
array_index += blockSize; | ||
} | ||
do_write = !do_write; | ||
} | ||
if( array_index >= array_index_bound ){ | ||
if( ++initial_index_factor > initial_index_range) | ||
initial_index_factor = 0; | ||
array_index = index_base + initial_index_factor*index_stride; | ||
array_index = index_base + initial_index_factor*blockSize; | ||
} | ||
} | ||
if( (r0==GPU_INF(T)) && (r1==GPU_INF(T)) && (r2==GPU_INF(T)) && (r3==GPU_INF(T)) && | ||
(r4==GPU_INF(T)) && (r5==GPU_INF(T)) && (r6==GPU_INF(T)) && (r7==GPU_INF(T)) ){ // extremely unlikely to happen | ||
if( is_equal(r0, GPU_INF(T)) && is_equal(r1, GPU_INF(T)) && is_equal(r2, GPU_INF(T)) && is_equal(r3, GPU_INF(T)) && | ||
is_equal(r4, GPU_INF(T)) && is_equal(r5, GPU_INF(T)) && is_equal(r6, GPU_INF(T)) && is_equal(r7, GPU_INF(T)) ){ // extremely unlikely to happen | ||
g_data[0] = r0+r1+r2+r3+r4+r5+r6+r7; | ||
} | ||
} | ||
|
@@ -158,14 +157,19 @@ void runbench(double *cd, long size){ | |
hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd); | ||
float kernel_time_mad_dp = finalizeEvents(start, stop); | ||
|
||
initializeEvents(&start, &stop); | ||
half2 h_ones(1.0f); | ||
hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< half2, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, h_ones, (half2*)cd); | ||
float kernel_time_mad_hp = finalizeEvents(start, stop); | ||
|
||
initializeEvents(&start, &stop); | ||
hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd); | ||
float kernel_time_mad_int = finalizeEvents(start, stop); | ||
|
||
const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS; | ||
const double computations_ratio = 1.0-memaccesses_ratio; | ||
|
||
printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", | ||
printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", | ||
UNROLL_ITERATIONS-memory_ratio, | ||
(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)), | ||
kernel_time_mad_sp, | ||
|
@@ -175,6 +179,10 @@ void runbench(double *cd, long size){ | |
kernel_time_mad_dp, | ||
(computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), | ||
(memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), | ||
(computations_ratio*(double)2*computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(half2)), | ||
kernel_time_mad_hp, | ||
(computations_ratio*(double)2*computations)/kernel_time_mad_hp*1000./(double)(1000*1000*1000), | ||
(memaccesses_ratio*(double)memoryoperations*sizeof(half2))/kernel_time_mad_hp*1000./(1000.*1000.*1000.), | ||
(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)), | ||
kernel_time_mad_int, | ||
(computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), | ||
|
@@ -194,9 +202,9 @@ extern "C" void mixbenchGPU(double *c, long size){ | |
// Synchronize in order to wait for memory operations to finish | ||
CUDA_SAFE_CALL( hipDeviceSynchronize() ); | ||
|
||
printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); | ||
printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); | ||
printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); | ||
printf("----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------\n"); | ||
printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, \n"); | ||
printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); | ||
|
||
runbench_warmup(cd, size); | ||
|
||
|
@@ -234,7 +242,7 @@ extern "C" void mixbenchGPU(double *c, long size){ | |
runbench<1>(cd, size); | ||
runbench<0>(cd, size); | ||
|
||
printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); | ||
printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); | ||
|
||
// Copy results back to host memory | ||
CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) ); | ||
|