Skip to content

Commit

Permalink
Added FP16 support to HIP/alt variant
Browse files Browse the repository at this point in the history
  • Loading branch information
ekondis committed Jul 26, 2017
1 parent 6b70fcc commit 5a988c1
Showing 1 changed file with 53 additions and 45 deletions.
98 changes: 53 additions & 45 deletions mix_kernels_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
* Contact: Elias Konstantinidis <[email protected]>
**/

#include "hip/hip_runtime.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>

#ifdef __CUDACC__
Expand All @@ -15,6 +16,7 @@
#define GPU_INF(_T) std::numeric_limits<_T>::infinity()
#endif

typedef __half2 half2;

#include "lhiputil.h"

Expand All @@ -24,56 +26,53 @@

#define UNROLLED_MEMORY_ACCESSES (UNROLL_ITERATIONS/2)

template <class T>
class functor_mad{
public:
__device__ T operator()(T a, T b, T c){
return a * b + c;
}
};
template<class T>
inline __device__ T mad(const T& a, const T& b, const T& c){ return a*b+c; }

template<>
class functor_mad<double>{
public:
__device__ double operator()(double a, double b, double c){
return fma(a, b, c);
}
};
inline __device__ double mad(const double& a, const double& b, const double& c){ return fma(a, b, c); }

template<>
inline __device__ half2 mad(const half2& a, const half2& b, const half2& c){ return __hfma2(a, b, c); }

template<class T>
inline __device__ bool is_equal(const T& a, const T& b){ return a == b; }

template <class T, int blockdim, int memory_ratio>
template<>
inline __device__ bool is_equal(const half2& a, const half2& b){ return __hbeq2(a, b); }

template <class T, int blockSize, int memory_ratio>
__global__ void
benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
functor_mad<T> mad_op;
const int index_stride = blockdim;
const int index_base = hipBlockIdx_x*blockdim*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
const int halfarraysize = hipGridDim_x*blockdim*UNROLLED_MEMORY_ACCESSES;
const int index_base = hipBlockIdx_x*blockSize*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
const int halfarraysize = hipGridDim_x*blockSize*UNROLLED_MEMORY_ACCESSES;
const int offset_slips = 1+UNROLLED_MEMORY_ACCESSES-((memory_ratio+1)/2);
const int array_index_bound = index_base+offset_slips*index_stride;
const int array_index_bound = index_base+offset_slips*blockSize;
const int initial_index_range = memory_ratio>0 ? UNROLLED_MEMORY_ACCESSES % ((memory_ratio+1)/2) : 1;
int initial_index_factor = 0;
volatile T *data = g_data;

int array_index = index_base;
T r0 = seed + hipBlockIdx_x * blockdim + hipThreadIdx_x,
r1 = r0+(T)(2),
r2 = r0+(T)(3),
r3 = r0+(T)(5),
r4 = r0+(T)(7),
r5 = r0+(T)(11),
r6 = r0+(T)(13),
r7 = r0+(T)(17);
T r0 = seed + hipBlockIdx_x * blockSize + hipThreadIdx_x,
r1 = r0+static_cast<T>(2),
r2 = r0+static_cast<T>(3),
r3 = r0+static_cast<T>(5),
r4 = r0+static_cast<T>(7),
r5 = r0+static_cast<T>(11),
r6 = r0+static_cast<T>(13),
r7 = r0+static_cast<T>(17);

for(int j=0; j<COMP_ITERATIONS; j+=UNROLL_ITERATIONS){
#pragma unroll
for(int i=0; i<UNROLL_ITERATIONS-memory_ratio; i++){
r0 = mad_op(r0, r0, r4);
r1 = mad_op(r1, r1, r5);
r2 = mad_op(r2, r2, r6);
r3 = mad_op(r3, r3, r7);
r4 = mad_op(r4, r4, r0);
r5 = mad_op(r5, r5, r1);
r6 = mad_op(r6, r6, r2);
r7 = mad_op(r7, r7, r3);
r0 = mad<T>(r0, r0, r4);
r1 = mad<T>(r1, r1, r5);
r2 = mad<T>(r2, r2, r6);
r3 = mad<T>(r3, r3, r7);
r4 = mad<T>(r4, r4, r0);
r5 = mad<T>(r5, r5, r1);
r6 = mad<T>(r6, r6, r2);
r7 = mad<T>(r7, r7, r3);
}
bool do_write = true;
int reg_idx = 0;
Expand All @@ -87,18 +86,18 @@ benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
r = data[ array_index ];
if( ++reg_idx>=REGBLOCK_SIZE )
reg_idx = 0;
array_index += index_stride;
array_index += blockSize;
}
do_write = !do_write;
}
if( array_index >= array_index_bound ){
if( ++initial_index_factor > initial_index_range)
initial_index_factor = 0;
array_index = index_base + initial_index_factor*index_stride;
array_index = index_base + initial_index_factor*blockSize;
}
}
if( (r0==GPU_INF(T)) && (r1==GPU_INF(T)) && (r2==GPU_INF(T)) && (r3==GPU_INF(T)) &&
(r4==GPU_INF(T)) && (r5==GPU_INF(T)) && (r6==GPU_INF(T)) && (r7==GPU_INF(T)) ){ // extremely unlikely to happen
if( is_equal(r0, GPU_INF(T)) && is_equal(r1, GPU_INF(T)) && is_equal(r2, GPU_INF(T)) && is_equal(r3, GPU_INF(T)) &&
is_equal(r4, GPU_INF(T)) && is_equal(r5, GPU_INF(T)) && is_equal(r6, GPU_INF(T)) && is_equal(r7, GPU_INF(T)) ){ // extremely unlikely to happen
g_data[0] = r0+r1+r2+r3+r4+r5+r6+r7;
}
}
Expand Down Expand Up @@ -158,14 +157,19 @@ void runbench(double *cd, long size){
hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd);
float kernel_time_mad_dp = finalizeEvents(start, stop);

initializeEvents(&start, &stop);
half2 h_ones(1.0f);
hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< half2, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, h_ones, (half2*)cd);
float kernel_time_mad_hp = finalizeEvents(start, stop);

initializeEvents(&start, &stop);
hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd);
float kernel_time_mad_int = finalizeEvents(start, stop);

const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS;
const double computations_ratio = 1.0-memaccesses_ratio;

printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n",
printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n",
UNROLL_ITERATIONS-memory_ratio,
(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)),
kernel_time_mad_sp,
Expand All @@ -175,6 +179,10 @@ void runbench(double *cd, long size){
kernel_time_mad_dp,
(computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000),
(memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.),
(computations_ratio*(double)2*computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(half2)),
kernel_time_mad_hp,
(computations_ratio*(double)2*computations)/kernel_time_mad_hp*1000./(double)(1000*1000*1000),
(memaccesses_ratio*(double)memoryoperations*sizeof(half2))/kernel_time_mad_hp*1000./(1000.*1000.*1000.),
(computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)),
kernel_time_mad_int,
(computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000),
Expand All @@ -194,9 +202,9 @@ extern "C" void mixbenchGPU(double *c, long size){
// Synchronize in order to wait for memory operations to finish
CUDA_SAFE_CALL( hipDeviceSynchronize() );

printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n");
printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n");
printf("----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------\n");
printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, \n");
printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n");

runbench_warmup(cd, size);

Expand Down Expand Up @@ -234,7 +242,7 @@ extern "C" void mixbenchGPU(double *c, long size){
runbench<1>(cd, size);
runbench<0>(cd, size);

printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");

// Copy results back to host memory
CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) );
Expand Down

0 comments on commit 5a988c1

Please sign in to comment.