From 5a988c10a5eda3b82d98615e895e64a5f70253a1 Mon Sep 17 00:00:00 2001
From: Elias Konstantinidis <ekondis@gmail.com>
Date: Wed, 26 Jul 2017 23:37:53 +0300
Subject: [PATCH] Added FP16 support to HIP/alt variant

---
 mix_kernels_hip.cpp | 98 ++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 45 deletions(-)

diff --git a/mix_kernels_hip.cpp b/mix_kernels_hip.cpp
index 22e5298..e0db25d 100644
--- a/mix_kernels_hip.cpp
+++ b/mix_kernels_hip.cpp
@@ -4,7 +4,8 @@
  * Contact: Elias Konstantinidis <ekondis@gmail.com>
  **/
 
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
 #include <stdio.h>
 
 #ifdef __CUDACC__
@@ -15,6 +16,7 @@
 #define GPU_INF(_T) std::numeric_limits<_T>::infinity()
 #endif
 
+typedef __half2 half2;
 
 #include "lhiputil.h"
 
@@ -24,56 +26,53 @@
 
 #define UNROLLED_MEMORY_ACCESSES (UNROLL_ITERATIONS/2)
 
-template <class T>
-class functor_mad{
-    public:
-    __device__ T operator()(T a, T b, T c){
-        return a * b + c;
-    }
-};
+template <class T>
+inline __device__ T mad(const T& a, const T& b, const T& c){ return a*b+c; }
 
 template<>
-class functor_mad<double>{
-    public:
-    __device__ double operator()(double a, double b, double c){
-        return fma(a, b, c);
-    }
-};
+inline __device__ double mad(const double& a, const double& b, const double& c){ return fma(a, b, c); }
+
+template<>
+inline __device__ half2 mad(const half2& a, const half2& b, const half2& c){ return __hfma2(a, b, c); }
+
+template <class T>
+inline __device__ bool is_equal(const T& a, const T& b){ return a == b; }
 
-template <class T, int blockdim, int memory_ratio>
+template<>
+inline __device__ bool is_equal(const half2& a, const half2& b){ return __hbeq2(a, b); }
+
+template <class T, int blockSize, int memory_ratio>
 __global__ void
 benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
-    functor_mad<T> mad_op;
-    const int index_stride = blockdim;
-    const int index_base = hipBlockIdx_x*blockdim*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
-    const int halfarraysize = hipGridDim_x*blockdim*UNROLLED_MEMORY_ACCESSES;
+    const int index_base = hipBlockIdx_x*blockSize*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
+    const int halfarraysize = hipGridDim_x*blockSize*UNROLLED_MEMORY_ACCESSES;
     const int offset_slips = 1+UNROLLED_MEMORY_ACCESSES-((memory_ratio+1)/2);
-    const int array_index_bound = index_base+offset_slips*index_stride;
+    const int array_index_bound = index_base+offset_slips*blockSize;
     const int initial_index_range = memory_ratio>0 ? UNROLLED_MEMORY_ACCESSES % ((memory_ratio+1)/2) : 1;
     int initial_index_factor = 0;
     volatile T *data = g_data;
 
     int array_index = index_base;
-    T r0 = seed + hipBlockIdx_x * blockdim + hipThreadIdx_x,
-      r1 = r0+(T)(2),
-      r2 = r0+(T)(3),
-      r3 = r0+(T)(5),
-      r4 = r0+(T)(7),
-      r5 = r0+(T)(11),
-      r6 = r0+(T)(13),
-      r7 = r0+(T)(17);
+    T r0 = seed + hipBlockIdx_x * blockSize + hipThreadIdx_x,
+      r1 = r0+static_cast<T>(2),
+      r2 = r0+static_cast<T>(3),
+      r3 = r0+static_cast<T>(5),
+      r4 = r0+static_cast<T>(7),
+      r5 = r0+static_cast<T>(11),
+      r6 = r0+static_cast<T>(13),
+      r7 = r0+static_cast<T>(17);
 
     for(int j=0; j<COMP_ITERATIONS; j+=UNROLL_ITERATIONS){
         #pragma unroll
         for(int i=0; i<UNROLL_ITERATIONS-memory_ratio; i++){
-            r0 = mad_op(r0, r0, r4);
-            r1 = mad_op(r1, r1, r5);
-            r2 = mad_op(r2, r2, r6);
-            r3 = mad_op(r3, r3, r7);
-            r4 = mad_op(r4, r4, r0);
-            r5 = mad_op(r5, r5, r1);
-            r6 = mad_op(r6, r6, r2);
-            r7 = mad_op(r7, r7, r3);
+            r0 = mad<T>(r0, r0, r4);
+            r1 = mad<T>(r1, r1, r5);
+            r2 = mad<T>(r2, r2, r6);
+            r3 = mad<T>(r3, r3, r7);
+            r4 = mad<T>(r4, r4, r0);
+            r5 = mad<T>(r5, r5, r1);
+            r6 = mad<T>(r6, r6, r2);
+            r7 = mad<T>(r7, r7, r3);
         }
         bool do_write = true;
         int reg_idx = 0;
@@ -87,18 +86,18 @@ benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
                 r = data[ array_index ];
             if( ++reg_idx>=REGBLOCK_SIZE )
                 reg_idx = 0;
-            array_index += index_stride;
+            array_index += blockSize;
         }
         do_write = !do_write;
     }
     if( array_index >= array_index_bound ){
         if( ++initial_index_factor > initial_index_range)
             initial_index_factor = 0;
-        array_index = index_base + initial_index_factor*index_stride;
+        array_index = index_base + initial_index_factor*blockSize;
     }
-    if( (r0==GPU_INF(T)) && (r1==GPU_INF(T)) && (r2==GPU_INF(T)) && (r3==GPU_INF(T)) &&
-        (r4==GPU_INF(T)) && (r5==GPU_INF(T)) && (r6==GPU_INF(T)) && (r7==GPU_INF(T)) ){ // extremely unlikely to happen
+    if( is_equal(r0, GPU_INF(T)) && is_equal(r1, GPU_INF(T)) && is_equal(r2, GPU_INF(T)) && is_equal(r3, GPU_INF(T)) &&
+        is_equal(r4, GPU_INF(T)) && is_equal(r5, GPU_INF(T)) && is_equal(r6, GPU_INF(T)) && is_equal(r7, GPU_INF(T)) ){ // extremely unlikely to happen
         g_data[0] = r0+r1+r2+r3+r4+r5+r6+r7;
     }
 }
 
@@ -158,6 +157,11 @@ void runbench(double *cd, long size){
     hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd);
     float kernel_time_mad_dp = finalizeEvents(start, stop);
 
+    initializeEvents(&start, &stop);
+    half2 h_ones(1.0f);
+    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< half2, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, h_ones, (half2*)cd);
+    float kernel_time_mad_hp = finalizeEvents(start, stop);
+
     initializeEvents(&start, &stop);
     hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd);
     float kernel_time_mad_int = finalizeEvents(start, stop);
@@ -165,7 +169,7 @@
     const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS;
     const double computations_ratio = 1.0-memaccesses_ratio;
 
-    printf("    %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
+    printf("    %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
         UNROLL_ITERATIONS-memory_ratio,
         (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)),
         kernel_time_mad_sp,
@@ -175,6 +179,10 @@
         kernel_time_mad_dp,
         (computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000),
         (memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.),
+        (computations_ratio*(double)2*computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(half2)),
+        kernel_time_mad_hp,
+        (computations_ratio*(double)2*computations)/kernel_time_mad_hp*1000./(double)(1000*1000*1000),
+        (memaccesses_ratio*(double)memoryoperations*sizeof(half2))/kernel_time_mad_hp*1000./(1000.*1000.*1000.),
         (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)),
         kernel_time_mad_int,
         (computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000),
@@ -194,9 +202,9 @@ extern "C" void mixbenchGPU(double *c, long size){
     // Synchronize in order to wait for memory operations to finish
     CUDA_SAFE_CALL( hipDeviceSynchronize() );
 
-    printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
-    printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Integer operations,,, \n");
-    printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");
+    printf("----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------\n");
+    printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Half precision ops,,,,                Integer operations,,, \n");
+    printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");
 
     runbench_warmup(cd, size);
 
@@ -234,7 +242,7 @@ extern "C" void mixbenchGPU(double *c, long size){
     runbench<1>(cd, size);
     runbench<0>(cd, size);
 
-    printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
+    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
 
     // Copy results back to host memory
     CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) );
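
Note (illustration only, not part of the patch): each half2 packs two FP16 lanes, so one __hfma2 performs two multiply-adds per instruction; that is why the half-precision Flops/byte and GFLOPS terms above scale computations by 2 while the byte count still uses sizeof(half2), which equals sizeof(float). The standalone sketch below shows the packed-FMA semantics that the new mad<half2> specialization relies on. It is a minimal sketch under these assumptions: the file and kernel names are made up, it uses the newer hipLaunchKernelGGL launch API rather than the hipLaunchKernel/hipLaunchParm form used in this file, and it needs a GPU with native FP16 arithmetic (e.g. sm_53 or newer when compiled through the CUDA backend).

// hfma2_demo.cpp -- standalone illustration, not part of mixbench
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdio>

typedef __half2 half2;

// One packed FMA: a*b+c is evaluated on both 16-bit lanes at once,
// i.e. two multiply-adds (four half-precision FLOPs) per instruction.
__global__ void hfma2_demo(half2 a, half2 b, half2 c, half2 *out){
    out[0] = __hfma2(a, b, c);
}

int main(){
    half2 *d_out;
    hipMalloc(&d_out, sizeof(half2));
    // __float2half2_rn replicates one float into both lanes of a half2
    const half2 a = __float2half2_rn(2.0f);
    const half2 b = __float2half2_rn(3.0f);
    const half2 c = __float2half2_rn(1.0f);
    hipLaunchKernelGGL(hfma2_demo, dim3(1), dim3(1), 0, 0, a, b, c, d_out);
    half2 r;
    hipMemcpy(&r, d_out, sizeof(half2), hipMemcpyDeviceToHost);
    // Both lanes should print 7.0 (= 2*3+1)
    printf("lo=%f hi=%f\n", __low2float(r), __high2float(r));
    hipFree(d_out);
    return 0;
}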