From 5a988c10a5eda3b82d98615e895e64a5f70253a1 Mon Sep 17 00:00:00 2001
From: Elias Konstantinidis <ekondis@gmail.com>
Date: Wed, 26 Jul 2017 23:37:53 +0300
Subject: [PATCH] Added FP16 support to HIP/alt variant

---
 mix_kernels_hip.cpp | 98 ++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 45 deletions(-)

diff --git a/mix_kernels_hip.cpp b/mix_kernels_hip.cpp
index 22e5298..e0db25d 100644
--- a/mix_kernels_hip.cpp
+++ b/mix_kernels_hip.cpp
@@ -4,7 +4,8 @@
  * Contact: Elias Konstantinidis <ekondis@gmail.com>
  **/
 
-#include "hip/hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
 #include <stdio.h>
 
 #ifdef __CUDACC__
@@ -15,6 +16,7 @@
 #define GPU_INF(_T) std::numeric_limits<_T>::infinity()
 #endif
 
+typedef __half2 half2;
 
 #include "lhiputil.h"
 
@@ -24,56 +26,53 @@
 
 #define UNROLLED_MEMORY_ACCESSES (UNROLL_ITERATIONS/2)
 
-template <class T>
-class functor_mad{
-    public:
-    __device__ T operator()(T a, T b, T c){
-        return a * b + c;
-    }
-};
+template <class T>
+inline __device__ T mad(const T& a, const T& b, const T& c){ return a*b+c; }
 
 template<>
-class functor_mad<double>{
-    public:
-    __device__ double operator()(double a, double b, double c){
-        return fma(a, b, c);
-    }
-};
+inline __device__ double mad(const double& a, const double& b, const double& c){ return fma(a, b, c); }
+
+template<>
+inline __device__ half2 mad(const half2& a, const half2& b, const half2& c){ return __hfma2(a, b, c); }
+
+template <class T>
+inline __device__ bool is_equal(const T& a, const T& b){ return a == b; }
 
-template <class T, int blockdim, int memory_ratio>
+template<>
+inline __device__ bool is_equal(const half2& a, const half2& b){ return __hbeq2(a, b); }
+
+template <class T, int blockSize, int memory_ratio>
 __global__ void
 benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
-    functor_mad<T> mad_op;
-    const int index_stride = blockdim;
-    const int index_base = hipBlockIdx_x*blockdim*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
-    const int halfarraysize = hipGridDim_x*blockdim*UNROLLED_MEMORY_ACCESSES;
+    const int index_base = hipBlockIdx_x*blockSize*UNROLLED_MEMORY_ACCESSES + hipThreadIdx_x;
+    const int halfarraysize = hipGridDim_x*blockSize*UNROLLED_MEMORY_ACCESSES;
     const int offset_slips = 1+UNROLLED_MEMORY_ACCESSES-((memory_ratio+1)/2);
-    const int array_index_bound = index_base+offset_slips*index_stride;
+    const int array_index_bound = index_base+offset_slips*blockSize;
     const int initial_index_range = memory_ratio>0 ? UNROLLED_MEMORY_ACCESSES % ((memory_ratio+1)/2) : 1;
     int initial_index_factor = 0;
     volatile T *data = g_data;
 
     int array_index = index_base;
-    T r0 = seed + hipBlockIdx_x * blockdim + hipThreadIdx_x,
-      r1 = r0+(T)(2),
-      r2 = r0+(T)(3),
-      r3 = r0+(T)(5),
-      r4 = r0+(T)(7),
-      r5 = r0+(T)(11),
-      r6 = r0+(T)(13),
-      r7 = r0+(T)(17);
+    T r0 = seed + hipBlockIdx_x * blockSize + hipThreadIdx_x,
+      r1 = r0+static_cast<T>(2),
+      r2 = r0+static_cast<T>(3),
+      r3 = r0+static_cast<T>(5),
+      r4 = r0+static_cast<T>(7),
+      r5 = r0+static_cast<T>(11),
+      r6 = r0+static_cast<T>(13),
+      r7 = r0+static_cast<T>(17);
 
     for(int j=0; j<COMP_ITERATIONS; j+=UNROLL_ITERATIONS){
         #pragma unroll
         for(int i=0; i<UNROLL_ITERATIONS-memory_ratio; i++){
-            r0 = mad_op(r0, r0, r4);
-            r1 = mad_op(r1, r1, r5);
-            r2 = mad_op(r2, r2, r6);
-            r3 = mad_op(r3, r3, r7);
-            r4 = mad_op(r4, r4, r0);
-            r5 = mad_op(r5, r5, r1);
-            r6 = mad_op(r6, r6, r2);
-            r7 = mad_op(r7, r7, r3);
+            r0 = mad<T>(r0, r0, r4);
+            r1 = mad<T>(r1, r1, r5);
+            r2 = mad<T>(r2, r2, r6);
+            r3 = mad<T>(r3, r3, r7);
+            r4 = mad<T>(r4, r4, r0);
+            r5 = mad<T>(r5, r5, r1);
+            r6 = mad<T>(r6, r6, r2);
+            r7 = mad<T>(r7, r7, r3);
         }
         bool do_write = true;
         int reg_idx = 0;
@@ -87,18 +86,18 @@ benchmark_func(hipLaunchParm lp, T seed, volatile T *g_data){
                 r = data[ array_index ];
             if( ++reg_idx>=REGBLOCK_SIZE )
                 reg_idx = 0;
-            array_index += index_stride;
+            array_index += blockSize;
         }
         do_write = !do_write;
     }
     if( array_index >= array_index_bound ){
         if( ++initial_index_factor > initial_index_range)
             initial_index_factor = 0;
-        array_index = index_base + initial_index_factor*index_stride;
+        array_index = index_base + initial_index_factor*blockSize;
     }
-    if( (r0==GPU_INF(T)) && (r1==GPU_INF(T)) && (r2==GPU_INF(T)) && (r3==GPU_INF(T)) &&
-        (r4==GPU_INF(T)) && (r5==GPU_INF(T)) && (r6==GPU_INF(T)) && (r7==GPU_INF(T)) ){ // extremely unlikely to happen
+    if( is_equal(r0, GPU_INF(T)) && is_equal(r1, GPU_INF(T)) && is_equal(r2, GPU_INF(T)) && is_equal(r3, GPU_INF(T)) &&
+        is_equal(r4, GPU_INF(T)) && is_equal(r5, GPU_INF(T)) && is_equal(r6, GPU_INF(T)) && is_equal(r7, GPU_INF(T)) ){ // extremely unlikely to happen
         g_data[0] = r0+r1+r2+r3+r4+r5+r6+r7;
     }
 }
 
@@ -158,6 +157,11 @@ void runbench(double *cd, long size){
     hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd);
     float kernel_time_mad_dp = finalizeEvents(start, stop);
 
+    initializeEvents(&start, &stop);
+    half2 h_ones(1.0f);
+    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< half2, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, h_ones, (half2*)cd);
+    float kernel_time_mad_hp = finalizeEvents(start, stop);
+
     initializeEvents(&start, &stop);
     hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd);
     float kernel_time_mad_int = finalizeEvents(start, stop);
@@ -165,7 +169,7 @@
     const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS;
     const double computations_ratio = 1.0-memaccesses_ratio;
 
-    printf("    %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
+    printf("    %4d,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,   %8.3f,%8.2f,%8.2f,%7.2f,  %8.3f,%8.2f,%8.2f,%7.2f\n",
         UNROLL_ITERATIONS-memory_ratio,
         (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)),
         kernel_time_mad_sp,
@@ -175,6 +179,10 @@
         kernel_time_mad_dp,
         (computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000),
         (memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.),
+        (computations_ratio*(double)2*computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(half2)),
+        kernel_time_mad_hp,
+        (computations_ratio*(double)2*computations)/kernel_time_mad_hp*1000./(double)(1000*1000*1000),
+        (memaccesses_ratio*(double)memoryoperations*sizeof(half2))/kernel_time_mad_hp*1000./(1000.*1000.*1000.),
         (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)),
         kernel_time_mad_int,
         (computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000),
@@ -194,9 +202,9 @@ extern "C" void mixbenchGPU(double *c, long size){
     // Synchronize in order to wait for memory operations to finish
     CUDA_SAFE_CALL( hipDeviceSynchronize() );
 
-    printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
-    printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Integer operations,,, \n");
-    printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");
+    printf("----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------\n");
+    printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Half precision ops,,,,                Integer operations,,, \n");
+    printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");
 
     runbench_warmup(cd, size);
 
@@ -234,7 +242,7 @@ extern "C" void mixbenchGPU(double *c, long size){
     runbench<1>(cd, size);
     runbench<0>(cd, size);
 
-    printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
+    printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------\n");
 
     // Copy results back to host memory
     CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) );
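
Note (illustration only, not part of the patch): each half2 packs two FP16 lanes, so one __hfma2 performs two multiply-adds per instruction; that is why the half-precision Flops/byte and GFLOPS terms above scale computations by 2 while the byte count still uses sizeof(half2), which equals sizeof(float). The standalone sketch below shows the packed-FMA semantics that the new mad<half2> specialization relies on. It is a minimal sketch under these assumptions: the file and kernel names are made up, it uses the newer hipLaunchKernelGGL launch API rather than the hipLaunchKernel/hipLaunchParm form used in this file, and it needs a GPU with native FP16 arithmetic (e.g. sm_53 or newer when compiled through the CUDA backend).

// hfma2_demo.cpp -- standalone illustration, not part of mixbench
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdio>

typedef __half2 half2;

// One packed FMA: a*b+c is evaluated on both 16-bit lanes at once,
// i.e. two multiply-adds (four half-precision FLOPs) per instruction.
__global__ void hfma2_demo(half2 a, half2 b, half2 c, half2 *out){
    out[0] = __hfma2(a, b, c);
}

int main(){
    half2 *d_out;
    hipMalloc(&d_out, sizeof(half2));
    // __float2half2_rn replicates one float into both lanes of a half2
    const half2 a = __float2half2_rn(2.0f);
    const half2 b = __float2half2_rn(3.0f);
    const half2 c = __float2half2_rn(1.0f);
    hipLaunchKernelGGL(hfma2_demo, dim3(1), dim3(1), 0, 0, a, b, c, d_out);
    half2 r;
    hipMemcpy(&r, d_out, sizeof(half2), hipMemcpyDeviceToHost);
    // Both lanes should print 7.0 (= 2*3+1)
    printf("lo=%f hi=%f\n", __low2float(r), __high2float(r));
    hipFree(d_out);
    return 0;
}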