From 82f7e9785ff4e76555b3fdd35aeb23c4245586e9 Mon Sep 17 00:00:00 2001
From: vcampmany
Date: Wed, 12 Jul 2017 22:43:53 +0200
Subject: [PATCH 01/19] first commit

---
 src/gpuarray_sort.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 src/gpuarray_sort.c

diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c
new file mode 100644
index 0000000000..a19c6d59a1
--- /dev/null
+++ b/src/gpuarray_sort.c
@@ -0,0 +1,17 @@
+#include
+
+#include "util/strb.h"
+
+
+int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int numaxes, unsigned int *axes, GpuArray *arg)
+{
+  if (arg != NULL)
+  {
+    // perform argsort
+  }
+  else
+  {
+    // perform regular sort
+  }
+
+}
From b29b11e26b26eb4e0e46dbc2f9a41b2c9336e044 Mon Sep 17 00:00:00 2001
From: vcampmany
Date: Fri, 14 Jul 2017 00:27:52 +0200
Subject: [PATCH 02/19] added sort files

---
 CMakeLists.txt      |   2 +-
 src/CMakeLists.txt  |   2 +
 src/gpuarray/sort.h |  31 +++
 src/gpuarray_sort.c | 491 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 524 insertions(+), 2 deletions(-)
 create mode 100644 src/gpuarray/sort.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6a96e7339..aa5defc384 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 2.8)

 PROJECT(libgpuarray C)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a45db024ff..defb27a4c2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -55,6 +55,7 @@ gpuarray_collectives_cuda_nccl.c
 gpuarray_buffer_opencl.c
 gpuarray_blas_opencl_clblas.c
 gpuarray_blas_opencl_clblast.c
+gpuarray_sort.c
 )

 check_function_exists(strlcat HAVE_STRL)
@@ -126,6 +127,7 @@ set(headers
   gpuarray/kernel.h
   gpuarray/types.h
   gpuarray/util.h
+  gpuarray/sort.h
 )

 install(FILES ${headers} DESTINATION include/gpuarray)

diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h
new file mode 100644
index 0000000000..096bdf6856
--- /dev/null
+++ b/src/gpuarray/sort.h
@@ -0,0 +1,31 @@
+#ifndef GPUARRAY_SORT_H
+#define GPUARRAY_SORT_H
+/** \file sort.h
+ * \brief Sort operations generator.
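+ *
+ * A minimal usage sketch (hypothetical host code; context setup and error
+ * checking are elided, and the arguments follow the prototype declared
+ * below):
+ *
+ *   GpuArray src, dst;            // one-dimensional arrays of uint keys
+ *   unsigned int axis = 0;
+ *   // ... allocate and fill src ...
+ *   GpuArray_sort(&dst, &src, 1, &axis, NULL);  // NULL: plain sort
+ *   // a non-NULL last argument selects the argsort path instead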
+ */ + +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif +#ifdef CONFUSE_EMACS +} +#endif + +#define SHARED_SIZE_LIMIT 1024U +#define SAMPLE_STRIDE 128 + + +int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int numaxes, unsigned int *axes, GpuArray *arg); + + + + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index a19c6d59a1..56767ff706 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -1,17 +1,506 @@ #include +#include +#include +#include + + #include "util/strb.h" +#include "private.h" + +const int flags = GA_USE_CUDA; + +#define NUMARGS_BITONIC_KERNEL 5 +const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_BUFFER, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_bitonic_smem = \ +" __device__ unsigned int readArray(unsigned int *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ +" if (pos >= length) { " \ +" if (sortDir) { " \ +" return 4294967295; " \ +" } " \ +" else { " \ +" return 0; " \ +" } " \ +" } " \ +" else { " \ +" return a[pos]; " \ +" } " \ +" } " \ +" __device__ void writeArray(unsigned int *a, unsigned int pos, unsigned int value, unsigned int length) " \ +" { " \ +" if (pos >= length) " \ +" { " \ +" return; " \ +" } "\ +" else { " \ +" a[pos] = value; " \ +" } "\ +" } " \ +" extern \"C\" __global__ void bitonicSortSharedKernel( "\ +" unsigned int *d_DstKey, "\ +" unsigned int *d_SrcKey, "\ +" unsigned int batchSize, "\ +" unsigned int arrayLength, "\ +" unsigned int sortDir "\ +" ) "\ +" { "\ +" __shared__ unsigned int s_key[1024]; "\ +" s_key[threadIdx.x] = readArray( d_SrcKey, "\ +" blockIdx.x * 1024 + threadIdx.x, "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ +" s_key[threadIdx.x + (1024 / 2)] = readArray( d_SrcKey, "\ +" blockIdx.x * 1024 + threadIdx.x + (1024 / 2), "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ +" for (unsigned int size = 2; size < 1024; size <<= 1) "\ +" { "\ +" unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); "\ +" for (unsigned int stride = size / 2; stride > 0; stride >>= 1) "\ +" { "\ +" __syncthreads(); "\ +" unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ +" unsigned int t; "\ +" if ((s_key[pos] > s_key[pos + stride]) == ddd) { "\ +" t = s_key[pos]; "\ +" s_key[pos] = s_key[pos + stride]; "\ +" s_key[pos + stride] = t; "\ +" } "\ +" } "\ +" } "\ +" { "\ +" for (unsigned int stride = 1024 / 2; stride > 0; stride >>= 1) {" \ +" __syncthreads(); "\ +" unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ +" unsigned int t; "\ +" if ((s_key[pos] > s_key[pos + stride]) == sortDir) {" \ +" t = s_key[pos]; "\ +" s_key[pos] = s_key[pos + stride]; "\ +" s_key[pos + stride] = t; "\ +" } "\ +" } "\ +" } "\ +" __syncthreads(); "\ +" writeArray( d_DstKey, "\ +" blockIdx.x * 1024 + threadIdx.x, "\ +" s_key[threadIdx.x], "\ +" arrayLength * batchSize "\ +" ); "\ +" writeArray( d_DstKey, "\ +" blockIdx.x * 1024 + threadIdx.x + (1024 / 2), "\ +" s_key[threadIdx.x + (1024 / 2)], "\ +" arrayLength * batchSize "\ +" ); "\ +" }\n"; + + +static unsigned int iDivUp(unsigned int a, unsigned int b) +{ + return ((a % b) == 0) ? (a / b) : (a / b + 1); +} + +static unsigned int getSampleCount(unsigned int dividend) +{ + return iDivUp(dividend, SAMPLE_STRIDE); +} + +static unsigned int ceiling(unsigned int n, unsigned int v) +{ + return (!n%v) ? 
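/* note: '!n%v' parses as '(!n) % v', which is 0 for every n > 0, so this
   ternary always takes the round-up branch; presumably the intended test is
   (n % v == 0), i.e.  return (n % v == 0) ? n / v : (n / v) + 1;  */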
n/v : (n/v) + 1; +} + +static void bitonicSortShared( + GpuArray *d_DstKey, + GpuArray *d_SrcKey, + unsigned int batchSize, + unsigned int arrayLength, + unsigned int sortDir, + GpuKernel *k_bitonic, + gpucontext *ctx +) +{ + + int errI; + size_t lens[1] = {strlen(code_bitonic_smem)}; + char *err_str = NULL; + size_t ls, gs; + + void *arguments[NUMARGS_BITONIC_KERNEL]; // = (void**) malloc(sizeof(void *) * NUM_ARGS_KERNEL_1); + + errI = GpuKernel_init( k_bitonic, ctx, 1, + &code_bitonic_smem, lens, "bitonicSortSharedKernel", + NUMARGS_BITONIC_KERNEL, type_args_bitonic, flags, &err_str); + + printf("error kernel init: %s \n", gpuarray_error_str(errI)); + printf("error backend: %s \n", err_str); + + //unsigned int blockCount = batchSize; + //unsigned int blockDim = SHARED_SIZE_LIMIT / 2; + ls = SHARED_SIZE_LIMIT / 2; + gs = 1; + GpuKernel_sched(k_bitonic, (size_t)arrayLength * batchSize, &gs, &ls); + + arguments[0] = (void*)d_DstKey->data; + arguments[1] = (void*)d_SrcKey->data; + arguments[2] = (void*)&batchSize; + arguments[3] = (void*)&arrayLength; + arguments[4] = (void*)&sortDir; + + GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, arguments); + + /*if (sortDir) + { + //bitonicSortSharedKernel<1U><<>>(d_DstKey, d_SrcKey, batchSize, arrayLength); + //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + } + else + { + //bitonicSortSharedKernel<0U><<>>(d_DstKey, d_SrcKey, batchSize, arrayLength); + //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + }*/ +} + +static void generateSampleRanks( + gpudata *d_RanksA, + gpudata *d_RanksB, + GpuArray *d_SrcKey, + unsigned int stride, + unsigned int N, + unsigned int sortDir +) +{ + unsigned int lastSegmentElements = N % (2 * stride); + unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + if (sortDir) + { + //generateSampleRanksKernel<1U><<>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); + //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + } + else + { + //generateSampleRanksKernel<0U><<>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); + //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + } +} + +static void mergeRanksAndIndices( + gpudata *d_LimitsA, + gpudata *d_LimitsB, + gpudata *d_RanksA, + gpudata *d_RanksB, + unsigned int stride, + unsigned int N +) +{ + unsigned int lastSegmentElements = N % (2 * stride); + unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + /*mergeRanksAndIndicesKernel<<>>( + d_LimitsA, + d_RanksA, + stride, + N, + threadCount + ); + printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + + mergeRanksAndIndicesKernel<<>>( + d_LimitsB, + d_RanksB, + stride, + N, + threadCount + ); + printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + */ +} + +static void mergeElementaryIntervals( + GpuArray *d_DstKey, + GpuArray *d_SrcKey, + gpudata *d_LimitsA, + gpudata *d_LimitsB, + unsigned int stride, + unsigned int N, + unsigned int sortDir +) +{ + unsigned int lastSegmentElements = N % (2 * stride); + unsigned int mergePairs = (lastSegmentElements > stride) ? 
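/* the odd tail is longer than stride, so it joins this merge pass and the
   pair count is taken over all N elements */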
getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + + if (sortDir) + { + /* mergeElementaryIntervalsKernel<1U><<>>( + d_DstKey, + d_SrcKey, + d_LimitsA, + d_LimitsB, + stride, + N + ); + printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); */ + } + else + { + /* + mergeElementaryIntervalsKernel<0U><<>>( + d_DstKey, + d_SrcKey, + d_LimitsA, + d_LimitsB, + stride, + N + ); + printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + */ + } +} + +static void mergeLeftMostSegment( + GpuArray *d_DstKey, + GpuArray *d_SrcKey, + unsigned int segmentSizeA, + unsigned int segmentSizeB, + unsigned int N, + unsigned int sortDir +) +{ + unsigned int blockDim = 256; + unsigned int blockCount = ceiling(N, blockDim); + + if (sortDir) + { + //mergeLeftMostSegmentKernel<1U><<>>(d_DstKey, d_SrcKey, segmentSizeA, segmentSizeB, N); + //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + } + else + { + //mergeLeftMostSegmentKernel<0U><<>>(d_DstKey, d_SrcKey, segmentSizeA, segmentSizeB, N); + //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); + } +} +static void sort( + GpuArray *d_DstKey, + GpuArray *d_BufKey, + GpuArray *d_SrcKey, + gpudata *d_RanksA, + gpudata *d_RanksB, + gpudata *d_LimitsA, + gpudata *d_LimitsB, + unsigned int N, + unsigned int Nfloor, + int Nleft, + unsigned int sortDir, + gpucontext *ctx +) +{ + GpuArray *ikey, *okey; + GpuArray *t; // Aux pointer + + GpuKernel k_bitonic; + + unsigned int stageCount = 0; + unsigned int stride; + for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1, stageCount++); + + if (stageCount & 1) + { + ikey = d_BufKey; + okey = d_DstKey; + } + else + { + ikey = d_DstKey; + okey = d_BufKey; + } + + ///////////////////////////////////////////////////////////////////////// + // Sort the array with bitonic sort for arrays shorter than 1024 elements + // Bitonic sort gives better performance than merge sort for short arrays + ///////////////////////////////////////////////////////////////////////// + + if (N <= SHARED_SIZE_LIMIT) + { + + bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, &k_bitonic, ctx); + } + /////////////////////////////////////////////////////////////////////////////// + // Sort the array with merge sort for arrays equal or bigger than 1024 elements + /////////////////////////////////////////////////////////////////////////////// + else + { + unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; + unsigned int arrayLength = SHARED_SIZE_LIMIT; + //mergeSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0U, Nfloor); + bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, &k_bitonic, ctx); + + for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) + { + unsigned int lastSegmentElements = Nfloor % (2 * stride); + + //Find sample ranks and prepare for limiters merge + generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir); + + //Merge ranks and indices + mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor); + + //Merge elementary intervals + mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir); + + if (lastSegmentElements <= stride) + { + //Last merge segment consists of a single array which just needs to be passed through + printf("inside last segment\n"); + // TODO: uncomment and fix sizeof + ////////////////////////////////// + /*cudaMemcpy( okey + (Nfloor - lastSegmentElements), + ikey + (Nfloor - lastSegmentElements), + lastSegmentElements * sizeof(t_key), + cudaMemcpyDeviceToDevice + ); + */ 
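      /* A possible device-to-device tail copy through the buffer API instead
         of cudaMemcpy (a sketch, assuming unsigned int keys; gpudata_move
         takes byte offsets, and 'tailOff' is a hypothetical local):

           size_t tailOff = (Nfloor - lastSegmentElements) * sizeof(unsigned int);
           gpudata_move(okey->data, okey->offset + tailOff,
                        ikey->data, ikey->offset + tailOff,
                        lastSegmentElements * sizeof(unsigned int));
      */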
+ } + // Swap pointers + t = ikey; + ikey = okey; + okey = t; + } + + // If the array is not multiple of 1024, sort the leftmost part + // and perform merge sort of the two last segments + if (Nleft > 0) + { + printf("Sorting Remaining part %d \n", Nleft); + bitonicSortShared(d_SrcKey + Nfloor, d_DstKey + Nfloor, 1, Nleft, sortDir, &k_bitonic, ctx); + + // Copy the leftMost segment to the output array of which contains the first sorted sequence + + // TODO: uncomment and fix sizeof + ////////////////////////////////// + //checkCudaErrors(cudaMemcpy(d_DstKey + Nfloor, d_SrcKey + Nfloor, Nleft * sizeof(t_key), cudaMemcpyDeviceToDevice)); + GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); // TODO: copy just the needed part of the buffer + + mergeLeftMostSegment(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir); + + // TODO: uncomment and fix sizeof + ////////////////////////////////// + //checkCudaErrors(cudaMemcpy(d_DstKey, d_SrcKey , N * sizeof(t_key), cudaMemcpyDeviceToDevice)); + GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); + } + } + //cudaDeviceSynchronize(); +} + + + +unsigned int roundDown(unsigned int numToRound, unsigned int multiple) +{ + if (numToRound <= multiple) + { + return numToRound; + } + else + { + return (numToRound / multiple) * multiple; + } +} + +void initMergeSort( + gpudata *d_RanksA, + gpudata *d_RanksB, + gpudata *d_LimitsA, + gpudata *d_LimitsB, + unsigned int MAX_SAMPLE_COUNT, + gpucontext *ctx +) +{ + /*cudaMalloc((void **)d_RanksA, MAX_SAMPLE_COUNT * sizeof(unsigned int)); + cudaMalloc((void **)d_RanksB, MAX_SAMPLE_COUNT * sizeof(unsigned int)); + cudaMalloc((void **)d_LimitsA, MAX_SAMPLE_COUNT * sizeof(unsigned int)); + cudaMalloc((void **)d_LimitsB, MAX_SAMPLE_COUNT * sizeof(unsigned int)); + */ -int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int numaxes, unsigned int *axes, GpuArray *arg) + int res = GA_NO_ERROR; + + d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + printf("error allocating aux structures %d\n", res); + d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + printf("error allocating aux structures %d\n", res); + d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + printf("error allocating aux structures %d\n", res); + d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + printf("error allocating aux structures %d\n", res); +} + + +int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int numaxes, unsigned int *axes, GpuArray *arg) { + + int type = src->typecode; + gpucontext *ctx = GpuArray_context(src); + + // Device pointers - auxiiary data structure + gpudata *d_RanksA = NULL, *d_RanksB = NULL, *d_LimitsA = NULL, *d_LimitsB = NULL; + if (arg != NULL) { // perform argsort + assert(arg != NULL); } else { + const unsigned int nd = 1; + const size_t dims = src->dimensions[0]; + + const unsigned int Nfloor = roundDown(dims, SHARED_SIZE_LIMIT); + const int Nleft = dims - Nfloor; + + const unsigned int DIR = 0; + + // Device pointers - buffer data strucute + GpuArray BufKey; + GpuArray_empty(&BufKey, ctx, type, nd, &dims, GA_C_ORDER); + + + //checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(t_key))); + + // Initialize device auxiliary data structure + initMergeSort(d_RanksA, d_RanksB, d_LimitsA, d_LimitsB, Nfloor / 128, ctx); + // perform regular sort + sort( + dst, + &BufKey, + src, + d_RanksA, + d_RanksB, + d_LimitsA, + d_LimitsB, + dims, + Nfloor, + Nleft, + DIR, + ctx + 
); + + + // type -> get typecode of the array + + // vectorType -> "type" + + // stbr_append all the kernels.... + + // Set arguments + + + + } + return 0; + } From b105068ae93be6a92f5233e150574036a908007c Mon Sep 17 00:00:00 2001 From: vcampmany Date: Mon, 17 Jul 2017 22:38:24 +0200 Subject: [PATCH 03/19] offset support --- src/gpuarray_sort.c | 68 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 56767ff706..8cb1091cd6 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -95,7 +95,18 @@ static const char *code_bitonic_smem = " ); "\ " }\n"; - +#define NUMARGS_CODE_K 5 +const int type_args_code_k[NUMARGS_CODE_K] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT}; +static const char *code_k = \ +" extern \"C\" __global__ void add( "\ +" unsigned int *d_DstKey, size_t dstOff, unsigned int *d_SrcKey, size_t srcOff, unsigned int N "\ +" ) "\ +" { "\ +" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;" \ +" if (i < N) d_DstKey[i] = d_SrcKey[i] + 1;" \ +" }\n"; static unsigned int iDivUp(unsigned int a, unsigned int b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); @@ -123,18 +134,20 @@ static void bitonicSortShared( { int errI; - size_t lens[1] = {strlen(code_bitonic_smem)}; + size_t lens[1] = {strlen(code_k)}; char *err_str = NULL; size_t ls, gs; + unsigned int p = 0; + int err; - void *arguments[NUMARGS_BITONIC_KERNEL]; // = (void**) malloc(sizeof(void *) * NUM_ARGS_KERNEL_1); + //void *arguments[NUMARGS_BITONIC_KERNEL]; // = (void**) malloc(sizeof(void *) * NUM_ARGS_KERNEL_1); - errI = GpuKernel_init( k_bitonic, ctx, 1, - &code_bitonic_smem, lens, "bitonicSortSharedKernel", - NUMARGS_BITONIC_KERNEL, type_args_bitonic, flags, &err_str); + err = GpuKernel_init( k_bitonic, ctx, 1, + &code_k, lens, "add", + NUMARGS_CODE_K, type_args_code_k, flags, &err_str); - printf("error kernel init: %s \n", gpuarray_error_str(errI)); - printf("error backend: %s \n", err_str); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(errI)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); //unsigned int blockCount = batchSize; //unsigned int blockDim = SHARED_SIZE_LIMIT / 2; @@ -142,13 +155,34 @@ static void bitonicSortShared( gs = 1; GpuKernel_sched(k_bitonic, (size_t)arrayLength * batchSize, &gs, &ls); - arguments[0] = (void*)d_DstKey->data; + /*arguments[0] = (void*)d_DstKey->data; arguments[1] = (void*)d_SrcKey->data; arguments[2] = (void*)&batchSize; arguments[3] = (void*)&arrayLength; arguments[4] = (void*)&sortDir; +*/ + err = GpuKernel_setarg(k_bitonic, p++, d_DstKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); - GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, arguments); + err = GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset); + + err = GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_bitonic, p++, &d_SrcKey->offset); + + unsigned int sz = 16; + err = GpuKernel_setarg(k_bitonic, p++, &sz); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + /*err = GpuKernel_setarg(k_bitonic, p++, &arrayLength); + if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); + + err = GpuKernel_setarg(k_bitonic, p++, &sortDir); + if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); +*/ + err 
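  /* the kernel arguments were all bound above with GpuKernel_setarg, so the
     launch below passes NULL instead of an argument array */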
= GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL /*arguments*/); + if (err != GA_NO_ERROR) printf("error calling kernel %d \n", p); /*if (sortDir) { @@ -324,8 +358,7 @@ static void sort( ///////////////////////////////////////////////////////////////////////// if (N <= SHARED_SIZE_LIMIT) - { - + { bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, &k_bitonic, ctx); } /////////////////////////////////////////////////////////////////////////////// @@ -427,13 +460,16 @@ void initMergeSort( int res = GA_NO_ERROR; d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); - printf("error allocating aux structures %d\n", res); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); - printf("error allocating aux structures %d\n", res); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); - printf("error allocating aux structures %d\n", res); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); - printf("error allocating aux structures %d\n", res); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); } From 67524a9518fa6af6bd58bd0070df2a0fbdc9c2d4 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Tue, 18 Jul 2017 22:55:13 +0200 Subject: [PATCH 04/19] ranks kernel --- src/gpuarray/sort.h | 4 +- src/gpuarray_sort.c | 280 ++++++++++++++++++++++++++++++-------------- 2 files changed, 195 insertions(+), 89 deletions(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index 096bdf6856..efa26a3141 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -19,9 +19,7 @@ extern "C" { #define SAMPLE_STRIDE 128 -int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int numaxes, unsigned int *axes, GpuArray *arg); - - +int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *arg); #ifdef __cplusplus diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 8cb1091cd6..9e76e1385c 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -10,8 +10,52 @@ const int flags = GA_USE_CUDA; -#define NUMARGS_BITONIC_KERNEL 5 -const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_BUFFER, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_helper_funcs = \ +"__device__ unsigned int iDivUp(unsigned int a, unsigned int b)"\ +"{"\ +" return ((a % b) == 0) ? 
(a / b) : (a / b + 1); "\ +"} "\ +"__device__ unsigned int getSampleCount(unsigned int dividend) "\ +"{ "\ +" return iDivUp(dividend, 1024); "\ +"}"\ +" \n #define W (sizeof(unsigned int) * 8) \n"\ +"__device__ unsigned int nextPowerOfTwo(unsigned int x) "\ +"{"\ +" return 1U << (W - __clz(x - 1));"\ +"}\n"; + +static const char *code_bin_search = \ +"__device__ unsigned int binarySearchInclusive(unsigned int val, unsigned int *data, unsigned int L, "\ +" unsigned int stride, unsigned int sortDir){"\ +" if (L == 0) "\ +" return 0; "\ +" unsigned int pos = 0; "\ +" for (; stride > 0; stride >>= 1){ "\ +" unsigned int newPos = min(pos + stride, L); "\ +" if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))){ "\ +" pos = newPos; "\ +" } "\ +" } "\ +" return pos; "\ +"} "\ +"__device__ unsigned int binarySearchExclusive(unsigned int val, unsigned int *data, unsigned int L, " \ +" unsigned int stride, unsigned int sortDir) "\ +"{ "\ +" if (L == 0) "\ +" return 0; "\ +" unsigned int pos = 0; "\ +" for (; stride > 0; stride >>= 1){ "\ +" unsigned int newPos = min(pos + stride, L); "\ +" if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))){ "\ +" pos = newPos; "\ +" } "\ +" } "\ +" return pos; "\ +"}\n"; + +#define NUMARGS_BITONIC_KERNEL 7 +const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; static const char *code_bitonic_smem = \ " __device__ unsigned int readArray(unsigned int *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ " if (pos >= length) { " \ @@ -38,12 +82,16 @@ static const char *code_bitonic_smem = " } " \ " extern \"C\" __global__ void bitonicSortSharedKernel( "\ " unsigned int *d_DstKey, "\ +" size_t dstOff," " unsigned int *d_SrcKey, "\ +" size_t srcOff," " unsigned int batchSize, "\ " unsigned int arrayLength, "\ " unsigned int sortDir "\ " ) "\ " { "\ +" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ " __shared__ unsigned int s_key[1024]; "\ " s_key[threadIdx.x] = readArray( d_SrcKey, "\ " blockIdx.x * 1024 + threadIdx.x, "\ @@ -51,10 +99,10 @@ static const char *code_bitonic_smem = " sortDir "\ " ); "\ " s_key[threadIdx.x + (1024 / 2)] = readArray( d_SrcKey, "\ -" blockIdx.x * 1024 + threadIdx.x + (1024 / 2), "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ +" blockIdx.x * 1024 + threadIdx.x + (1024 / 2), "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ " for (unsigned int size = 2; size < 1024; size <<= 1) "\ " { "\ " unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); "\ @@ -95,18 +143,53 @@ static const char *code_bitonic_smem = " ); "\ " }\n"; -#define NUMARGS_CODE_K 5 -const int type_args_code_k[NUMARGS_CODE_K] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT}; -static const char *code_k = \ -" extern \"C\" __global__ void add( "\ -" unsigned int *d_DstKey, size_t dstOff, unsigned int *d_SrcKey, size_t srcOff, unsigned int N "\ -" ) "\ -" { "\ -" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ +#define NUMARGS_SAMPLE_RANKS 8 +const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_sample_ranks = \ +"extern \"C\" __global__ void generateSampleRanksKernel(" \ +" unsigned int *d_RanksA,"\ +" unsigned int *d_RanksB,"\ +" unsigned int *d_SrcKey,"\ +" size_t srcOff," \ +" unsigned int stride," \ +" unsigned int N," 
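/* N: total length of the region being merged in this pass */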
\ +" unsigned int threadCount,"\ +" unsigned int sortDir" \ +")" \ +"{" \ " d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ -" unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;" \ -" if (i < N) d_DstKey[i] = d_SrcKey[i] + 1;" \ -" }\n"; +" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;"\ +" if (pos >= threadCount)" \ +" {"\ +" return;"\ +" }"\ +" const unsigned int i = pos & ((stride / 1024) - 1);"\ +" const unsigned int segmentBase = (pos - i) * (2 * 1024);"\ +" d_SrcKey += segmentBase;"\ +" d_RanksA += segmentBase / 1024;"\ +" d_RanksB += segmentBase / 1024;"\ +" const unsigned int segmentElementsA = stride;"\ +" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride);"\ +" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA);"\ +" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB);"\ +" if (i < segmentSamplesA)"\ +" {"\ +" d_RanksA[i] = i * 1024;"\ +" d_RanksB[i] = binarySearchExclusive("\ +" d_SrcKey[i * 1024], d_SrcKey + stride,"\ +" segmentElementsB, nextPowerOfTwo(segmentElementsB), sortDir"\ +" );"\ +" }"\ +" if (i < segmentSamplesB)"\ +" {"\ +" d_RanksB[(stride / 1024) + i] = i * 1024;"\ +" d_RanksA[(stride / 1024) + i] = binarySearchInclusive("\ +" d_SrcKey[stride + i * 1024], d_SrcKey + 0,"\ +" segmentElementsA, nextPowerOfTwo(segmentElementsA), sortDir"\ +" );"\ +" }"\ +"}\n"; + static unsigned int iDivUp(unsigned int a, unsigned int b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); @@ -132,92 +215,107 @@ static void bitonicSortShared( gpucontext *ctx ) { - - int errI; - size_t lens[1] = {strlen(code_k)}; + size_t lens[1] = {strlen(code_bitonic_smem)}; char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; - //void *arguments[NUMARGS_BITONIC_KERNEL]; // = (void**) malloc(sizeof(void *) * NUM_ARGS_KERNEL_1); - err = GpuKernel_init( k_bitonic, ctx, 1, - &code_k, lens, "add", - NUMARGS_CODE_K, type_args_code_k, flags, &err_str); - - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(errI)); + &code_bitonic_smem, lens, "bitonicSortSharedKernel", + NUMARGS_BITONIC_KERNEL, type_args_bitonic, flags, &err_str); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); - //unsigned int blockCount = batchSize; - //unsigned int blockDim = SHARED_SIZE_LIMIT / 2; ls = SHARED_SIZE_LIMIT / 2; - gs = 1; - GpuKernel_sched(k_bitonic, (size_t)arrayLength * batchSize, &gs, &ls); - - /*arguments[0] = (void*)d_DstKey->data; - arguments[1] = (void*)d_SrcKey->data; - arguments[2] = (void*)&batchSize; - arguments[3] = (void*)&arrayLength; - arguments[4] = (void*)&sortDir; -*/ + gs = batchSize; + //GpuKernel_sched(k_bitonic, (size_t)arrayLength * batchSize, &gs, &ls); + err = GpuKernel_setarg(k_bitonic, p++, d_DstKey->data); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); err = GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); err = GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); err = GpuKernel_setarg(k_bitonic, p++, &d_SrcKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); - unsigned int sz = 16; - err = GpuKernel_setarg(k_bitonic, p++, &sz); + err = GpuKernel_setarg(k_bitonic, p++, &batchSize); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); - /*err = GpuKernel_setarg(k_bitonic, p++, &arrayLength); + err 
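  /* scalar kernel arguments are bound by address, one slot per call, with p
     tracking the argument index */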
= GpuKernel_setarg(k_bitonic, p++, &arrayLength); if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); err = GpuKernel_setarg(k_bitonic, p++, &sortDir); if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); -*/ - err = GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL /*arguments*/); + + err = GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL); if (err != GA_NO_ERROR) printf("error calling kernel %d \n", p); - /*if (sortDir) - { - //bitonicSortSharedKernel<1U><<>>(d_DstKey, d_SrcKey, batchSize, arrayLength); - //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - } - else - { - //bitonicSortSharedKernel<0U><<>>(d_DstKey, d_SrcKey, batchSize, arrayLength); - //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - }*/ } +#define NSTRINGS 3 static void generateSampleRanks( - gpudata *d_RanksA, - gpudata *d_RanksB, - GpuArray *d_SrcKey, - unsigned int stride, - unsigned int N, - unsigned int sortDir + gpudata *d_RanksA, + gpudata *d_RanksB, + GpuArray *d_SrcKey, + unsigned int stride, + unsigned int N, + unsigned int sortDir, + GpuKernel *k_ranks, + gpucontext *ctx ) { - unsigned int lastSegmentElements = N % (2 * stride); - unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + unsigned int lastSegmentElements = N % (2 * stride); + unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - if (sortDir) - { - //generateSampleRanksKernel<1U><<>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); - //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - } - else - { - //generateSampleRanksKernel<0U><<>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount); - //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - } + char *err_str = NULL; + size_t ls, gs; + unsigned int p = 0; + int err; + const char *codes[NSTRINGS] = {code_helper_funcs, code_bin_search, code_sample_ranks}; + size_t lens[NSTRINGS] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; + + err = GpuKernel_init(k_ranks, ctx, NSTRINGS, + codes, lens, "generateSampleRanksKernel", + NUMARGS_SAMPLE_RANKS, type_args_ranks, flags, &err_str); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + ls = 256U; + gs = iDivUp(threadCount, 256); + + err = GpuKernel_setarg(k_ranks, p++, d_RanksA); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, d_RanksB); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, d_SrcKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, &d_SrcKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, &stride); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, &N); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, &threadCount); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, &sortDir); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + printf("before segfault\n"); + + err = GpuKernel_call(k_ranks, 1, 
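  /* one-dimensional launch: gs blocks of ls threads each */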
&gs, &ls, 0, NULL); + if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); } static void mergeRanksAndIndices( @@ -336,6 +434,10 @@ static void sort( GpuArray *t; // Aux pointer GpuKernel k_bitonic; + GpuKernel k_ranks; + + size_t lstCopyOff; + int err; unsigned int stageCount = 0; unsigned int stride; @@ -343,11 +445,13 @@ static void sort( if (stageCount & 1) { + printf("bffkey\n"); ikey = d_BufKey; okey = d_DstKey; } else { + printf("d_DstKey\n"); ikey = d_DstKey; okey = d_BufKey; } @@ -368,7 +472,6 @@ static void sort( { unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; unsigned int arrayLength = SHARED_SIZE_LIMIT; - //mergeSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0U, Nfloor); bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, &k_bitonic, ctx); for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) @@ -376,7 +479,7 @@ static void sort( unsigned int lastSegmentElements = Nfloor % (2 * stride); //Find sample ranks and prepare for limiters merge - generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir); + generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); //Merge ranks and indices mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor); @@ -390,19 +493,23 @@ static void sort( printf("inside last segment\n"); // TODO: uncomment and fix sizeof ////////////////////////////////// - /*cudaMemcpy( okey + (Nfloor - lastSegmentElements), - ikey + (Nfloor - lastSegmentElements), - lastSegmentElements * sizeof(t_key), - cudaMemcpyDeviceToDevice - ); - */ + //cudaMemcpy( okey + (Nfloor - lastSegmentElements), + // ikey + (Nfloor - lastSegmentElements), + // lastSegmentElements * sizeof(t_key), + // cudaMemcpyDeviceToDevice + // ); + + //lstCopyOff = okey->offset; // + ((Nfloor - lastSegmentElements) * sizeof(unsigned int)); + //err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * sizeof(unsigned int)); + //if (err != GA_NO_ERROR) printf("error move data\n"); + GpuArray_copy(okey, ikey, GA_C_ORDER); } // Swap pointers t = ikey; ikey = okey; okey = t; } - + // If the array is not multiple of 1024, sort the leftmost part // and perform merge sort of the two last segments if (Nleft > 0) @@ -425,6 +532,7 @@ static void sort( GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); } } + //GpuArray_copy(d_DstKey, d_BufKey, GA_C_ORDER); //cudaDeviceSynchronize(); } @@ -459,21 +567,21 @@ void initMergeSort( int res = GA_NO_ERROR; - d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); - d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); - d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); - d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 0, &res); + d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); if (res != 
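      /* note: the gpudata handles above are assigned to by-value parameters,
         so they never reach the caller; taking gpudata ** (or returning the
         handles) is presumably what is intended here */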
GA_NO_ERROR) printf("error allocating aux structures %d\n", res); } -int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int numaxes, unsigned int *axes, GpuArray *arg) +int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray *arg) { int type = src->typecode; @@ -495,7 +603,7 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int numaxes, unsigned i const unsigned int Nfloor = roundDown(dims, SHARED_SIZE_LIMIT); const int Nleft = dims - Nfloor; - const unsigned int DIR = 0; + //const unsigned int DIR = 0; // Device pointers - buffer data strucute GpuArray BufKey; @@ -519,7 +627,7 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int numaxes, unsigned i dims, Nfloor, Nleft, - DIR, + sortDir, ctx ); From 4dda8d1c5964e821b1f2851eafc1e47c604f60b6 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Wed, 19 Jul 2017 22:23:23 +0200 Subject: [PATCH 05/19] pow2 --- src/gpuarray_sort.c | 835 +++++++++++++++++++++++++++++++++----------- 1 file changed, 637 insertions(+), 198 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 9e76e1385c..ddbd87d980 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -11,23 +11,63 @@ const int flags = GA_USE_CUDA; static const char *code_helper_funcs = \ +"\n#define SAMPLE_STRIDE 128 \n" \ +"\n#define SHARED_SIZE_LIMIT 1024U \n" \ "__device__ unsigned int iDivUp(unsigned int a, unsigned int b)"\ "{"\ " return ((a % b) == 0) ? (a / b) : (a / b + 1); "\ "} "\ "__device__ unsigned int getSampleCount(unsigned int dividend) "\ "{ "\ -" return iDivUp(dividend, 1024); "\ +" return iDivUp(dividend, SAMPLE_STRIDE); "\ "}"\ " \n #define W (sizeof(unsigned int) * 8) \n"\ "__device__ unsigned int nextPowerOfTwo(unsigned int x) "\ "{"\ " return 1U << (W - __clz(x - 1));"\ -"}\n"; +"} "\ +" __device__ unsigned int readArray(unsigned int *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ +" if (pos >= length) { " \ +" if (sortDir) { " \ +" return 4294967295; " \ +" } " \ +" else { " \ +" return 0; " \ +" } " \ +" } " \ +" else { " \ +" return a[pos]; " \ +" } " \ +" } " \ +" __device__ void writeArray(unsigned int *a, unsigned int pos, unsigned int value, unsigned int length) " \ +" { " \ +" if (pos >= length) " \ +" { " \ +" return; " \ +" } "\ +" else { " \ +" a[pos] = value; " \ +" } "\ +" }\n"; + +static unsigned int iDivUp(unsigned int a, unsigned int b) +{ + return ((a % b) == 0) ? (a / b) : (a / b + 1); +} + +static unsigned int getSampleCount(unsigned int dividend) +{ + return iDivUp(dividend, SAMPLE_STRIDE); +} + +static unsigned int ceiling(unsigned int n, unsigned int v) +{ + return (!n%v) ? 
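/* same operator-precedence issue as the earlier ceiling() definition:
   (n % v == 0) is presumably the intended condition */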
n/v : (n/v) + 1; +} static const char *code_bin_search = \ "__device__ unsigned int binarySearchInclusive(unsigned int val, unsigned int *data, unsigned int L, "\ -" unsigned int stride, unsigned int sortDir){"\ +" unsigned int stride, unsigned int sortDir){"\ " if (L == 0) "\ " return 0; "\ " unsigned int pos = 0; "\ @@ -40,7 +80,7 @@ static const char *code_bin_search = " return pos; "\ "} "\ "__device__ unsigned int binarySearchExclusive(unsigned int val, unsigned int *data, unsigned int L, " \ -" unsigned int stride, unsigned int sortDir) "\ +" unsigned int stride, unsigned int sortDir) "\ "{ "\ " if (L == 0) "\ " return 0; "\ @@ -52,34 +92,55 @@ static const char *code_bin_search = " } "\ " } "\ " return pos; "\ +"}"\ +"__device__ unsigned int binarySearchLowerBoundExclusive(unsigned int val, unsigned int *ptr, unsigned int first, "\ +" unsigned int last, unsigned int sortDir) "\ +"{ "\ +" unsigned int len = last - first; "\ +" unsigned int half; "\ +" unsigned int middle; "\ +" while (len > 0) "\ +" { "\ +" half = len >> 1; "\ +" middle = first; "\ +" middle += half; "\ +" if ( (sortDir && ptr[middle] < val) || (!sortDir && ptr[middle] > val) ) "\ +" { "\ +" first = middle; "\ +" ++first; "\ +" len = len - half - 1; "\ +" } "\ +" else "\ +" len = half; "\ +" } "\ +" return first; "\ +"} "\ +"__device__ unsigned int binarySearchLowerBoundInclusive(unsigned int val, unsigned int *ptr, unsigned int first, "\ +" unsigned int last, unsigned int sortDir) "\ +"{ "\ +" unsigned int len = last - first; "\ +" unsigned int half; "\ +" unsigned int middle; "\ +" while (len > 0) "\ +" { "\ +" half = len >> 1; "\ +" middle = first; "\ +" middle += half; "\ +" if ( (sortDir && ptr[middle] <= val) || (!sortDir && ptr[middle] >= val) ) "\ +" { "\ +" first = middle; "\ +" ++first; "\ +" len = len - half - 1; "\ +" } "\ +" else "\ +" len = half; "\ +" } "\ +" return first; "\ "}\n"; -#define NUMARGS_BITONIC_KERNEL 7 -const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; +#define NUMARGS_BITONIC_KERNEL 8 +const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_bitonic_smem = \ -" __device__ unsigned int readArray(unsigned int *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ -" if (pos >= length) { " \ -" if (sortDir) { " \ -" return 4294967295; " \ -" } " \ -" else { " \ -" return 0; " \ -" } " \ -" } " \ -" else { " \ -" return a[pos]; " \ -" } " \ -" } " \ -" __device__ void writeArray(unsigned int *a, unsigned int pos, unsigned int value, unsigned int length) " \ -" { " \ -" if (pos >= length) " \ -" { " \ -" return; " \ -" } "\ -" else { " \ -" a[pos] = value; " \ -" } "\ -" } " \ " extern \"C\" __global__ void bitonicSortSharedKernel( "\ " unsigned int *d_DstKey, "\ " size_t dstOff," @@ -87,23 +148,26 @@ static const char *code_bitonic_smem = " size_t srcOff," " unsigned int batchSize, "\ " unsigned int arrayLength, "\ +" unsigned int elemsOff, " \ " unsigned int sortDir "\ " ) "\ " { "\ " d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ " d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ -" __shared__ unsigned int s_key[1024]; "\ +" d_DstKey += elemsOff;" \ +" d_SrcKey += elemsOff;" \ +" __shared__ unsigned int s_key[SHARED_SIZE_LIMIT]; "\ " s_key[threadIdx.x] = readArray( d_SrcKey, "\ -" blockIdx.x * 1024 + threadIdx.x, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ " arrayLength * 
batchSize, "\ " sortDir "\ " ); "\ -" s_key[threadIdx.x + (1024 / 2)] = readArray( d_SrcKey, "\ -" blockIdx.x * 1024 + threadIdx.x + (1024 / 2), "\ +" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcKey, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ " arrayLength * batchSize, "\ " sortDir "\ " ); "\ -" for (unsigned int size = 2; size < 1024; size <<= 1) "\ +" for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) "\ " { "\ " unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); "\ " for (unsigned int stride = size / 2; stride > 0; stride >>= 1) "\ @@ -119,7 +183,7 @@ static const char *code_bitonic_smem = " } "\ " } "\ " { "\ -" for (unsigned int stride = 1024 / 2; stride > 0; stride >>= 1) {" \ +" for (unsigned int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {" \ " __syncthreads(); "\ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ " unsigned int t; "\ @@ -132,104 +196,43 @@ static const char *code_bitonic_smem = " } "\ " __syncthreads(); "\ " writeArray( d_DstKey, "\ -" blockIdx.x * 1024 + threadIdx.x, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ " s_key[threadIdx.x], "\ " arrayLength * batchSize "\ " ); "\ " writeArray( d_DstKey, "\ -" blockIdx.x * 1024 + threadIdx.x + (1024 / 2), "\ -" s_key[threadIdx.x + (1024 / 2)], "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ +" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], "\ " arrayLength * batchSize "\ " ); "\ " }\n"; - -#define NUMARGS_SAMPLE_RANKS 8 -const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; -static const char *code_sample_ranks = \ -"extern \"C\" __global__ void generateSampleRanksKernel(" \ -" unsigned int *d_RanksA,"\ -" unsigned int *d_RanksB,"\ -" unsigned int *d_SrcKey,"\ -" size_t srcOff," \ -" unsigned int stride," \ -" unsigned int N," \ -" unsigned int threadCount,"\ -" unsigned int sortDir" \ -")" \ -"{" \ -" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ -" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;"\ -" if (pos >= threadCount)" \ -" {"\ -" return;"\ -" }"\ -" const unsigned int i = pos & ((stride / 1024) - 1);"\ -" const unsigned int segmentBase = (pos - i) * (2 * 1024);"\ -" d_SrcKey += segmentBase;"\ -" d_RanksA += segmentBase / 1024;"\ -" d_RanksB += segmentBase / 1024;"\ -" const unsigned int segmentElementsA = stride;"\ -" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride);"\ -" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA);"\ -" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB);"\ -" if (i < segmentSamplesA)"\ -" {"\ -" d_RanksA[i] = i * 1024;"\ -" d_RanksB[i] = binarySearchExclusive("\ -" d_SrcKey[i * 1024], d_SrcKey + stride,"\ -" segmentElementsB, nextPowerOfTwo(segmentElementsB), sortDir"\ -" );"\ -" }"\ -" if (i < segmentSamplesB)"\ -" {"\ -" d_RanksB[(stride / 1024) + i] = i * 1024;"\ -" d_RanksA[(stride / 1024) + i] = binarySearchInclusive("\ -" d_SrcKey[stride + i * 1024], d_SrcKey + 0,"\ -" segmentElementsA, nextPowerOfTwo(segmentElementsA), sortDir"\ -" );"\ -" }"\ -"}\n"; - -static unsigned int iDivUp(unsigned int a, unsigned int b) -{ - return ((a % b) == 0) ? (a / b) : (a / b + 1); -} - -static unsigned int getSampleCount(unsigned int dividend) -{ - return iDivUp(dividend, SAMPLE_STRIDE); -} - -static unsigned int ceiling(unsigned int n, unsigned int v) -{ - return (!n%v) ? 
n/v : (n/v) + 1; -} - +#define NSTR_BITONIC 2 static void bitonicSortShared( GpuArray *d_DstKey, GpuArray *d_SrcKey, unsigned int batchSize, unsigned int arrayLength, unsigned int sortDir, + unsigned int elemsOff, GpuKernel *k_bitonic, gpucontext *ctx ) { - size_t lens[1] = {strlen(code_bitonic_smem)}; char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; + size_t lens[NSTR_BITONIC] = {strlen(code_helper_funcs), strlen(code_bitonic_smem)}; + const char *codes[NSTR_BITONIC] = {code_helper_funcs, code_bitonic_smem}; - err = GpuKernel_init( k_bitonic, ctx, 1, - &code_bitonic_smem, lens, "bitonicSortSharedKernel", + err = GpuKernel_init( k_bitonic, ctx, NSTR_BITONIC, + codes, lens, "bitonicSortSharedKernel", NUMARGS_BITONIC_KERNEL, type_args_bitonic, flags, &err_str); if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); ls = SHARED_SIZE_LIMIT / 2; gs = batchSize; - //GpuKernel_sched(k_bitonic, (size_t)arrayLength * batchSize, &gs, &ls); err = GpuKernel_setarg(k_bitonic, p++, d_DstKey->data); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); @@ -248,6 +251,9 @@ static void bitonicSortShared( err = GpuKernel_setarg(k_bitonic, p++, &arrayLength); if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); + + err = GpuKernel_setarg(k_bitonic, p++, &elemsOff); + if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); err = GpuKernel_setarg(k_bitonic, p++, &sortDir); if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); @@ -257,10 +263,60 @@ static void bitonicSortShared( } -#define NSTRINGS 3 +#define NUMARGS_SAMPLE_RANKS 10 +const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_sample_ranks = \ +"extern \"C\" __global__ void generateSampleRanksKernel(" \ +" unsigned int *d_RanksA,"\ +" size_t rankAOff," \ +" unsigned int *d_RanksB,"\ +" size_t rankBOff," \ +" unsigned int *d_SrcKey,"\ +" size_t srcOff," \ +" unsigned int stride," \ +" unsigned int N," \ +" unsigned int threadCount,"\ +" unsigned int sortDir" \ +")" \ +"{" \ +" d_RanksA = (unsigned int*) (((char*)d_RanksA)+ rankAOff);" \ +" d_RanksB = (unsigned int*) (((char*)d_RanksB)+ rankBOff);" \ +" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;"\ +" if (pos >= threadCount)" \ +" {"\ +" return;"\ +" }"\ +" const unsigned int i = pos & ((stride / SAMPLE_STRIDE) - 1);"\ +" const unsigned int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);"\ +" d_SrcKey += segmentBase;"\ +" d_RanksA += segmentBase / SAMPLE_STRIDE;"\ +" d_RanksB += segmentBase / SAMPLE_STRIDE;"\ +" const unsigned int segmentElementsA = stride;"\ +" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride);"\ +" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA);"\ +" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB);"\ +" if (i < segmentSamplesA)"\ +" {"\ +" d_RanksA[i] = i * SAMPLE_STRIDE;"\ +" d_RanksB[i] = binarySearchExclusive("\ +" d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride,"\ +" segmentElementsB, nextPowerOfTwo(segmentElementsB), sortDir"\ +" );"\ +" }"\ +" if (i < segmentSamplesB)"\ +" {"\ +" d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;"\ +" d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive("\ +" d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0,"\ +" 
segmentElementsA, nextPowerOfTwo(segmentElementsA), sortDir"\ +" );"\ +" }"\ +"}\n"; +#define NSTR_RANKS 3 static void generateSampleRanks( - gpudata *d_RanksA, - gpudata *d_RanksB, + GpuArray *d_RanksA, + GpuArray *d_RanksB, GpuArray *d_SrcKey, unsigned int stride, unsigned int N, @@ -276,22 +332,28 @@ static void generateSampleRanks( size_t ls, gs; unsigned int p = 0; int err; - const char *codes[NSTRINGS] = {code_helper_funcs, code_bin_search, code_sample_ranks}; - size_t lens[NSTRINGS] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; + const char *codes[NSTR_RANKS] = {code_helper_funcs, code_bin_search, code_sample_ranks}; + size_t lens[NSTR_RANKS] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; - err = GpuKernel_init(k_ranks, ctx, NSTRINGS, + err = GpuKernel_init(k_ranks, ctx, NSTR_RANKS, codes, lens, "generateSampleRanksKernel", NUMARGS_SAMPLE_RANKS, type_args_ranks, flags, &err_str); if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); - ls = 256U; + ls = 256; gs = iDivUp(threadCount, 256); - err = GpuKernel_setarg(k_ranks, p++, d_RanksA); + err = GpuKernel_setarg(k_ranks, p++, d_RanksA->data); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); - err = GpuKernel_setarg(k_ranks, p++, d_RanksB); + err = GpuKernel_setarg(k_ranks, p++, &d_RanksA->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, d_RanksB->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks, p++, &d_RanksB->offset); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); err = GpuKernel_setarg(k_ranks, p++, d_SrcKey->data); @@ -312,24 +374,155 @@ static void generateSampleRanks( err = GpuKernel_setarg(k_ranks, p++, &sortDir); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); - printf("before segfault\n"); - err = GpuKernel_call(k_ranks, 1, &gs, &ls, 0, NULL); if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + + + + /*unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); + err = GpuArray_read(h_dst, (2048/128) * sizeof(unsigned int), d_RanksA); + if (err != GA_NO_ERROR) printf("error reading \n"); + + unsigned int *h_dst2 = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); + err = GpuArray_read(h_dst2, (2048/128) * sizeof(unsigned int), d_RanksB); + if (err != GA_NO_ERROR) printf("error reading \n"); + + int i; + for (i = 0; i < 2048/128; i++) + { + printf("%d rankA %u rankB %u \n", i, h_dst[i], h_dst2[i]); + } + */ } +#define NUMARGS_RANKS_IDXS 7 +const int type_args_ranks_idxs[NUMARGS_RANKS_IDXS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_ranks_idxs = \ +"extern \"C\" __global__ void mergeRanksAndIndicesKernel( "\ +" unsigned int *d_Limits, "\ +" size_t limOff," \ +" unsigned int *d_Ranks, "\ +" size_t rankOff," \ +" unsigned int stride, "\ +" unsigned int N, "\ +" unsigned int threadCount "\ +") "\ +"{ "\ +" d_Limits = (unsigned int*) (((char*)d_Limits)+ limOff);" \ +" d_Ranks = (unsigned int*) (((char*)d_Ranks)+ rankOff);" \ +" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; "\ +" if (pos >= threadCount) "\ +" return; "\ +" const unsigned int i = pos & ((stride / SAMPLE_STRIDE) - 1); "\ +" const unsigned int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); "\ +" d_Ranks += (pos - i) * 2; 
"\ +" d_Limits += (pos - i) * 2; "\ +" const unsigned int segmentElementsA = stride; "\ +" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride); "\ +" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA); "\ +" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB); "\ +" if (i < segmentSamplesA) "\ +" { "\ +" unsigned int dstPos = binarySearchExclusive(d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB), 1U) + i; "\ +" d_Limits[dstPos] = d_Ranks[i]; "\ +" } "\ +" if (i < segmentSamplesB) "\ +" { "\ +" unsigned int dstPos = binarySearchInclusive(d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA), 1U) + i; "\ +" d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; "\ +" } "\ +"}\n"; +#define NSTRINGS_RKS_IDX 3 static void mergeRanksAndIndices( - gpudata *d_LimitsA, - gpudata *d_LimitsB, - gpudata *d_RanksA, - gpudata *d_RanksB, + GpuArray *d_LimitsA, + GpuArray *d_LimitsB, + GpuArray *d_RanksA, + GpuArray *d_RanksB, unsigned int stride, - unsigned int N + unsigned int N, + unsigned int sortDir, + GpuKernel *k_ranks_idxs, + gpucontext *ctx ) { unsigned int lastSegmentElements = N % (2 * stride); unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + + char *err_str = NULL; + size_t ls, gs; + unsigned int p = 0; + int err; + const char *codes[NSTRINGS_RKS_IDX] = {code_helper_funcs, code_bin_search, code_ranks_idxs}; + size_t lens[NSTRINGS_RKS_IDX] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; + + err = GpuKernel_init(k_ranks_idxs, ctx, NSTRINGS_RKS_IDX, + codes, lens, "mergeRanksAndIndicesKernel", + NUMARGS_RANKS_IDXS, type_args_ranks_idxs, flags, &err_str); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + ls = 256U; + gs = iDivUp(threadCount, 256U); + + err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsA->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsA->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksA->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksA->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &stride); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &N); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &threadCount); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); + if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + + p = 0; + + err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsB->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsB->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksB->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksB->offset); + if (err != 
GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); + if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + + + unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); + err = GpuArray_read(h_dst, (2048/128) * sizeof(unsigned int), d_LimitsB); + if (err != GA_NO_ERROR) printf("error reading \n"); + + unsigned int *h_dst2 = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); + err = GpuArray_read(h_dst2, (2048/128) * sizeof(unsigned int), d_RanksB); + if (err != GA_NO_ERROR) printf("error reading \n"); + + /* + int i; + for (i = 0; i < 2048/128; i++) + { + printf("%d Limit %u Rank %u \n", i, h_dst[i], h_dst2[i]); + } + */ + /*mergeRanksAndIndicesKernel<<>>( d_LimitsA, d_RanksA, @@ -350,22 +543,183 @@ static void mergeRanksAndIndices( */ } +#define NUMARGS_MERGE 11 +const int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_merge = \ +"__device__ void merge( "\ +" unsigned int *dstKey, "\ +" unsigned int *srcAKey, "\ +" unsigned int *srcBKey, "\ +" unsigned int lenA, "\ +" unsigned int nPowTwoLenA, "\ +" unsigned int lenB, "\ +" unsigned int nPowTwoLenB, "\ +" unsigned int sortDir "\ +") "\ +"{ "\ +" unsigned int keyA, keyB; "\ +" unsigned int dstPosA , dstPosB;"\ +" if (threadIdx.x < lenA) "\ +" { "\ +" keyA = srcAKey[threadIdx.x]; "\ +" dstPosA = binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB, sortDir) + threadIdx.x; "\ +" } "\ +" if (threadIdx.x < lenB) "\ +" { "\ +" keyB = srcBKey[threadIdx.x]; "\ +" dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; "\ +" } "\ +" __syncthreads(); "\ +" if (threadIdx.x < lenA) "\ +" { "\ +" dstKey[dstPosA] = keyA; "\ +" } "\ +" if (threadIdx.x < lenB) "\ +" { "\ +" dstKey[dstPosB] = keyB; "\ +" } "\ +"} "\ +"extern \"C\" __global__ void mergeElementaryIntervalsKernel( "\ +" unsigned int *d_DstKey, "\ +" size_t dstOff," \ +" unsigned int *d_SrcKey, "\ +" size_t srcOff," \ +" unsigned int *d_LimitsA, "\ +" size_t limAOff," \ +" unsigned int *d_LimitsB, "\ +" size_t limBOff," \ +" unsigned int stride, "\ +" unsigned int N, "\ +" unsigned int sortDir" +") "\ +"{ "\ +" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" d_LimitsA = (unsigned int*) (((char*)d_LimitsA)+ limAOff);" \ +" d_LimitsB = (unsigned int*) (((char*)d_LimitsB)+ limBOff);" \ +" __shared__ unsigned int s_key[2 * SAMPLE_STRIDE]; "\ +" const unsigned int intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); "\ +" const unsigned int segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; "\ +" d_SrcKey += segmentBase; "\ +" d_DstKey += segmentBase; "\ +" __shared__ unsigned int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; "\ +" if (threadIdx.x == 0) "\ +" { "\ +" unsigned int segmentElementsA = stride; "\ +" unsigned int segmentElementsB = min(stride, N - segmentBase - stride); "\ +" unsigned int segmentSamplesA = getSampleCount(segmentElementsA); "\ +" unsigned int segmentSamplesB = getSampleCount(segmentElementsB); "\ +" unsigned int segmentSamples = segmentSamplesA + segmentSamplesB; "\ +" startSrcA = d_LimitsA[blockIdx.x]; "\ +" startSrcB = d_LimitsB[blockIdx.x]; "\ +" unsigned int endSrcA = (intervalI + 1 < segmentSamples) ? 
d_LimitsA[blockIdx.x + 1] : segmentElementsA; "\ +" unsigned int endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB; "\ +" lenSrcA = endSrcA - startSrcA; "\ +" lenSrcB = endSrcB - startSrcB; "\ +" startDstA = startSrcA + startSrcB; "\ +" startDstB = startDstA + lenSrcA; "\ +" } "\ +" __syncthreads(); "\ +" if (threadIdx.x < lenSrcA) "\ +" { "\ +" s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x]; "\ +" } "\ +" if (threadIdx.x < lenSrcB) "\ +" { "\ +" s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x]; "\ +" } "\ +" __syncthreads(); "\ +" merge( "\ +" s_key, "\ +" s_key + 0, "\ +" s_key + SAMPLE_STRIDE, "\ +" lenSrcA, SAMPLE_STRIDE, "\ +" lenSrcB, SAMPLE_STRIDE, "\ +" sortDir "\ +" ); "\ +" __syncthreads(); "\ +" if (threadIdx.x < lenSrcA) "\ +" { "\ +" d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x]; "\ +" } "\ +" if (threadIdx.x < lenSrcB) "\ +" { "\ +" d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; "\ +" } "\ +"}\n"; +#define NSTRINGS_MERGE 3 static void mergeElementaryIntervals( GpuArray *d_DstKey, GpuArray *d_SrcKey, - gpudata *d_LimitsA, - gpudata *d_LimitsB, + GpuArray *d_LimitsA, + GpuArray *d_LimitsB, unsigned int stride, unsigned int N, - unsigned int sortDir + unsigned int sortDir, + GpuKernel *k_merge, + gpucontext *ctx ) { - unsigned int lastSegmentElements = N % (2 * stride); - unsigned int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + unsigned int lastSegmentElements = N % (2 * stride); + unsigned int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + + char *err_str = NULL; + size_t ls, gs; + unsigned int p = 0; + int err; + const char *codes[NSTRINGS_MERGE] = {code_helper_funcs, code_bin_search, code_merge}; + size_t lens[NSTRINGS_MERGE] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; + + err = GpuKernel_init(k_merge, ctx, NSTRINGS_MERGE, + codes, lens, "mergeElementaryIntervalsKernel", + NUMARGS_MERGE, type_args_merge, flags, &err_str); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + + ls = SAMPLE_STRIDE; + gs = mergePairs; + + err = GpuKernel_setarg(k_merge, p++, d_DstKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, &d_DstKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, d_SrcKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + err = GpuKernel_setarg(k_merge, p++, d_LimitsA->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, &d_LimitsA->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, d_LimitsB->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, &d_LimitsB->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, &stride); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge, p++, &N); + if (err != GA_NO_ERROR) printf("error setting arg %d 
\n", p); + + err = GpuKernel_setarg(k_merge, p++, &sortDir); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL); + if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + +/* if (sortDir) { - /* mergeElementaryIntervalsKernel<1U><<>>( + mergeElementaryIntervalsKernel<1U><<>>( d_DstKey, d_SrcKey, d_LimitsA, @@ -373,11 +727,11 @@ static void mergeElementaryIntervals( stride, N ); - printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); */ + printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); } else { - /* + mergeElementaryIntervalsKernel<0U><<>>( d_DstKey, d_SrcKey, @@ -387,22 +741,103 @@ static void mergeElementaryIntervals( N ); printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - */ + } + */ } -static void mergeLeftMostSegment( +#define NUMARGS_MERGE_GLB 8 +const int type_args_merge_glb[NUMARGS_MERGE_GLB] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +static const char *code_merge_glb = \ +"extern \"C\" __global__ void mergeGlobalMemKernel( "\ +" unsigned int *d_DstKey, "\ +" size_t dstOff, "\ +" unsigned int *d_SrcKey, "\ +" size_t srcOff, "\ +" unsigned int segmentSizeA, "\ +" unsigned int segmentSizeB, "\ +" unsigned int N, "\ +" unsigned int sortDir "\ +") "\ +"{ "\ +" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; "\ +" unsigned int *segmentPtrA = d_SrcKey; "\ +" unsigned int *segmentPtrB = d_SrcKey + segmentSizeA; "\ +" unsigned int idxSegmentA = idx % segmentSizeA; "\ +" unsigned int idxSegmentB = idx - segmentSizeA; "\ +" if (idx >= N) "\ +" return; "\ +" unsigned int value = d_SrcKey[idx]; "\ +" unsigned int dstPos; "\ +" if (idx < segmentSizeA) "\ +" { "\ +" dstPos = binarySearchLowerBoundExclusive(value, segmentPtrB, 0, segmentSizeB, sortDir) + idxSegmentA; "\ +" } "\ +" else "\ +" { "\ +" dstPos = binarySearchLowerBoundInclusive(value, segmentPtrA, 0, segmentSizeA, sortDir) + idxSegmentB; "\ +" } "\ +" d_DstKey[dstPos] = value; "\ +"}\n"; + +#define NSTRINGS_MERGE_GLB 2 +static void mergeGlobalMem( GpuArray *d_DstKey, GpuArray *d_SrcKey, unsigned int segmentSizeA, unsigned int segmentSizeB, unsigned int N, - unsigned int sortDir + unsigned int sortDir, + GpuKernel *k_merge_global, + gpucontext *ctx ) { - unsigned int blockDim = 256; - unsigned int blockCount = ceiling(N, blockDim); + char *err_str = NULL; + size_t ls, gs; + unsigned int p = 0; + int err; + const char *codes[NSTRINGS_MERGE_GLB] = {code_bin_search, code_merge_glb}; + size_t lens[NSTRINGS_MERGE_GLB] = {strlen(code_bin_search), strlen(code_merge_glb)}; + + err = GpuKernel_init(k_merge_global, ctx, NSTRINGS_MERGE_GLB, + codes, lens, "mergeGlobalMemKernel", + NUMARGS_MERGE_GLB, type_args_merge_glb, flags, &err_str); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + ls = 256; + gs = ceiling(N, ls); + + + err = GpuKernel_setarg(k_merge_global, p++, d_DstKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge_global, p++, &d_DstKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge_global, p++, d_SrcKey->data); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge_global, p++, 
&d_SrcKey->offset); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeA); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeB); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge_global, p++, &N); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_setarg(k_merge_global, p++, &sortDir); + if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + + err = GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL); + if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); +/* if (sortDir) { //mergeLeftMostSegmentKernel<1U><<>>(d_DstKey, d_SrcKey, segmentSizeA, segmentSizeB, N); @@ -413,16 +848,17 @@ static void mergeLeftMostSegment( //mergeLeftMostSegmentKernel<0U><<>>(d_DstKey, d_SrcKey, segmentSizeA, segmentSizeB, N); //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); } +*/ } static void sort( GpuArray *d_DstKey, GpuArray *d_BufKey, GpuArray *d_SrcKey, - gpudata *d_RanksA, - gpudata *d_RanksB, - gpudata *d_LimitsA, - gpudata *d_LimitsB, + GpuArray *d_RanksA, + GpuArray *d_RanksB, + GpuArray *d_LimitsA, + GpuArray *d_LimitsB, unsigned int N, unsigned int Nfloor, int Nleft, @@ -435,6 +871,9 @@ static void sort( GpuKernel k_bitonic; GpuKernel k_ranks; + GpuKernel k_ranks_idxs; + GpuKernel k_merge; + GpuKernel k_merge_global; size_t lstCopyOff; int err; @@ -463,7 +902,7 @@ static void sort( if (N <= SHARED_SIZE_LIMIT) { - bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, &k_bitonic, ctx); + bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); } /////////////////////////////////////////////////////////////////////////////// // Sort the array with merge sort for arrays equal or bigger than 1024 elements @@ -472,7 +911,7 @@ static void sort( { unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; unsigned int arrayLength = SHARED_SIZE_LIMIT; - bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, &k_bitonic, ctx); + bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) { @@ -482,10 +921,10 @@ static void sort( generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); //Merge ranks and indices - mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor); + mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); //Merge elementary intervals - mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir); + mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); if (lastSegmentElements <= stride) { @@ -499,10 +938,11 @@ static void sort( // cudaMemcpyDeviceToDevice // ); - //lstCopyOff = okey->offset; // + ((Nfloor - lastSegmentElements) * sizeof(unsigned int)); - //err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * sizeof(unsigned int)); + lstCopyOff = okey->offset + ((Nfloor - lastSegmentElements) * sizeof(unsigned int)); + err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * sizeof(unsigned int)); + if (err != GA_NO_ERROR) printf("error move data\n"); + //err = GpuArray_copy(okey, ikey, GA_C_ORDER); //if (err != GA_NO_ERROR) printf("error move data\n"); - GpuArray_copy(okey, ikey, 
GA_C_ORDER); } // Swap pointers t = ikey; @@ -515,20 +955,29 @@ static void sort( if (Nleft > 0) { printf("Sorting Remaining part %d \n", Nleft); - bitonicSortShared(d_SrcKey + Nfloor, d_DstKey + Nfloor, 1, Nleft, sortDir, &k_bitonic, ctx); + bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); + + unsigned int *h_dst = (unsigned int *) malloc ( N * sizeof(unsigned int)); + err = GpuArray_read(h_dst, N * sizeof(unsigned int), d_SrcKey); + if (err != GA_NO_ERROR) printf("error reading \n"); + + int i; + for (i = 0; i < N; i++) + { + printf("%d value %u \n", i, h_dst[i]); + } - // Copy the leftMost segment to the output array of which contains the first sorted sequence + // Copy the leftMost segment to the output array of which contains the first sorted sequence // TODO: uncomment and fix sizeof ////////////////////////////////// //checkCudaErrors(cudaMemcpy(d_DstKey + Nfloor, d_SrcKey + Nfloor, Nleft * sizeof(t_key), cudaMemcpyDeviceToDevice)); - GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); // TODO: copy just the needed part of the buffer + lstCopyOff = okey->offset + Nfloor; + err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, Nleft * sizeof(unsigned int)); + //GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); // TODO: copy just the needed part of the buffer - mergeLeftMostSegment(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir); + mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); - // TODO: uncomment and fix sizeof - ////////////////////////////////// - //checkCudaErrors(cudaMemcpy(d_DstKey, d_SrcKey , N * sizeof(t_key), cudaMemcpyDeviceToDevice)); GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); } } @@ -536,48 +985,46 @@ static void sort( //cudaDeviceSynchronize(); } - - unsigned int roundDown(unsigned int numToRound, unsigned int multiple) { - if (numToRound <= multiple) - { - return numToRound; - } - else - { - return (numToRound / multiple) * multiple; - } + if (numToRound <= multiple) + { + return numToRound; + } + else + { + return (numToRound / multiple) * multiple; + } } void initMergeSort( - gpudata *d_RanksA, - gpudata *d_RanksB, - gpudata *d_LimitsA, - gpudata *d_LimitsB, + GpuArray *d_RanksA, + GpuArray *d_RanksB, + GpuArray *d_LimitsA, + GpuArray *d_LimitsB, unsigned int MAX_SAMPLE_COUNT, gpucontext *ctx ) { - /*cudaMalloc((void **)d_RanksA, MAX_SAMPLE_COUNT * sizeof(unsigned int)); - cudaMalloc((void **)d_RanksB, MAX_SAMPLE_COUNT * sizeof(unsigned int)); - cudaMalloc((void **)d_LimitsA, MAX_SAMPLE_COUNT * sizeof(unsigned int)); - cudaMalloc((void **)d_LimitsB, MAX_SAMPLE_COUNT * sizeof(unsigned int)); - */ - - int res = GA_NO_ERROR; + int res = GA_NO_ERROR; + const unsigned int nd = 1; + const size_t dims = MAX_SAMPLE_COUNT * sizeof(unsigned int); - d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + //d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); + res = GpuArray_empty(d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); - d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); - - d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, 
GA_BUFFER_READ_WRITE, &res); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + //d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); + res = GpuArray_empty(d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + + //d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); + res = GpuArray_empty(d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); - d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + //d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); + res = GpuArray_empty(d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); } @@ -588,7 +1035,8 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray * gpucontext *ctx = GpuArray_context(src); // Device pointers - auxiiary data structure - gpudata *d_RanksA = NULL, *d_RanksB = NULL, *d_LimitsA = NULL, *d_LimitsB = NULL; + //gpudata *d_RanksA = NULL, *d_RanksB = NULL, *d_LimitsA = NULL, *d_LimitsB = NULL; + GpuArray d_RanksA, d_RanksB, d_LimitsA, d_LimitsB; if (arg != NULL) { @@ -603,27 +1051,22 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray * const unsigned int Nfloor = roundDown(dims, SHARED_SIZE_LIMIT); const int Nleft = dims - Nfloor; - //const unsigned int DIR = 0; - // Device pointers - buffer data strucute GpuArray BufKey; GpuArray_empty(&BufKey, ctx, type, nd, &dims, GA_C_ORDER); - - //checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(t_key))); - // Initialize device auxiliary data structure - initMergeSort(d_RanksA, d_RanksB, d_LimitsA, d_LimitsB, Nfloor / 128, ctx); + initMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, Nfloor / 128, ctx); // perform regular sort sort( dst, &BufKey, src, - d_RanksA, - d_RanksB, - d_LimitsA, - d_LimitsB, + &d_RanksA, + &d_RanksB, + &d_LimitsA, + &d_LimitsB, dims, Nfloor, Nleft, @@ -638,11 +1081,7 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray * // stbr_append all the kernels.... - // Set arguments - - - - + // Set arguments } return 0; From 656a103d67447a031962b6d552f72f0f90bbb0b2 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Thu, 20 Jul 2017 17:49:57 +0200 Subject: [PATCH 06/19] cluda --- src/gpuarray_sort.c | 569 ++++++++++++++++++++------------------------ 1 file changed, 261 insertions(+), 308 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index ddbd87d980..1ba47fbcc3 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -4,69 +4,77 @@ #include #include - #include "util/strb.h" #include "private.h" -const int flags = GA_USE_CUDA; - -static const char *code_helper_funcs = \ -"\n#define SAMPLE_STRIDE 128 \n" \ -"\n#define SHARED_SIZE_LIMIT 1024U \n" \ -"__device__ unsigned int iDivUp(unsigned int a, unsigned int b)"\ -"{"\ -" return ((a % b) == 0) ? 
(a / b) : (a / b + 1); "\ -"} "\ -"__device__ unsigned int getSampleCount(unsigned int dividend) "\ -"{ "\ -" return iDivUp(dividend, SAMPLE_STRIDE); "\ -"}"\ -" \n #define W (sizeof(unsigned int) * 8) \n"\ -"__device__ unsigned int nextPowerOfTwo(unsigned int x) "\ -"{"\ -" return 1U << (W - __clz(x - 1));"\ -"} "\ -" __device__ unsigned int readArray(unsigned int *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ -" if (pos >= length) { " \ -" if (sortDir) { " \ -" return 4294967295; " \ -" } " \ -" else { " \ -" return 0; " \ -" } " \ -" } " \ -" else { " \ -" return a[pos]; " \ -" } " \ -" } " \ -" __device__ void writeArray(unsigned int *a, unsigned int pos, unsigned int value, unsigned int length) " \ -" { " \ -" if (pos >= length) " \ -" { " \ -" return; " \ -" } "\ -" else { " \ -" a[pos] = value; " \ -" } "\ +const int flags = GA_USE_CLUDA; + +static const char *code_helper_funcs = \ +"\n#define SAMPLE_STRIDE 128 \n" \ +"\n#define SHARED_SIZE_LIMIT 1024U \n" \ +"\ntypedef unsigned int t_key;\n" \ +"__device__ unsigned int iDivUp(unsigned int a, unsigned int b)" \ +"{" \ +" return ((a % b) == 0) ? (a / b) : (a / b + 1); " \ +"} " \ +"__device__ unsigned int getSampleCount(unsigned int dividend) " \ +"{ " \ +" return iDivUp(dividend, SAMPLE_STRIDE); " \ +"}" \ +" \n #define W (sizeof(unsigned int) * 8) \n" \ +"__device__ unsigned int nextPowerOfTwo(unsigned int x) " \ +"{" \ +" return 1U << (W - __clz(x - 1));" \ +"} " \ +" __device__ unsigned int readArray(t_key *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ +" if (pos >= length) { " \ +" if (sortDir) { " \ +" return 4294967295; " \ +" } " \ +" else { " \ +" return 0; " \ +" } " \ +" } " \ +" else { " \ +" return a[pos]; " \ +" } " \ +" } " \ +" __device__ void writeArray(t_key *a, unsigned int pos, t_key value, unsigned int length) " \ +" { " \ +" if (pos >= length) " \ +" { " \ +" return; " \ +" } " \ +" else { " \ +" a[pos] = value; " \ +" } " \ " }\n"; -static unsigned int iDivUp(unsigned int a, unsigned int b) -{ +static unsigned int iDivUp(unsigned int a, unsigned int b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); } -static unsigned int getSampleCount(unsigned int dividend) -{ +static unsigned int getSampleCount(unsigned int dividend) { return iDivUp(dividend, SAMPLE_STRIDE); } -static unsigned int ceiling(unsigned int n, unsigned int v) -{ - return (!n%v) ? 
n/v : (n/v) + 1; +static unsigned int roundDown(unsigned int numToRound, unsigned int multiple) { + if (numToRound <= multiple) + return numToRound; + else + return (numToRound / multiple) * multiple; +} + +static inline const char *ctype(int typecode) { + return gpuarray_get_type(typecode)->cluda_name; +} + +static inline size_t typesize(int typecode) { + return gpuarray_get_type(typecode)->size; } static const char *code_bin_search = \ -"__device__ unsigned int binarySearchInclusive(unsigned int val, unsigned int *data, unsigned int L, "\ +"__device__ unsigned int binarySearchInclusive(t_key val, t_key *data, unsigned int L, "\ " unsigned int stride, unsigned int sortDir){"\ " if (L == 0) "\ " return 0; "\ @@ -79,7 +87,7 @@ static const char *code_bin_search = " } "\ " return pos; "\ "} "\ -"__device__ unsigned int binarySearchExclusive(unsigned int val, unsigned int *data, unsigned int L, " \ +"__device__ unsigned int binarySearchExclusive(t_key val, t_key *data, unsigned int L, " \ " unsigned int stride, unsigned int sortDir) "\ "{ "\ " if (L == 0) "\ @@ -93,29 +101,29 @@ static const char *code_bin_search = " } "\ " return pos; "\ "}"\ -"__device__ unsigned int binarySearchLowerBoundExclusive(unsigned int val, unsigned int *ptr, unsigned int first, "\ -" unsigned int last, unsigned int sortDir) "\ +"__device__ unsigned int binarySearchLowerBoundExclusive(t_key val, t_key *ptr, unsigned int first," \ +" unsigned int last, unsigned int sortDir) " \ "{ "\ -" unsigned int len = last - first; "\ -" unsigned int half; "\ -" unsigned int middle; "\ +" unsigned int len = last - first; " \ +" unsigned int half; " \ +" unsigned int middle; " \ " while (len > 0) "\ " { "\ -" half = len >> 1; "\ -" middle = first; "\ -" middle += half; "\ +" half = len >> 1; " \ +" middle = first; " \ +" middle += half; " \ " if ( (sortDir && ptr[middle] < val) || (!sortDir && ptr[middle] > val) ) "\ " { "\ -" first = middle; "\ +" first = middle; " \ " ++first; "\ " len = len - half - 1; "\ " } "\ " else "\ -" len = half; "\ +" len = half; " \ " } "\ " return first; "\ "} "\ -"__device__ unsigned int binarySearchLowerBoundInclusive(unsigned int val, unsigned int *ptr, unsigned int first, "\ +"__device__ unsigned int binarySearchLowerBoundInclusive(t_key val, t_key *ptr, unsigned int first, "\ " unsigned int last, unsigned int sortDir) "\ "{ "\ " unsigned int len = last - first; "\ @@ -142,9 +150,9 @@ static const char *code_bin_search = const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_bitonic_smem = \ " extern \"C\" __global__ void bitonicSortSharedKernel( "\ -" unsigned int *d_DstKey, "\ +" t_key *d_DstKey, "\ " size_t dstOff," -" unsigned int *d_SrcKey, "\ +" t_key *d_SrcKey, "\ " size_t srcOff," " unsigned int batchSize, "\ " unsigned int arrayLength, "\ @@ -156,7 +164,7 @@ static const char *code_bitonic_smem = " d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ " d_DstKey += elemsOff;" \ " d_SrcKey += elemsOff;" \ -" __shared__ unsigned int s_key[SHARED_SIZE_LIMIT]; "\ +" __shared__ t_key s_key[SHARED_SIZE_LIMIT]; "\ " s_key[threadIdx.x] = readArray( d_SrcKey, "\ " blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ " arrayLength * batchSize, "\ @@ -218,18 +226,9 @@ static void bitonicSortShared( gpucontext *ctx ) { - char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; - size_t lens[NSTR_BITONIC] = {strlen(code_helper_funcs), strlen(code_bitonic_smem)}; - const char 
*codes[NSTR_BITONIC] = {code_helper_funcs, code_bitonic_smem}; - - err = GpuKernel_init( k_bitonic, ctx, NSTR_BITONIC, - codes, lens, "bitonicSortSharedKernel", - NUMARGS_BITONIC_KERNEL, type_args_bitonic, flags, &err_str); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); ls = SHARED_SIZE_LIMIT / 2; gs = batchSize; @@ -267,11 +266,11 @@ static void bitonicSortShared( const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_sample_ranks = \ "extern \"C\" __global__ void generateSampleRanksKernel(" \ -" unsigned int *d_RanksA,"\ -" size_t rankAOff," \ -" unsigned int *d_RanksB,"\ +" unsigned int *d_RanksA," \ +" size_t rankAOff," \ +" unsigned int *d_RanksB," \ " size_t rankBOff," \ -" unsigned int *d_SrcKey,"\ +" t_key *d_SrcKey,"\ " size_t srcOff," \ " unsigned int stride," \ " unsigned int N," \ @@ -328,18 +327,9 @@ static void generateSampleRanks( unsigned int lastSegmentElements = N % (2 * stride); unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; - const char *codes[NSTR_RANKS] = {code_helper_funcs, code_bin_search, code_sample_ranks}; - size_t lens[NSTR_RANKS] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; - - err = GpuKernel_init(k_ranks, ctx, NSTR_RANKS, - codes, lens, "generateSampleRanksKernel", - NUMARGS_SAMPLE_RANKS, type_args_ranks, flags, &err_str); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); ls = 256; gs = iDivUp(threadCount, 256); @@ -448,19 +438,9 @@ static void mergeRanksAndIndices( unsigned int lastSegmentElements = N % (2 * stride); unsigned int threadCount = (lastSegmentElements > stride) ? 
(N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - - char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; - const char *codes[NSTRINGS_RKS_IDX] = {code_helper_funcs, code_bin_search, code_ranks_idxs}; - size_t lens[NSTRINGS_RKS_IDX] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; - - err = GpuKernel_init(k_ranks_idxs, ctx, NSTRINGS_RKS_IDX, - codes, lens, "mergeRanksAndIndicesKernel", - NUMARGS_RANKS_IDXS, type_args_ranks_idxs, flags, &err_str); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); ls = 256U; gs = iDivUp(threadCount, 256U); @@ -507,7 +487,7 @@ static void mergeRanksAndIndices( if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); - unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); + /*unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); err = GpuArray_read(h_dst, (2048/128) * sizeof(unsigned int), d_LimitsB); if (err != GA_NO_ERROR) printf("error reading \n"); @@ -515,31 +495,6 @@ static void mergeRanksAndIndices( err = GpuArray_read(h_dst2, (2048/128) * sizeof(unsigned int), d_RanksB); if (err != GA_NO_ERROR) printf("error reading \n"); - /* - int i; - for (i = 0; i < 2048/128; i++) - { - printf("%d Limit %u Rank %u \n", i, h_dst[i], h_dst2[i]); - } - */ - - /*mergeRanksAndIndicesKernel<<>>( - d_LimitsA, - d_RanksA, - stride, - N, - threadCount - ); - printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - - mergeRanksAndIndicesKernel<<>>( - d_LimitsB, - d_RanksB, - stride, - N, - threadCount - ); - printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); */ } @@ -663,19 +618,9 @@ static void mergeElementaryIntervals( unsigned int lastSegmentElements = N % (2 * stride); unsigned int mergePairs = (lastSegmentElements > stride) ? 
getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; - char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; - const char *codes[NSTRINGS_MERGE] = {code_helper_funcs, code_bin_search, code_merge}; - size_t lens[NSTRINGS_MERGE] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; - - err = GpuKernel_init(k_merge, ctx, NSTRINGS_MERGE, - codes, lens, "mergeElementaryIntervalsKernel", - NUMARGS_MERGE, type_args_merge, flags, &err_str); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); - ls = SAMPLE_STRIDE; gs = mergePairs; @@ -714,36 +659,7 @@ static void mergeElementaryIntervals( if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); err = GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); - -/* - if (sortDir) - { - mergeElementaryIntervalsKernel<1U><<>>( - d_DstKey, - d_SrcKey, - d_LimitsA, - d_LimitsB, - stride, - N - ); - printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - } - else - { - - mergeElementaryIntervalsKernel<0U><<>>( - d_DstKey, - d_SrcKey, - d_LimitsA, - d_LimitsB, - stride, - N - ); - printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - - } - */ + if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); } #define NUMARGS_MERGE_GLB 8 @@ -794,22 +710,12 @@ static void mergeGlobalMem( gpucontext *ctx ) { - char *err_str = NULL; size_t ls, gs; unsigned int p = 0; int err; - const char *codes[NSTRINGS_MERGE_GLB] = {code_bin_search, code_merge_glb}; - size_t lens[NSTRINGS_MERGE_GLB] = {strlen(code_bin_search), strlen(code_merge_glb)}; - - err = GpuKernel_init(k_merge_global, ctx, NSTRINGS_MERGE_GLB, - codes, lens, "mergeGlobalMemKernel", - NUMARGS_MERGE_GLB, type_args_merge_glb, flags, &err_str); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); ls = 256; - gs = ceiling(N, ls); - + gs = iDivUp(N, ls); err = GpuKernel_setarg(k_merge_global, p++, d_DstKey->data); if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); @@ -837,20 +743,106 @@ static void mergeGlobalMem( err = GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL); if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); -/* - if (sortDir) - { - //mergeLeftMostSegmentKernel<1U><<>>(d_DstKey, d_SrcKey, segmentSizeA, segmentSizeB, N); - //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - } - else - { - //mergeLeftMostSegmentKernel<0U><<>>(d_DstKey, d_SrcKey, segmentSizeA, segmentSizeB, N); - //printLastCudaError(cudaGetLastError(), __LINE__, __FILE__); - } -*/ + +} + +static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, GpuKernel *k_merge_global, gpucontext *ctx) +{ + char *err_str = NULL; + int err; + + // Compile Bitonic sort Kernel + size_t lens_bitonic[NSTR_BITONIC] = {strlen(code_helper_funcs), strlen(code_bitonic_smem)}; + const char *codes_bitonic[NSTR_BITONIC] = {code_helper_funcs, code_bitonic_smem}; + + err = GpuKernel_init( k_bitonic, + ctx, + NSTR_BITONIC, + codes_bitonic, + lens_bitonic, + "bitonicSortSharedKernel", + NUMARGS_BITONIC_KERNEL, + type_args_bitonic, + flags, + &err_str + ); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + // Compile ranks kernel 
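+  // As with the bitonic kernel above, GpuKernel_init is passed an array of
+  // source fragments (shared helpers, binary-search routines, kernel body)
+  // together with their lengths; the fragments are concatenated into a
+  // single source before "generateSampleRanksKernel" is compiled.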
+ size_t lens_ranks[NSTR_RANKS] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; + const char *codes_ranks[NSTR_RANKS] = {code_helper_funcs, code_bin_search, code_sample_ranks}; + + err = GpuKernel_init( k_ranks, + ctx, + NSTR_RANKS, + codes_ranks, + lens_ranks, + "generateSampleRanksKernel", + NUMARGS_SAMPLE_RANKS, + type_args_ranks, + flags, + &err_str + ); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + // Compile ranks and idxs kernel + size_t lens_rks_idx[NSTRINGS_RKS_IDX] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; + const char *codes_rks_idx[NSTRINGS_RKS_IDX] = {code_helper_funcs, code_bin_search, code_ranks_idxs}; + + err = GpuKernel_init( k_ranks_idxs, + ctx, + NSTRINGS_RKS_IDX, + codes_rks_idx, + lens_rks_idx, + "mergeRanksAndIndicesKernel", + NUMARGS_RANKS_IDXS, + type_args_ranks_idxs, + flags, + &err_str + ); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + // Compile merge kernel + size_t lens_merge[NSTRINGS_MERGE] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; + const char *codes_merge[NSTRINGS_MERGE] = {code_helper_funcs, code_bin_search, code_merge}; + + err = GpuKernel_init( k_merge, + ctx, + NSTRINGS_MERGE, + codes_merge, + lens_merge, + "mergeElementaryIntervalsKernel", + NUMARGS_MERGE, + type_args_merge, + flags, + &err_str + ); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + + // Compile merge global kernel + size_t lens_merge_glb[NSTRINGS_MERGE_GLB] = {strlen(code_bin_search), strlen(code_merge_glb)}; + const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {code_bin_search, code_merge_glb}; + + err = GpuKernel_init( k_merge_global, + ctx, + NSTRINGS_MERGE_GLB, + codes_merge_glb, + lens_merge_glb, + "mergeGlobalMemKernel", + NUMARGS_MERGE_GLB, + type_args_merge_glb, + flags, + &err_str + ); + if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); + if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); } + static void sort( GpuArray *d_DstKey, GpuArray *d_BufKey, @@ -866,138 +858,98 @@ static void sort( gpucontext *ctx ) { - GpuArray *ikey, *okey; - GpuArray *t; // Aux pointer - - GpuKernel k_bitonic; - GpuKernel k_ranks; - GpuKernel k_ranks_idxs; - GpuKernel k_merge; - GpuKernel k_merge_global; - - size_t lstCopyOff; - int err; - - unsigned int stageCount = 0; - unsigned int stride; - for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1, stageCount++); - - if (stageCount & 1) - { - printf("bffkey\n"); - ikey = d_BufKey; - okey = d_DstKey; - } - else - { - printf("d_DstKey\n"); - ikey = d_DstKey; - okey = d_BufKey; - } - - ///////////////////////////////////////////////////////////////////////// - // Sort the array with bitonic sort for arrays shorter than 1024 elements - // Bitonic sort gives better performance than merge sort for short arrays - ///////////////////////////////////////////////////////////////////////// - - if (N <= SHARED_SIZE_LIMIT) - { - bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); - } - /////////////////////////////////////////////////////////////////////////////// - // Sort the array with merge sort for arrays equal or bigger than 1024 elements - 
/////////////////////////////////////////////////////////////////////////////// - else - { - unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; - unsigned int arrayLength = SHARED_SIZE_LIMIT; - bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); - - for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) - { - unsigned int lastSegmentElements = Nfloor % (2 * stride); - - //Find sample ranks and prepare for limiters merge - generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); - - //Merge ranks and indices - mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); - - //Merge elementary intervals - mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); - - if (lastSegmentElements <= stride) - { - //Last merge segment consists of a single array which just needs to be passed through - printf("inside last segment\n"); - // TODO: uncomment and fix sizeof - ////////////////////////////////// - //cudaMemcpy( okey + (Nfloor - lastSegmentElements), - // ikey + (Nfloor - lastSegmentElements), - // lastSegmentElements * sizeof(t_key), - // cudaMemcpyDeviceToDevice - // ); - - lstCopyOff = okey->offset + ((Nfloor - lastSegmentElements) * sizeof(unsigned int)); - err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * sizeof(unsigned int)); - if (err != GA_NO_ERROR) printf("error move data\n"); - //err = GpuArray_copy(okey, ikey, GA_C_ORDER); - //if (err != GA_NO_ERROR) printf("error move data\n"); - } - // Swap pointers - t = ikey; - ikey = okey; - okey = t; - } - - // If the array is not multiple of 1024, sort the leftmost part - // and perform merge sort of the two last segments - if (Nleft > 0) - { - printf("Sorting Remaining part %d \n", Nleft); - bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); - - unsigned int *h_dst = (unsigned int *) malloc ( N * sizeof(unsigned int)); - err = GpuArray_read(h_dst, N * sizeof(unsigned int), d_SrcKey); - if (err != GA_NO_ERROR) printf("error reading \n"); + int typecode = d_SrcKey->typecode; + size_t typeSize = typesize(typecode); - int i; - for (i = 0; i < N; i++) - { - printf("%d value %u \n", i, h_dst[i]); - } + size_t lstCopyOff; + int err; + GpuArray *ikey, *okey, *t; + GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, k_merge_global; + compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx); - // Copy the leftMost segment to the output array of which contains the first sorted sequence - // TODO: uncomment and fix sizeof - ////////////////////////////////// - //checkCudaErrors(cudaMemcpy(d_DstKey + Nfloor, d_SrcKey + Nfloor, Nleft * sizeof(t_key), cudaMemcpyDeviceToDevice)); - lstCopyOff = okey->offset + Nfloor; - err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, Nleft * sizeof(unsigned int)); - //GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); // TODO: copy just the needed part of the buffer + unsigned int stageCount = 0; + unsigned int stride; + for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1, stageCount++); - mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); + if (stageCount & 1) { + ikey = d_BufKey; + okey = d_DstKey; + } + else { + ikey = d_DstKey; + okey = d_BufKey; + } - GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); + ///////////////////////////////////////////////////////////////////////// + 
// Sort the array with bitonic sort for arrays shorter than 1024 elements + // Bitonic sort gives better performance than merge sort for short arrays + ///////////////////////////////////////////////////////////////////////// + if (N <= SHARED_SIZE_LIMIT) { + bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); + } + /////////////////////////////////////////////////////////////////////////////// + // Sort the array with merge sort for arrays equal or bigger than 1024 elements + /////////////////////////////////////////////////////////////////////////////// + else { + unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; + unsigned int arrayLength = SHARED_SIZE_LIMIT; + bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); + + for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) { + unsigned int lastSegmentElements = Nfloor % (2 * stride); + + //Find sample ranks and prepare for limiters merge + generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); + + //Merge ranks and indices + mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); + + //Merge elementary intervals + mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); + + if (lastSegmentElements <= stride) { + //Last merge segment consists of a single array which just needs to be passed through + lstCopyOff = okey->offset + ((Nfloor - lastSegmentElements) * typeSize); + err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * typeSize); + if (err != GA_NO_ERROR) printf("error move data\n"); } + // Swap pointers + t = ikey; + ikey = okey; + okey = t; } - //GpuArray_copy(d_DstKey, d_BufKey, GA_C_ORDER); - //cudaDeviceSynchronize(); -} + // If the array is not multiple of 1024, sort the remaining and merge + if (Nleft > 0) { + printf("Sorting Remaining part %d \n", Nleft); + bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); +/* + unsigned int *h_dst = (unsigned int *) malloc ( N * sizeof(unsigned int)); + err = GpuArray_read(h_dst, N * sizeof(unsigned int), d_SrcKey); + if (err != GA_NO_ERROR) printf("error reading \n"); -unsigned int roundDown(unsigned int numToRound, unsigned int multiple) -{ - if (numToRound <= multiple) - { - return numToRound; - } - else - { - return (numToRound / multiple) * multiple; + int i; + for (i = 0; i < N; i++) + { + printf("%d value %u \n", i, h_dst[i]); + } +*/ + + // Copy the leftMost segment to the output array of which contains the first sorted sequence + lstCopyOff = okey->offset + Nfloor * typeSize; + err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, Nleft * typeSize); + + mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); + GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); + } } + //GpuArray_copy(d_DstKey, d_BufKey, GA_C_ORDER); + //cudaDeviceSynchronize(); } -void initMergeSort( + + +static void initMergeSort( GpuArray *d_RanksA, GpuArray *d_RanksB, GpuArray *d_LimitsA, @@ -1027,15 +979,16 @@ void initMergeSort( if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); } - int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray *arg) { int type = src->typecode; gpucontext *ctx = GpuArray_context(src); + printf("%s\n", gpuarray_get_type(type)->cluda_name ); + printf("%u\n", typesize(type) ); + // Device pointers - auxiiary data 
structure - //gpudata *d_RanksA = NULL, *d_RanksB = NULL, *d_LimitsA = NULL, *d_LimitsB = NULL; GpuArray d_RanksA, d_RanksB, d_LimitsA, d_LimitsB; if (arg != NULL) From f3070ed6df4a06cedab63f133543bfdec81a3d11 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Thu, 20 Jul 2017 21:30:24 +0200 Subject: [PATCH 07/19] multi data type string generation --- src/gpuarray_sort.c | 191 +++++++++++++++++++++++++++----------------- 1 file changed, 117 insertions(+), 74 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 1ba47fbcc3..cd6ad904c4 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -8,11 +8,13 @@ #include "private.h" const int flags = GA_USE_CLUDA; +//"\ntypedef float t_key;\n" \ static const char *code_helper_funcs = \ "\n#define SAMPLE_STRIDE 128 \n" \ "\n#define SHARED_SIZE_LIMIT 1024U \n" \ -"\ntypedef unsigned int t_key;\n" \ +"\n#define MAX_NUM 340282346638528859811704183484516925440.000000F\n" \ +"\n#define MIN_NUM -340282346638528859811704183484516925440.000000F\n" \ "__device__ unsigned int iDivUp(unsigned int a, unsigned int b)" \ "{" \ " return ((a % b) == 0) ? (a / b) : (a / b + 1); " \ @@ -21,25 +23,25 @@ static const char *code_helper_funcs = "{ " \ " return iDivUp(dividend, SAMPLE_STRIDE); " \ "}" \ -" \n #define W (sizeof(unsigned int) * 8) \n" \ +"\n #define W (sizeof(unsigned int) * 8) \n" \ "__device__ unsigned int nextPowerOfTwo(unsigned int x) " \ "{" \ " return 1U << (W - __clz(x - 1));" \ "} " \ -" __device__ unsigned int readArray(t_key *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ +"template __device__ T readArray(T *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ " if (pos >= length) { " \ " if (sortDir) { " \ -" return 4294967295; " \ +" return MAX_NUM; " \ " } " \ " else { " \ -" return 0; " \ +" return MIN_NUM; " \ " } " \ " } " \ " else { " \ " return a[pos]; " \ " } " \ " } " \ -" __device__ void writeArray(t_key *a, unsigned int pos, t_key value, unsigned int length) " \ +"template __device__ void writeArray(T *a, unsigned int pos, T value, unsigned int length) " \ " { " \ " if (pos >= length) " \ " { " \ @@ -74,7 +76,7 @@ static inline size_t typesize(int typecode) { } static const char *code_bin_search = \ -"__device__ unsigned int binarySearchInclusive(t_key val, t_key *data, unsigned int L, "\ +"template __device__ unsigned int binarySearchInclusive(T val, T *data, unsigned int L, "\ " unsigned int stride, unsigned int sortDir){"\ " if (L == 0) "\ " return 0; "\ @@ -87,7 +89,7 @@ static const char *code_bin_search = " } "\ " return pos; "\ "} "\ -"__device__ unsigned int binarySearchExclusive(t_key val, t_key *data, unsigned int L, " \ +" template __device__ unsigned int binarySearchExclusive(T val, T *data, unsigned int L, " \ " unsigned int stride, unsigned int sortDir) "\ "{ "\ " if (L == 0) "\ @@ -101,8 +103,8 @@ static const char *code_bin_search = " } "\ " return pos; "\ "}"\ -"__device__ unsigned int binarySearchLowerBoundExclusive(t_key val, t_key *ptr, unsigned int first," \ -" unsigned int last, unsigned int sortDir) " \ +"template __device__ unsigned int binarySearchLowerBoundExclusive(T val, T *ptr, unsigned int first," \ +" unsigned int last, unsigned int sortDir) " \ "{ "\ " unsigned int len = last - first; " \ " unsigned int half; " \ @@ -123,8 +125,8 @@ static const char *code_bin_search = " } "\ " return first; "\ "} "\ -"__device__ unsigned int binarySearchLowerBoundInclusive(t_key val, t_key *ptr, unsigned int first, "\ -" unsigned int last, unsigned int sortDir) 
"\ +"template __device__ unsigned int binarySearchLowerBoundInclusive(T val, T *ptr, unsigned int first, "\ +" unsigned int last, unsigned int sortDir) "\ "{ "\ " unsigned int len = last - first; "\ " unsigned int half; "\ @@ -160,21 +162,21 @@ static const char *code_bitonic_smem = " unsigned int sortDir "\ " ) "\ " { "\ -" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ -" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ " d_DstKey += elemsOff;" \ " d_SrcKey += elemsOff;" \ " __shared__ t_key s_key[SHARED_SIZE_LIMIT]; "\ -" s_key[threadIdx.x] = readArray( d_SrcKey, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ -" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcKey, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ +" s_key[threadIdx.x] = readArray( d_SrcKey, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ +" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcKey, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ " for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) "\ " { "\ " unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); "\ @@ -182,7 +184,7 @@ static const char *code_bitonic_smem = " { "\ " __syncthreads(); "\ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ -" unsigned int t; "\ +" t_key t; "\ " if ((s_key[pos] > s_key[pos + stride]) == ddd) { "\ " t = s_key[pos]; "\ " s_key[pos] = s_key[pos + stride]; "\ @@ -194,7 +196,7 @@ static const char *code_bitonic_smem = " for (unsigned int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {" \ " __syncthreads(); "\ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ -" unsigned int t; "\ +" t_key t; "\ " if ((s_key[pos] > s_key[pos + stride]) == sortDir) {" \ " t = s_key[pos]; "\ " s_key[pos] = s_key[pos + stride]; "\ @@ -203,18 +205,17 @@ static const char *code_bitonic_smem = " } "\ " } "\ " __syncthreads(); "\ -" writeArray( d_DstKey, "\ +" writeArray( d_DstKey, "\ " blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ " s_key[threadIdx.x], "\ " arrayLength * batchSize "\ " ); "\ -" writeArray( d_DstKey, "\ +" writeArray( d_DstKey, "\ " blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ " s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], "\ " arrayLength * batchSize "\ " ); "\ " }\n"; -#define NSTR_BITONIC 2 static void bitonicSortShared( GpuArray *d_DstKey, GpuArray *d_SrcKey, @@ -260,6 +261,17 @@ static void bitonicSortShared( err = GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL); if (err != GA_NO_ERROR) printf("error calling kernel %d \n", p); + float *h_dst2 = (float *) malloc ( 16 * sizeof(float)); + err = GpuArray_read(h_dst2, 16 * sizeof(float), d_DstKey); + if (err != GA_NO_ERROR) printf("error reading \n"); + + /* + int i; + for (i = 0; i < 16; i++) + { + printf("%d afterbitonic %f \n", i, h_dst2[i]); + } + */ } #define NUMARGS_SAMPLE_RANKS 10 @@ -280,7 +292,7 @@ static const char *code_sample_ranks = "{" \ " d_RanksA = (unsigned int*) (((char*)d_RanksA)+ rankAOff);" \ " d_RanksB = (unsigned int*) (((char*)d_RanksB)+ rankBOff);" \ -" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" 
\ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ " unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;"\ " if (pos >= threadCount)" \ " {"\ @@ -298,7 +310,7 @@ static const char *code_sample_ranks = " if (i < segmentSamplesA)"\ " {"\ " d_RanksA[i] = i * SAMPLE_STRIDE;"\ -" d_RanksB[i] = binarySearchExclusive("\ +" d_RanksB[i] = binarySearchExclusive("\ " d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride,"\ " segmentElementsB, nextPowerOfTwo(segmentElementsB), sortDir"\ " );"\ @@ -306,13 +318,12 @@ static const char *code_sample_ranks = " if (i < segmentSamplesB)"\ " {"\ " d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;"\ -" d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive("\ +" d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive("\ " d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0,"\ " segmentElementsA, nextPowerOfTwo(segmentElementsA), sortDir"\ " );"\ " }"\ "}\n"; -#define NSTR_RANKS 3 static void generateSampleRanks( GpuArray *d_RanksA, GpuArray *d_RanksB, @@ -413,16 +424,15 @@ static const char *code_ranks_idxs = " const unsigned int segmentSamplesB = getSampleCount(segmentElementsB); "\ " if (i < segmentSamplesA) "\ " { "\ -" unsigned int dstPos = binarySearchExclusive(d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB), 1U) + i; "\ +" unsigned int dstPos = binarySearchExclusive(d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB), 1U) + i; "\ " d_Limits[dstPos] = d_Ranks[i]; "\ " } "\ " if (i < segmentSamplesB) "\ " { "\ -" unsigned int dstPos = binarySearchInclusive(d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA), 1U) + i; "\ +" unsigned int dstPos = binarySearchInclusive(d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA), 1U) + i; "\ " d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; "\ " } "\ "}\n"; -#define NSTRINGS_RKS_IDX 3 static void mergeRanksAndIndices( GpuArray *d_LimitsA, GpuArray *d_LimitsB, @@ -501,10 +511,10 @@ static void mergeRanksAndIndices( #define NUMARGS_MERGE 11 const int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; static const char *code_merge = \ -"__device__ void merge( "\ -" unsigned int *dstKey, "\ -" unsigned int *srcAKey, "\ -" unsigned int *srcBKey, "\ +" template __device__ void merge( "\ +" T *dstKey, "\ +" T *srcAKey, "\ +" T *srcBKey, "\ " unsigned int lenA, "\ " unsigned int nPowTwoLenA, "\ " unsigned int lenB, "\ @@ -512,17 +522,17 @@ static const char *code_merge = \ " unsigned int sortDir "\ ") "\ "{ "\ -" unsigned int keyA, keyB; "\ +" T keyA, keyB; "\ " unsigned int dstPosA , dstPosB;"\ " if (threadIdx.x < lenA) "\ " { "\ " keyA = srcAKey[threadIdx.x]; "\ -" dstPosA = binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB, sortDir) + threadIdx.x; "\ +" dstPosA = binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB, sortDir) + threadIdx.x; "\ " } "\ " if (threadIdx.x < lenB) "\ " { "\ " keyB = srcBKey[threadIdx.x]; "\ -" dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; "\ +" dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; "\ " } "\ " __syncthreads(); "\ " if (threadIdx.x < lenA) "\ @@ -535,9 +545,9 @@ static const char *code_merge = \ " } "\ "} "\ "extern \"C\" __global__ void mergeElementaryIntervalsKernel( "\ -" unsigned int *d_DstKey, "\ +" t_key *d_DstKey, "\ " 
size_t dstOff," \ -" unsigned int *d_SrcKey, "\ +" t_key *d_SrcKey, "\ " size_t srcOff," \ " unsigned int *d_LimitsA, "\ " size_t limAOff," \ @@ -548,11 +558,11 @@ static const char *code_merge = \ " unsigned int sortDir" ") "\ "{ "\ -" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ -" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ " d_LimitsA = (unsigned int*) (((char*)d_LimitsA)+ limAOff);" \ " d_LimitsB = (unsigned int*) (((char*)d_LimitsB)+ limBOff);" \ -" __shared__ unsigned int s_key[2 * SAMPLE_STRIDE]; "\ +" __shared__ t_key s_key[2 * SAMPLE_STRIDE]; "\ " const unsigned int intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); "\ " const unsigned int segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; "\ " d_SrcKey += segmentBase; "\ @@ -584,7 +594,7 @@ static const char *code_merge = \ " s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x]; "\ " } "\ " __syncthreads(); "\ -" merge( "\ +" merge( "\ " s_key, "\ " s_key + 0, "\ " s_key + SAMPLE_STRIDE, "\ @@ -602,7 +612,6 @@ static const char *code_merge = \ " d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; "\ " } "\ "}\n"; -#define NSTRINGS_MERGE 3 static void mergeElementaryIntervals( GpuArray *d_DstKey, GpuArray *d_SrcKey, @@ -666,9 +675,9 @@ static void mergeElementaryIntervals( const int type_args_merge_glb[NUMARGS_MERGE_GLB] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_merge_glb = \ "extern \"C\" __global__ void mergeGlobalMemKernel( "\ -" unsigned int *d_DstKey, "\ +" t_key *d_DstKey, "\ " size_t dstOff, "\ -" unsigned int *d_SrcKey, "\ +" t_key *d_SrcKey, "\ " size_t srcOff, "\ " unsigned int segmentSizeA, "\ " unsigned int segmentSizeB, "\ @@ -676,29 +685,28 @@ static const char *code_merge_glb = \ " unsigned int sortDir "\ ") "\ "{ "\ -" d_DstKey = (unsigned int*) (((char*)d_DstKey)+ dstOff);" \ -" d_SrcKey = (unsigned int*) (((char*)d_SrcKey)+ srcOff);" \ +" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ " unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; "\ -" unsigned int *segmentPtrA = d_SrcKey; "\ -" unsigned int *segmentPtrB = d_SrcKey + segmentSizeA; "\ +" t_key *segmentPtrA = d_SrcKey; "\ +" t_key *segmentPtrB = d_SrcKey + segmentSizeA; "\ " unsigned int idxSegmentA = idx % segmentSizeA; "\ " unsigned int idxSegmentB = idx - segmentSizeA; "\ " if (idx >= N) "\ " return; "\ -" unsigned int value = d_SrcKey[idx]; "\ +" t_key value = d_SrcKey[idx]; "\ " unsigned int dstPos; "\ " if (idx < segmentSizeA) "\ " { "\ -" dstPos = binarySearchLowerBoundExclusive(value, segmentPtrB, 0, segmentSizeB, sortDir) + idxSegmentA; "\ +" dstPos = binarySearchLowerBoundExclusive(value, segmentPtrB, 0, segmentSizeB, sortDir) + idxSegmentA; "\ " } "\ " else "\ " { "\ -" dstPos = binarySearchLowerBoundInclusive(value, segmentPtrA, 0, segmentSizeA, sortDir) + idxSegmentB; "\ +" dstPos = binarySearchLowerBoundInclusive(value, segmentPtrA, 0, segmentSizeA, sortDir) + idxSegmentB; "\ " } "\ " d_DstKey[dstPos] = value; "\ "}\n"; -#define NSTRINGS_MERGE_GLB 2 static void mergeGlobalMem( GpuArray *d_DstKey, GpuArray *d_SrcKey, @@ -746,14 +754,39 @@ static void mergeGlobalMem( } -static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, GpuKernel *k_merge_global, gpucontext *ctx) 
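+// The merge-sort kernels above reference an element type t_key rather than
+// a concrete C type. genMergeSortCode below builds a small type-specific
+// header at runtime (a t_key typedef using the CLUDA name of the array's
+// typecode), and compileKernels prepends that header as fragment 0 of every
+// kernel source, so one set of kernel strings serves all element types.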
+// Generate type specific GPU code +static void genMergeSortCode(char *str, int typecode) +{ + // Generate typedef for the data type to be sorted + sprintf(str, "typedef %s t_key;\n", ctype(typecode)); + printf("sssss %s\n", str); + + // Generate macro for MIN and MAX value of a given data type + + + + +} + +#define NSTR_BITONIC 3 +#define NSTR_RANKS 4 +#define NSTRINGS_RKS_IDX 4 +#define NSTRINGS_MERGE 4 +#define NSTRINGS_MERGE_GLB 4 +static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, + GpuKernel *k_merge_global, gpucontext *ctx, int typecode) { char *err_str = NULL; int err; + char code_typecode[100]; + genMergeSortCode(code_typecode, typecode); + // Compile Bitonic sort Kernel - size_t lens_bitonic[NSTR_BITONIC] = {strlen(code_helper_funcs), strlen(code_bitonic_smem)}; - const char *codes_bitonic[NSTR_BITONIC] = {code_helper_funcs, code_bitonic_smem}; + size_t lens_bitonic[NSTR_BITONIC] = {0, strlen(code_helper_funcs), strlen(code_bitonic_smem)}; + const char *codes_bitonic[NSTR_BITONIC] = {NULL, code_helper_funcs, code_bitonic_smem}; + lens_bitonic[0] = strlen(code_typecode); + codes_bitonic[0] = code_typecode; err = GpuKernel_init( k_bitonic, ctx, @@ -770,8 +803,10 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile ranks kernel - size_t lens_ranks[NSTR_RANKS] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; - const char *codes_ranks[NSTR_RANKS] = {code_helper_funcs, code_bin_search, code_sample_ranks}; + size_t lens_ranks[NSTR_RANKS] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; + const char *codes_ranks[NSTR_RANKS] = {NULL, code_helper_funcs, code_bin_search, code_sample_ranks}; + lens_ranks[0] = strlen(code_typecode); + codes_ranks[0] = code_typecode; err = GpuKernel_init( k_ranks, ctx, @@ -788,8 +823,10 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile ranks and idxs kernel - size_t lens_rks_idx[NSTRINGS_RKS_IDX] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; - const char *codes_rks_idx[NSTRINGS_RKS_IDX] = {code_helper_funcs, code_bin_search, code_ranks_idxs}; + size_t lens_rks_idx[NSTRINGS_RKS_IDX] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; + const char *codes_rks_idx[NSTRINGS_RKS_IDX] = {NULL, code_helper_funcs, code_bin_search, code_ranks_idxs}; + lens_rks_idx[0] = strlen(code_typecode); + codes_rks_idx[0] = code_typecode; err = GpuKernel_init( k_ranks_idxs, ctx, @@ -806,8 +843,10 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile merge kernel - size_t lens_merge[NSTRINGS_MERGE] = {strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; - const char *codes_merge[NSTRINGS_MERGE] = {code_helper_funcs, code_bin_search, code_merge}; + size_t lens_merge[NSTRINGS_MERGE] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; + const char *codes_merge[NSTRINGS_MERGE] = {NULL, code_helper_funcs, code_bin_search, code_merge}; + lens_merge[0] = strlen(code_typecode); + codes_merge[0] = code_typecode; err = GpuKernel_init( k_merge, ctx, @@ -824,8 +863,10 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel 
*k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile merge global kernel - size_t lens_merge_glb[NSTRINGS_MERGE_GLB] = {strlen(code_bin_search), strlen(code_merge_glb)}; - const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {code_bin_search, code_merge_glb}; + size_t lens_merge_glb[NSTRINGS_MERGE_GLB] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge_glb)}; + const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {NULL, code_helper_funcs, code_bin_search, code_merge_glb}; + lens_merge_glb[0] = strlen(code_typecode); + codes_merge_glb[0] = code_typecode; err = GpuKernel_init( k_merge_global, ctx, @@ -842,7 +883,6 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); } - static void sort( GpuArray *d_DstKey, GpuArray *d_BufKey, @@ -860,13 +900,12 @@ static void sort( { int typecode = d_SrcKey->typecode; size_t typeSize = typesize(typecode); - size_t lstCopyOff; int err; GpuArray *ikey, *okey, *t; GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, k_merge_global; - compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx); + compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, typecode); unsigned int stageCount = 0; unsigned int stride; @@ -979,6 +1018,7 @@ static void initMergeSort( if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); } + int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray *arg) { @@ -1011,6 +1051,9 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray * // Initialize device auxiliary data structure initMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, Nfloor / 128, ctx); + // Generate typecode specific code + + // perform regular sort sort( dst, From c40cf691090e6ef573830900d7413cc97192d298 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Fri, 21 Jul 2017 17:04:48 +0200 Subject: [PATCH 08/19] min max macros generation --- src/gpuarray_sort.c | 403 +++++++++++++++++++++++--------------------- 1 file changed, 214 insertions(+), 189 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index cd6ad904c4..0491bbb081 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -8,14 +10,11 @@ #include "private.h" const int flags = GA_USE_CLUDA; -//"\ntypedef float t_key;\n" \ static const char *code_helper_funcs = \ "\n#define SAMPLE_STRIDE 128 \n" \ "\n#define SHARED_SIZE_LIMIT 1024U \n" \ -"\n#define MAX_NUM 340282346638528859811704183484516925440.000000F\n" \ -"\n#define MIN_NUM -340282346638528859811704183484516925440.000000F\n" \ -"__device__ unsigned int iDivUp(unsigned int a, unsigned int b)" \ +"__device__ unsigned int iDivUp(unsigned int a, unsigned int b)" \ "{" \ " return ((a % b) == 0) ? 
(a / b) : (a / b + 1); " \ "} " \ @@ -41,9 +40,9 @@ static const char *code_helper_funcs = " return a[pos]; " \ " } " \ " } " \ -"template __device__ void writeArray(T *a, unsigned int pos, T value, unsigned int length) " \ +"template __device__ void writeArray(T *a, unsigned int pos, T value, unsigned int length) " \ " { " \ -" if (pos >= length) " \ +" if (pos >= length) " \ " { " \ " return; " \ " } " \ @@ -105,7 +104,7 @@ static const char *code_bin_search = "}"\ "template __device__ unsigned int binarySearchLowerBoundExclusive(T val, T *ptr, unsigned int first," \ " unsigned int last, unsigned int sortDir) " \ -"{ "\ +"{ "\ " unsigned int len = last - first; " \ " unsigned int half; " \ " unsigned int middle; " \ @@ -139,7 +138,7 @@ static const char *code_bin_search = " if ( (sortDir && ptr[middle] <= val) || (!sortDir && ptr[middle] >= val) ) "\ " { "\ " first = middle; "\ -" ++first; "\ +" ++first; "\ " len = len - half - 1; "\ " } "\ " else "\ @@ -216,7 +215,7 @@ static const char *code_bitonic_smem = " arrayLength * batchSize "\ " ); "\ " }\n"; -static void bitonicSortShared( +static int bitonicSortShared( GpuArray *d_DstKey, GpuArray *d_SrcKey, unsigned int batchSize, @@ -229,43 +228,46 @@ static void bitonicSortShared( { size_t ls, gs; unsigned int p = 0; - int err; + int err = GA_NO_ERROR; ls = SHARED_SIZE_LIMIT / 2; gs = batchSize; err = GpuKernel_setarg(k_bitonic, p++, d_DstKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, &batchSize); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, &arrayLength); - if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, &elemsOff); - if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_bitonic, p++, &sortDir); - if (err != GA_NO_ERROR) printf("eror setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling kernel %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; + error_call_basic: + return err; +/* float *h_dst2 = (float *) malloc ( 16 * sizeof(float)); err = GpuArray_read(h_dst2, 16 * sizeof(float), d_DstKey); if (err != GA_NO_ERROR) printf("error reading \n"); - /* + int i; for (i = 0; i < 16; i++) { @@ -278,7 +280,7 @@ static void bitonicSortShared( const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_sample_ranks = \ "extern \"C\" __global__ void generateSampleRanksKernel(" \ -" unsigned int *d_RanksA," \ +" unsigned int *d_RanksA," \ " size_t rankAOff," \ " unsigned int *d_RanksB," \ " size_t rankBOff," 
\ @@ -324,7 +326,7 @@ static const char *code_sample_ranks = " );"\ " }"\ "}\n"; -static void generateSampleRanks( +static int generateSampleRanks( GpuArray *d_RanksA, GpuArray *d_RanksB, GpuArray *d_SrcKey, @@ -340,44 +342,45 @@ static void generateSampleRanks( size_t ls, gs; unsigned int p = 0; - int err; + int err = GA_NO_ERROR; ls = 256; gs = iDivUp(threadCount, 256); err = GpuKernel_setarg(k_ranks, p++, d_RanksA->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &d_RanksA->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, d_RanksB->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &d_RanksB->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &stride); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &N); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &threadCount); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks, p++, &sortDir); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k_ranks, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); - + if (err != GA_NO_ERROR) goto error_call_basic; + error_call_basic: + return err; /*unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); @@ -407,7 +410,7 @@ static const char *code_ranks_idxs = " unsigned int stride, "\ " unsigned int N, "\ " unsigned int threadCount "\ -") "\ +") "\ "{ "\ " d_Limits = (unsigned int*) (((char*)d_Limits)+ limOff);" \ " d_Ranks = (unsigned int*) (((char*)d_Ranks)+ rankOff);" \ @@ -433,7 +436,7 @@ static const char *code_ranks_idxs = " d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; "\ " } "\ "}\n"; -static void mergeRanksAndIndices( +static int mergeRanksAndIndices( GpuArray *d_LimitsA, GpuArray *d_LimitsB, GpuArray *d_RanksA, @@ -446,56 +449,57 @@ static void mergeRanksAndIndices( ) { unsigned int lastSegmentElements = N % (2 * stride); - unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); - + unsigned int threadCount = (lastSegmentElements > stride) ? 
+ (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); size_t ls, gs; unsigned int p = 0; - int err; + int err = GA_NO_ERROR; ls = 256U; gs = iDivUp(threadCount, 256U); err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsA->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsA->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksA->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksA->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &stride); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &N); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &threadCount); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; p = 0; err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsB->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsB->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksB->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksB->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); - + if (err != GA_NO_ERROR) goto error_call_basic; + error_call_basic: + return err; /*unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); err = GpuArray_read(h_dst, (2048/128) * sizeof(unsigned int), d_LimitsB); @@ -579,7 +583,7 @@ static const char *code_merge = \ " startSrcB = d_LimitsB[blockIdx.x]; "\ " unsigned int endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA; "\ " unsigned int endSrcB = (intervalI + 1 < segmentSamples) ? 
d_LimitsB[blockIdx.x + 1] : segmentElementsB; "\ -" lenSrcA = endSrcA - startSrcA; "\ +" lenSrcA = endSrcA - startSrcA; "\ " lenSrcB = endSrcB - startSrcB; "\ " startDstA = startSrcA + startSrcB; "\ " startDstB = startDstA + lenSrcA; "\ @@ -612,7 +616,7 @@ static const char *code_merge = \ " d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; "\ " } "\ "}\n"; -static void mergeElementaryIntervals( +static int mergeElementaryIntervals( GpuArray *d_DstKey, GpuArray *d_SrcKey, GpuArray *d_LimitsA, @@ -629,46 +633,49 @@ static void mergeElementaryIntervals( size_t ls, gs; unsigned int p = 0; - int err; + int err = GA_NO_ERROR; ls = SAMPLE_STRIDE; gs = mergePairs; err = GpuKernel_setarg(k_merge, p++, d_DstKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, d_LimitsA->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &d_LimitsA->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, d_LimitsB->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &d_LimitsB->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &stride); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &N); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge, p++, &sortDir); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; + + error_call_basic: + return err; } #define NUMARGS_MERGE_GLB 8 @@ -683,7 +690,7 @@ static const char *code_merge_glb = \ " unsigned int segmentSizeB, "\ " unsigned int N, "\ " unsigned int sortDir "\ -") "\ +") "\ "{ "\ " d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ " d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ @@ -707,7 +714,7 @@ static const char *code_merge_glb = \ " d_DstKey[dstPos] = value; "\ "}\n"; -static void mergeGlobalMem( +static int mergeGlobalMem( GpuArray *d_DstKey, GpuArray *d_SrcKey, unsigned int segmentSizeA, @@ -720,52 +727,80 @@ static void mergeGlobalMem( { size_t ls, gs; unsigned int p = 0; - int err; + int err = GA_NO_ERROR; ls = 256; gs = iDivUp(N, ls); err = GpuKernel_setarg(k_merge_global, p++, d_DstKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto 
error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeA); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeB); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, &N); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_setarg(k_merge_global, p++, &sortDir); - if (err != GA_NO_ERROR) printf("error setting arg %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; err = GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) printf("error calling Ranks kernel %d \n", p); + if (err != GA_NO_ERROR) goto error_call_basic; + + error_call_basic: + return err; } // Generate type specific GPU code -static void genMergeSortCode(char *str, int typecode) +static void genMergeSortTypeCode(char *str, int typecode) { + int nchars = 0; // Generate typedef for the data type to be sorted - sprintf(str, "typedef %s t_key;\n", ctype(typecode)); - printf("sssss %s\n", str); + nchars = sprintf(str, "typedef %s t_key;\n", ctype(typecode)); // Generate macro for MIN and MAX value of a given data type - - - - + switch (typecode){ + case GA_UINT: + sprintf(str + nchars, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UINT_MAX, 0); + break; + case GA_INT: + sprintf(str + nchars, "#define MAX_NUM %d \n#define MIN_NUM %d \n", INT_MAX, INT_MIN); + break; + case GA_FLOAT: + sprintf(str + nchars, "#define MAX_NUM %f \n#define MIN_NUM %f \n", FLT_MAX, -FLT_MAX); + break; + case GA_DOUBLE: + sprintf(str + nchars, "#define MAX_NUM %g \n#define MIN_NUM %g \n", DBL_MAX, -DBL_MAX); + break; + case GA_UBYTE: + sprintf(str + nchars, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UCHAR_MAX, 0); + break; + case GA_BYTE: + sprintf(str + nchars, "#define MAX_NUM %d \n#define MIN_NUM %d \n", SCHAR_MAX, SCHAR_MIN); + break; + case GA_USHORT: + sprintf(str + nchars, "#define MAX_NUM %u \n#define MIN_NUM %u \n", USHRT_MAX, 0); + break; + case GA_SHORT: + sprintf(str + nchars, "#define MAX_NUM %d \n#define MIN_NUM %d \n", SHRT_MAX, SHRT_MIN); + break; + default: + fprintf(stderr, "Type %s not supported", ctype(typecode)); + break; + } } #define NSTR_BITONIC 3 @@ -777,15 +812,25 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * GpuKernel *k_merge_global, gpucontext *ctx, int typecode) { char *err_str = NULL; - int err; + int err = GA_NO_ERROR; + + size_t lens_bitonic[NSTR_BITONIC] = {0, strlen(code_helper_funcs), strlen(code_bitonic_smem)}; + size_t lens_ranks[NSTR_RANKS] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; + size_t lens_rks_idx[NSTRINGS_RKS_IDX] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; + size_t lens_merge[NSTRINGS_MERGE] = {0, 
strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; + size_t lens_merge_glb[NSTRINGS_MERGE_GLB] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge_glb)}; + + const char *codes_bitonic[NSTR_BITONIC] = {NULL, code_helper_funcs, code_bitonic_smem}; + const char *codes_ranks[NSTR_RANKS] = {NULL, code_helper_funcs, code_bin_search, code_sample_ranks}; + const char *codes_rks_idx[NSTRINGS_RKS_IDX] = {NULL, code_helper_funcs, code_bin_search, code_ranks_idxs}; + const char *codes_merge[NSTRINGS_MERGE] = {NULL, code_helper_funcs, code_bin_search, code_merge}; + const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {NULL, code_helper_funcs, code_bin_search, code_merge_glb}; - char code_typecode[100]; - genMergeSortCode(code_typecode, typecode); + char code_typecode[500]; + genMergeSortTypeCode(code_typecode, typecode); - // Compile Bitonic sort Kernel - size_t lens_bitonic[NSTR_BITONIC] = {0, strlen(code_helper_funcs), strlen(code_bitonic_smem)}; - const char *codes_bitonic[NSTR_BITONIC] = {NULL, code_helper_funcs, code_bitonic_smem}; - lens_bitonic[0] = strlen(code_typecode); + // Compile Bitonic sort Kernel + lens_bitonic[0] = strlen(code_typecode); codes_bitonic[0] = code_typecode; err = GpuKernel_init( k_bitonic, @@ -803,8 +848,6 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile ranks kernel - size_t lens_ranks[NSTR_RANKS] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_sample_ranks)}; - const char *codes_ranks[NSTR_RANKS] = {NULL, code_helper_funcs, code_bin_search, code_sample_ranks}; lens_ranks[0] = strlen(code_typecode); codes_ranks[0] = code_typecode; @@ -823,8 +866,6 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile ranks and idxs kernel - size_t lens_rks_idx[NSTRINGS_RKS_IDX] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_ranks_idxs)}; - const char *codes_rks_idx[NSTRINGS_RKS_IDX] = {NULL, code_helper_funcs, code_bin_search, code_ranks_idxs}; lens_rks_idx[0] = strlen(code_typecode); codes_rks_idx[0] = code_typecode; @@ -843,8 +884,6 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile merge kernel - size_t lens_merge[NSTRINGS_MERGE] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge)}; - const char *codes_merge[NSTRINGS_MERGE] = {NULL, code_helper_funcs, code_bin_search, code_merge}; lens_merge[0] = strlen(code_typecode); codes_merge[0] = code_typecode; @@ -863,8 +902,6 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); // Compile merge global kernel - size_t lens_merge_glb[NSTRINGS_MERGE_GLB] = {0, strlen(code_helper_funcs), strlen(code_bin_search), strlen(code_merge_glb)}; - const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {NULL, code_helper_funcs, code_bin_search, code_merge_glb}; lens_merge_glb[0] = strlen(code_typecode); codes_merge_glb[0] = code_typecode; @@ -883,7 +920,7 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); } -static void sort( +static int sort( GpuArray *d_DstKey, GpuArray *d_BufKey, GpuArray *d_SrcKey, @@ -901,14 +938,15 @@ static void sort( 
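Context for the sort() driver in the hunk that follows: the number of merge stages above the 1024-element bitonic base decides which buffer the bitonic pre-sort must target, so that the last ping-pong swap leaves the result in d_DstKey. A small stand-alone sketch of that parity rule, with an assumed example size (illustration only, not part of the patch):

#include <stdio.h>

#define SHARED_SIZE_LIMIT 1024U

int main(void)
{
  unsigned int Nfloor = 8192;   /* example size, a multiple of 1024 */
  unsigned int stageCount = 0;
  unsigned int stride;

  /* one merge stage per doubling from 1024 up to Nfloor: 8192 -> 3 stages */
  for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1)
    stageCount++;

  /* each stage writes ikey -> okey and then swaps the two pointers, so an
     odd stage count must start from the buffer opposite the destination */
  if (stageCount & 1)
    printf("%u stages: pre-sort into d_BufKey, merges end in d_DstKey\n",
           stageCount);
  else
    printf("%u stages: pre-sort into d_DstKey, merges end in d_DstKey\n",
           stageCount);
  return 0;
}

For N <= SHARED_SIZE_LIMIT the loop runs zero times and the bitonic kernel writes d_DstKey directly, which matches the first branch of sort() below.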
int typecode = d_SrcKey->typecode; size_t typeSize = typesize(typecode); size_t lstCopyOff; - int err; + int err = GA_NO_ERROR; + + unsigned int stageCount = 0; + unsigned int stride; GpuArray *ikey, *okey, *t; GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, k_merge_global; compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, typecode); - unsigned int stageCount = 0; - unsigned int stride; for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1, stageCount++); if (stageCount & 1) { @@ -920,32 +958,27 @@ static void sort( okey = d_BufKey; } - ///////////////////////////////////////////////////////////////////////// - // Sort the array with bitonic sort for arrays shorter than 1024 elements - // Bitonic sort gives better performance than merge sort for short arrays - ///////////////////////////////////////////////////////////////////////// + // Bitonic sort for arrays <= 1024 elements if (N <= SHARED_SIZE_LIMIT) { - bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); + err = bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); } - /////////////////////////////////////////////////////////////////////////////// - // Sort the array with merge sort for arrays equal or bigger than 1024 elements - /////////////////////////////////////////////////////////////////////////////// + // Merge - Bitonic sort for bigger arrays else { unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; unsigned int arrayLength = SHARED_SIZE_LIMIT; - bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); + err = bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) { unsigned int lastSegmentElements = Nfloor % (2 * stride); //Find sample ranks and prepare for limiters merge - generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); + err = generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); //Merge ranks and indices - mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); + err = mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); //Merge elementary intervals - mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); + err = mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); if (lastSegmentElements <= stride) { //Last merge segment consists of a single array which just needs to be passed through @@ -960,35 +993,20 @@ static void sort( } // If the array is not multiple of 1024, sort the remaining and merge if (Nleft > 0) { - printf("Sorting Remaining part %d \n", Nleft); - bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); -/* - unsigned int *h_dst = (unsigned int *) malloc ( N * sizeof(unsigned int)); - err = GpuArray_read(h_dst, N * sizeof(unsigned int), d_SrcKey); - if (err != GA_NO_ERROR) printf("error reading \n"); - - int i; - for (i = 0; i < N; i++) - { - printf("%d value %u \n", i, h_dst[i]); - } -*/ + err = bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); // Copy the leftMost segment to the output array of which contains the first sorted sequence lstCopyOff = okey->offset + Nfloor * typeSize; err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, Nleft * 
typeSize); - mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); - GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); + err = mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); + err = GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); } } - //GpuArray_copy(d_DstKey, d_BufKey, GA_C_ORDER); - //cudaDeviceSynchronize(); + return err; } - - -static void initMergeSort( +static int initMergeSort( GpuArray *d_RanksA, GpuArray *d_RanksB, GpuArray *d_LimitsA, @@ -997,40 +1015,57 @@ static void initMergeSort( gpucontext *ctx ) { - int res = GA_NO_ERROR; + int err = GA_NO_ERROR; const unsigned int nd = 1; const size_t dims = MAX_SAMPLE_COUNT * sizeof(unsigned int); - //d_RanksA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - res = GpuArray_empty(d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + err = GpuArray_empty(d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); - //d_RanksB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - res = GpuArray_empty(d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + err = GpuArray_empty(d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); - //d_LimitsA = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - res = GpuArray_empty(d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); + err = GpuArray_empty(d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); - //d_LimitsB = gpudata_alloc(ctx, MAX_SAMPLE_COUNT * sizeof(unsigned int), NULL, GA_BUFFER_READ_WRITE, &res); - res = GpuArray_empty(d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (res != GA_NO_ERROR) printf("error allocating aux structures %d\n", res); -} + err = GpuArray_empty(d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); + return err; +} -int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray *arg) +static void destroyMergeSort( + GpuArray *d_RanksA, + GpuArray *d_RanksB, + GpuArray *d_LimitsA, + GpuArray *d_LimitsB, + GpuArray *BufKey +) { + GpuArray_clear(d_RanksA); + GpuArray_clear(d_RanksB); + GpuArray_clear(d_LimitsA); + GpuArray_clear(d_LimitsB); + GpuArray_clear(BufKey); +} - int type = src->typecode; - gpucontext *ctx = GpuArray_context(src); - - printf("%s\n", gpuarray_get_type(type)->cluda_name ); - printf("%u\n", typesize(type) ); +int GpuArray_sort( + GpuArray *dst, + GpuArray *src, + unsigned int sortDir, + GpuArray *arg +) +{ + int err = GA_NO_ERROR; // Device pointers - auxiiary data structure GpuArray d_RanksA, d_RanksB, d_LimitsA, d_LimitsB; + //int type = src->typecode; + gpucontext *ctx = GpuArray_context(src); + + printf("Sorting GPU Array (%s -> %zu bytes)\n", gpuarray_get_type(src->typecode)->cluda_name, typesize(src->typecode)); + if (arg != NULL) { // perform argsort @@ -1044,42 +1079,32 @@ int GpuArray_sort(GpuArray *dst, GpuArray *src, unsigned int sortDir, GpuArray * const unsigned int Nfloor = 
roundDown(dims, SHARED_SIZE_LIMIT); const int Nleft = dims - Nfloor; - // Device pointers - buffer data strucute + // Buffer data strucute GpuArray BufKey; - GpuArray_empty(&BufKey, ctx, type, nd, &dims, GA_C_ORDER); + err = GpuArray_empty(&BufKey, ctx, src->typecode, nd, &dims, GA_C_ORDER); - // Initialize device auxiliary data structure + // Auxiliary data structure for MergeSort initMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, Nfloor / 128, ctx); - // Generate typecode specific code - - // perform regular sort - sort( - dst, - &BufKey, - src, - &d_RanksA, - &d_RanksB, - &d_LimitsA, - &d_LimitsB, - dims, - Nfloor, - Nleft, - sortDir, - ctx - ); - - - // type -> get typecode of the array - - // vectorType -> "type" - - // stbr_append all the kernels.... - - // Set arguments + err = sort( + dst, + &BufKey, + src, + &d_RanksA, + &d_RanksB, + &d_LimitsA, + &d_LimitsB, + dims, + Nfloor, + Nleft, + sortDir, + ctx + ); + + destroyMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, &BufKey); } - return 0; + return err; } From 19fa520ba48ae750a66a0a6580f95cf752d07eb7 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Mon, 24 Jul 2017 21:03:52 +0200 Subject: [PATCH 09/19] error checking --- src/gpuarray/sort.h | 2 +- src/gpuarray_sort.c | 1061 +++++++++++++++++++++---------------------- 2 files changed, 526 insertions(+), 537 deletions(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index efa26a3141..3752d37928 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -19,7 +19,7 @@ extern "C" { #define SAMPLE_STRIDE 128 -int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *arg); +int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg, GpuArray *srcArg); #ifdef __cplusplus diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 0491bbb081..ecc5ce3bbb 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -9,6 +9,23 @@ #include "util/strb.h" #include "private.h" +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + * This software contains source code provided by NVIDIA Corporation. 
+ * + * Read more at: http://docs.nvidia.com/cuda/eula/index.html#ixzz4lUbgXjsr + * Follow us: @GPUComputing on Twitter | NVIDIA on Facebook + * + * + */ + const int flags = GA_USE_CLUDA; static const char *code_helper_funcs = \ @@ -22,25 +39,25 @@ static const char *code_helper_funcs = "{ " \ " return iDivUp(dividend, SAMPLE_STRIDE); " \ "}" \ -"\n #define W (sizeof(unsigned int) * 8) \n" \ +"\n #define W (sizeof(unsigned int) * 8) \n" \ "__device__ unsigned int nextPowerOfTwo(unsigned int x) " \ "{" \ " return 1U << (W - __clz(x - 1));" \ "} " \ -"template __device__ T readArray(T *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ +"template __device__ T readArray(T *a, unsigned int pos, unsigned int length, unsigned int sortDir){" \ " if (pos >= length) { " \ " if (sortDir) { " \ -" return MAX_NUM; " \ +" return MAX_NUM; " \ " } " \ " else { " \ -" return MIN_NUM; " \ +" return MIN_NUM; " \ " } " \ " } " \ " else { " \ " return a[pos]; " \ " } " \ " } " \ -"template __device__ void writeArray(T *a, unsigned int pos, T value, unsigned int length) " \ +"template __device__ void writeArray(T *a, unsigned int pos, T value, unsigned int length) " \ " { " \ " if (pos >= length) " \ " { " \ @@ -74,146 +91,141 @@ static inline size_t typesize(int typecode) { return gpuarray_get_type(typecode)->size; } -static const char *code_bin_search = \ -"template __device__ unsigned int binarySearchInclusive(T val, T *data, unsigned int L, "\ -" unsigned int stride, unsigned int sortDir){"\ -" if (L == 0) "\ -" return 0; "\ -" unsigned int pos = 0; "\ -" for (; stride > 0; stride >>= 1){ "\ -" unsigned int newPos = min(pos + stride, L); "\ -" if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))){ "\ -" pos = newPos; "\ -" } "\ -" } "\ -" return pos; "\ -"} "\ -" template __device__ unsigned int binarySearchExclusive(T val, T *data, unsigned int L, " \ -" unsigned int stride, unsigned int sortDir) "\ -"{ "\ -" if (L == 0) "\ -" return 0; "\ -" unsigned int pos = 0; "\ -" for (; stride > 0; stride >>= 1){ "\ -" unsigned int newPos = min(pos + stride, L); "\ -" if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))){ "\ -" pos = newPos; "\ -" } "\ -" } "\ -" return pos; "\ -"}"\ -"template __device__ unsigned int binarySearchLowerBoundExclusive(T val, T *ptr, unsigned int first," \ -" unsigned int last, unsigned int sortDir) " \ -"{ "\ -" unsigned int len = last - first; " \ -" unsigned int half; " \ -" unsigned int middle; " \ -" while (len > 0) "\ -" { "\ -" half = len >> 1; " \ -" middle = first; " \ -" middle += half; " \ -" if ( (sortDir && ptr[middle] < val) || (!sortDir && ptr[middle] > val) ) "\ -" { "\ -" first = middle; " \ -" ++first; "\ -" len = len - half - 1; "\ -" } "\ -" else "\ -" len = half; " \ -" } "\ -" return first; "\ -"} "\ -"template __device__ unsigned int binarySearchLowerBoundInclusive(T val, T *ptr, unsigned int first, "\ -" unsigned int last, unsigned int sortDir) "\ -"{ "\ -" unsigned int len = last - first; "\ -" unsigned int half; "\ -" unsigned int middle; "\ -" while (len > 0) "\ -" { "\ -" half = len >> 1; "\ -" middle = first; "\ -" middle += half; "\ -" if ( (sortDir && ptr[middle] <= val) || (!sortDir && ptr[middle] >= val) ) "\ -" { "\ -" first = middle; "\ -" ++first; "\ -" len = len - half - 1; "\ -" } "\ -" else "\ -" len = half; "\ -" } "\ -" return first; "\ +static const char *code_bin_search = \ +"template __device__ unsigned int binarySearchInclusive(T val, T *data, unsigned int L, " 
\ +" unsigned int stride, unsigned int sortDir){" \ +" if (L == 0) " \ +" return 0; " \ +" unsigned int pos = 0; " \ +" for (; stride > 0; stride >>= 1){ " \ +" unsigned int newPos = min(pos + stride, L); " \ +" if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))){ " \ +" pos = newPos; " \ +" } " \ +" } " \ +" return pos; " \ +"} " \ +" template __device__ unsigned int binarySearchExclusive(T val, T *data, unsigned int L, " \ +" unsigned int stride, unsigned int sortDir) " \ +"{ " \ +" if (L == 0) " \ +" return 0; " \ +" unsigned int pos = 0; " \ +" for (; stride > 0; stride >>= 1){ " \ +" unsigned int newPos = min(pos + stride, L); " \ +" if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))){ " \ +" pos = newPos; " \ +" } " \ +" } " \ +" return pos; " \ +"}" \ +"template __device__ unsigned int binarySearchLowerBoundExclusive(T val, T *ptr, unsigned int first," \ +" unsigned int last, unsigned int sortDir) " \ +"{ " \ +" unsigned int len = last - first; " \ +" unsigned int half; " \ +" unsigned int middle; " \ +" while (len > 0) { " \ +" half = len >> 1; " \ +" middle = first; " \ +" middle += half; " \ +" if ( (sortDir && ptr[middle] < val) || (!sortDir && ptr[middle] > val) ) { " \ +" first = middle; " \ +" ++first; " \ +" len = len - half - 1; " \ +" } " \ +" else " \ +" len = half; " \ +" } " \ +" return first; " \ +"} " \ +"template __device__ unsigned int binarySearchLowerBoundInclusive(T val, T *ptr, unsigned int first, " \ +" unsigned int last, unsigned int sortDir) " \ +"{ " \ +" unsigned int len = last - first; " \ +" unsigned int half; " \ +" unsigned int middle; " \ +" while (len > 0) { " \ +" half = len >> 1; " \ +" middle = first; " \ +" middle += half; " \ +" if ( (sortDir && ptr[middle] <= val) || (!sortDir && ptr[middle] >= val) ) { " \ +" first = middle; " \ +" ++first; " \ +" len = len - half - 1; " \ +" } " \ +" else " \ +" len = half; " \ +" } " \ +" return first; " \ "}\n"; #define NUMARGS_BITONIC_KERNEL 8 const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; -static const char *code_bitonic_smem = \ -" extern \"C\" __global__ void bitonicSortSharedKernel( "\ -" t_key *d_DstKey, "\ -" size_t dstOff," -" t_key *d_SrcKey, "\ -" size_t srcOff," -" unsigned int batchSize, "\ -" unsigned int arrayLength, "\ -" unsigned int elemsOff, " \ -" unsigned int sortDir "\ -" ) "\ -" { "\ -" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ -" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ -" d_DstKey += elemsOff;" \ -" d_SrcKey += elemsOff;" \ -" __shared__ t_key s_key[SHARED_SIZE_LIMIT]; "\ -" s_key[threadIdx.x] = readArray( d_SrcKey, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ -" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcKey, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ -" for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) "\ -" { "\ -" unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); "\ -" for (unsigned int stride = size / 2; stride > 0; stride >>= 1) "\ -" { "\ -" __syncthreads(); "\ -" unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ -" t_key t; "\ -" if ((s_key[pos] > s_key[pos + stride]) == ddd) { "\ -" t = s_key[pos]; "\ -" s_key[pos] = s_key[pos + stride]; "\ -" s_key[pos + stride] = t; "\ -" } "\ -" } 
"\ -" } "\ -" { "\ -" for (unsigned int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {" \ -" __syncthreads(); "\ -" unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); "\ -" t_key t; "\ -" if ((s_key[pos] > s_key[pos + stride]) == sortDir) {" \ -" t = s_key[pos]; "\ -" s_key[pos] = s_key[pos + stride]; "\ -" s_key[pos + stride] = t; "\ -" } "\ -" } "\ -" } "\ -" __syncthreads(); "\ -" writeArray( d_DstKey, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ -" s_key[threadIdx.x], "\ -" arrayLength * batchSize "\ -" ); "\ -" writeArray( d_DstKey, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), "\ -" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], "\ -" arrayLength * batchSize "\ -" ); "\ +static const char *code_bitonic_smem = \ +" extern \"C\" __global__ void bitonicSortSharedKernel( " \ +" t_key *d_DstKey, " \ +" size_t dstOff," \ +" t_key *d_SrcKey, " \ +" size_t srcOff," \ +" unsigned int batchSize, " \ +" unsigned int arrayLength, " \ +" unsigned int elemsOff, " \ +" unsigned int sortDir " \ +" ) " \ +" { " \ +" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ +" d_DstKey += elemsOff;" \ +" d_SrcKey += elemsOff;" \ +" __shared__ t_key s_key[SHARED_SIZE_LIMIT]; " \ +" s_key[threadIdx.x] = readArray( d_SrcKey, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ +" arrayLength * batchSize, " \ +" sortDir " \ +" ); " \ +" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcKey, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2),"\ +" arrayLength * batchSize, " \ +" sortDir " \ +" ); " \ +" for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) { " \ +" unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); " \ +" for (unsigned int stride = size / 2; stride > 0; stride >>= 1) " \ +" { " \ +" __syncthreads(); " \ +" unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); " \ +" t_key t; " \ +" if ((s_key[pos] > s_key[pos + stride]) == ddd) { " \ +" t = s_key[pos]; " \ +" s_key[pos] = s_key[pos + stride]; " \ +" s_key[pos + stride] = t; " \ +" } " \ +" } " \ +" } " \ +" { " \ +" for (unsigned int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {" \ +" __syncthreads(); " \ +" unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); " \ +" t_key t; " \ +" if ((s_key[pos] > s_key[pos + stride]) == sortDir) {" \ +" t = s_key[pos]; " \ +" s_key[pos] = s_key[pos + stride]; " \ +" s_key[pos + stride] = t; " \ +" } " \ +" } " \ +" } " \ +" __syncthreads(); " \ +" writeArray( d_DstKey, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ +" s_key[threadIdx.x], " \ +" arrayLength * batchSize " \ +" ); " \ +" writeArray( d_DstKey, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), " \ +" s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], " \ +" arrayLength * batchSize " \ +" ); " \ " }\n"; static int bitonicSortShared( GpuArray *d_DstKey, @@ -234,34 +246,33 @@ static int bitonicSortShared( gs = batchSize; err = GpuKernel_setarg(k_bitonic, p++, d_DstKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, 
&d_SrcKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, &batchSize); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, &arrayLength); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, &elemsOff); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, &sortDir); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) goto error_call_basic; - error_call_basic: - return err; + if (err != GA_NO_ERROR) return err; + return err; /* float *h_dst2 = (float *) malloc ( 16 * sizeof(float)); err = GpuArray_read(h_dst2, 16 * sizeof(float), d_DstKey); @@ -278,53 +289,50 @@ static int bitonicSortShared( #define NUMARGS_SAMPLE_RANKS 10 const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; -static const char *code_sample_ranks = \ -"extern \"C\" __global__ void generateSampleRanksKernel(" \ -" unsigned int *d_RanksA," \ -" size_t rankAOff," \ -" unsigned int *d_RanksB," \ -" size_t rankBOff," \ -" t_key *d_SrcKey,"\ -" size_t srcOff," \ -" unsigned int stride," \ -" unsigned int N," \ -" unsigned int threadCount,"\ -" unsigned int sortDir" \ -")" \ -"{" \ -" d_RanksA = (unsigned int*) (((char*)d_RanksA)+ rankAOff);" \ -" d_RanksB = (unsigned int*) (((char*)d_RanksB)+ rankBOff);" \ -" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ -" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;"\ -" if (pos >= threadCount)" \ -" {"\ -" return;"\ -" }"\ -" const unsigned int i = pos & ((stride / SAMPLE_STRIDE) - 1);"\ -" const unsigned int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);"\ -" d_SrcKey += segmentBase;"\ -" d_RanksA += segmentBase / SAMPLE_STRIDE;"\ -" d_RanksB += segmentBase / SAMPLE_STRIDE;"\ -" const unsigned int segmentElementsA = stride;"\ -" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride);"\ -" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA);"\ -" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB);"\ -" if (i < segmentSamplesA)"\ -" {"\ -" d_RanksA[i] = i * SAMPLE_STRIDE;"\ -" d_RanksB[i] = binarySearchExclusive("\ -" d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride,"\ -" segmentElementsB, nextPowerOfTwo(segmentElementsB), sortDir"\ -" );"\ -" }"\ -" if (i < segmentSamplesB)"\ -" {"\ -" d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;"\ -" d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive("\ -" d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0,"\ -" segmentElementsA, nextPowerOfTwo(segmentElementsA), sortDir"\ -" );"\ -" }"\ +static const char *code_sample_ranks = \ +"extern \"C\" __global__ void generateSampleRanksKernel(" \ +" unsigned int *d_RanksA," \ +" size_t rankAOff," \ +" unsigned int *d_RanksB," \ +" size_t rankBOff," \ +" t_key *d_SrcKey," \ +" size_t srcOff," \ +" unsigned int stride," \ +" unsigned int N," \ +" unsigned int threadCount," \ +" unsigned int sortDir" \ +")" \ +"{" \ +" d_RanksA = (unsigned int*) (((char*)d_RanksA)+ rankAOff);" \ +" d_RanksB = (unsigned int*) (((char*)d_RanksB)+ rankBOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ +" unsigned int 
pos = blockIdx.x * blockDim.x + threadIdx.x;" \ +" if (pos >= threadCount) {" \ +" return;" \ +" }" \ +" const unsigned int i = pos & ((stride / SAMPLE_STRIDE) - 1);" \ +" const unsigned int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);" \ +" d_SrcKey += segmentBase;" \ +" d_RanksA += segmentBase / SAMPLE_STRIDE;" \ +" d_RanksB += segmentBase / SAMPLE_STRIDE;" \ +" const unsigned int segmentElementsA = stride;" \ +" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride);" \ +" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA);" \ +" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB);" \ +" if (i < segmentSamplesA) {" \ +" d_RanksA[i] = i * SAMPLE_STRIDE;" \ +" d_RanksB[i] = binarySearchExclusive(" \ +" d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride," \ +" segmentElementsB, nextPowerOfTwo(segmentElementsB), sortDir" \ +" );" \ +" }" \ +" if (i < segmentSamplesB) {" \ +" d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;" \ +" d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive(" \ +" d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0," \ +" segmentElementsA, nextPowerOfTwo(segmentElementsA), sortDir" \ +" );" \ +" }" \ "}\n"; static int generateSampleRanks( GpuArray *d_RanksA, @@ -348,93 +356,77 @@ static int generateSampleRanks( gs = iDivUp(threadCount, 256); err = GpuKernel_setarg(k_ranks, p++, d_RanksA->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &d_RanksA->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, d_RanksB->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &d_RanksB->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &stride); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &N); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &threadCount); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &sortDir); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_ranks, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) goto error_call_basic; - error_call_basic: + if (err != GA_NO_ERROR) return err; + return err; - - - /*unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); - err = GpuArray_read(h_dst, (2048/128) * sizeof(unsigned int), d_RanksA); - if (err != GA_NO_ERROR) printf("error reading \n"); - - unsigned int *h_dst2 = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); - err = GpuArray_read(h_dst2, (2048/128) * sizeof(unsigned int), d_RanksB); - if (err != GA_NO_ERROR) printf("error reading \n"); - - int i; - for (i = 0; i < 2048/128; i++) - { - printf("%d rankA %u rankB %u \n", i, h_dst[i], h_dst2[i]); - } - */ } #define NUMARGS_RANKS_IDXS 7 const int 
type_args_ranks_idxs[NUMARGS_RANKS_IDXS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; -static const char *code_ranks_idxs = \ -"extern \"C\" __global__ void mergeRanksAndIndicesKernel( "\ -" unsigned int *d_Limits, "\ -" size_t limOff," \ -" unsigned int *d_Ranks, "\ -" size_t rankOff," \ -" unsigned int stride, "\ -" unsigned int N, "\ -" unsigned int threadCount "\ -") "\ -"{ "\ -" d_Limits = (unsigned int*) (((char*)d_Limits)+ limOff);" \ -" d_Ranks = (unsigned int*) (((char*)d_Ranks)+ rankOff);" \ -" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; "\ -" if (pos >= threadCount) "\ -" return; "\ -" const unsigned int i = pos & ((stride / SAMPLE_STRIDE) - 1); "\ -" const unsigned int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); "\ -" d_Ranks += (pos - i) * 2; "\ -" d_Limits += (pos - i) * 2; "\ -" const unsigned int segmentElementsA = stride; "\ -" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride); "\ -" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA); "\ -" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB); "\ -" if (i < segmentSamplesA) "\ -" { "\ -" unsigned int dstPos = binarySearchExclusive(d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB), 1U) + i; "\ -" d_Limits[dstPos] = d_Ranks[i]; "\ -" } "\ -" if (i < segmentSamplesB) "\ -" { "\ -" unsigned int dstPos = binarySearchInclusive(d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA), 1U) + i; "\ -" d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; "\ -" } "\ +static const char *code_ranks_idxs = \ +"extern \"C\" __global__ void mergeRanksAndIndicesKernel( " \ +" unsigned int *d_Limits, " \ +" size_t limOff," \ +" unsigned int *d_Ranks, " \ +" size_t rankOff," \ +" unsigned int stride, " \ +" unsigned int N, " \ +" unsigned int threadCount " \ +") " \ +"{ " \ +" d_Limits = (unsigned int*) (((char*)d_Limits)+ limOff);" \ +" d_Ranks = (unsigned int*) (((char*)d_Ranks)+ rankOff);" \ +" unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; " \ +" if (pos >= threadCount) " \ +" return; " \ +" const unsigned int i = pos & ((stride / SAMPLE_STRIDE) - 1); " \ +" const unsigned int segmentBase = (pos - i) * (2 * SAMPLE_STRIDE); " \ +" d_Ranks += (pos - i) * 2; " \ +" d_Limits += (pos - i) * 2; " \ +" const unsigned int segmentElementsA = stride; " \ +" const unsigned int segmentElementsB = min(stride, N - segmentBase - stride); " \ +" const unsigned int segmentSamplesA = getSampleCount(segmentElementsA); " \ +" const unsigned int segmentSamplesB = getSampleCount(segmentElementsB); " \ +" if (i < segmentSamplesA) { " \ +" unsigned int dstPos = binarySearchExclusive(d_Ranks[i], d_Ranks + segmentSamplesA," \ +" segmentSamplesB, nextPowerOfTwo(segmentSamplesB), 1U) + i;" \ +" d_Limits[dstPos] = d_Ranks[i]; " \ +" } " \ +" if (i < segmentSamplesB) { " \ +" unsigned int dstPos = binarySearchInclusive(d_Ranks[segmentSamplesA + i], d_Ranks," \ +" segmentSamplesA, nextPowerOfTwo(segmentSamplesA), 1U) + i;"\ +" d_Limits[dstPos] = d_Ranks[segmentSamplesA + i]; " \ +" } " \ "}\n"; static int mergeRanksAndIndices( GpuArray *d_LimitsA, @@ -459,162 +451,143 @@ static int mergeRanksAndIndices( gs = iDivUp(threadCount, 256U); err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsA->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsA->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + 
if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksA->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksA->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &stride); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &N); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &threadCount); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; p = 0; err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsB->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsB->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksB->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksB->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) goto error_call_basic; - error_call_basic: - return err; - - /*unsigned int *h_dst = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); - err = GpuArray_read(h_dst, (2048/128) * sizeof(unsigned int), d_LimitsB); - if (err != GA_NO_ERROR) printf("error reading \n"); - - unsigned int *h_dst2 = (unsigned int *) malloc ( (2048/128) * sizeof(unsigned int)); - err = GpuArray_read(h_dst2, (2048/128) * sizeof(unsigned int), d_RanksB); - if (err != GA_NO_ERROR) printf("error reading \n"); + if (err != GA_NO_ERROR) return err; - */ + return err; } #define NUMARGS_MERGE 11 const int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; -static const char *code_merge = \ -" template __device__ void merge( "\ -" T *dstKey, "\ -" T *srcAKey, "\ -" T *srcBKey, "\ -" unsigned int lenA, "\ -" unsigned int nPowTwoLenA, "\ -" unsigned int lenB, "\ -" unsigned int nPowTwoLenB, "\ -" unsigned int sortDir "\ -") "\ -"{ "\ -" T keyA, keyB; "\ -" unsigned int dstPosA , dstPosB;"\ -" if (threadIdx.x < lenA) "\ -" { "\ -" keyA = srcAKey[threadIdx.x]; "\ -" dstPosA = binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB, sortDir) + threadIdx.x; "\ -" } "\ -" if (threadIdx.x < lenB) "\ -" { "\ -" keyB = srcBKey[threadIdx.x]; "\ -" dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; "\ -" } "\ -" __syncthreads(); "\ -" if (threadIdx.x < lenA) "\ -" { "\ -" dstKey[dstPosA] = keyA; "\ -" } "\ -" if (threadIdx.x < lenB) "\ -" { "\ -" dstKey[dstPosB] = keyB; "\ -" } "\ -"} "\ -"extern \"C\" __global__ void mergeElementaryIntervalsKernel( "\ -" t_key *d_DstKey, "\ -" size_t dstOff," \ -" t_key *d_SrcKey, "\ -" size_t srcOff," \ -" unsigned int *d_LimitsA, "\ -" size_t limAOff," \ -" unsigned int *d_LimitsB, "\ -" size_t limBOff," \ -" unsigned int stride, "\ -" unsigned int N, "\ -" unsigned int sortDir" -") "\ -"{ "\ -" d_DstKey = (t_key*) 
(((char*)d_DstKey)+ dstOff);" \ -" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ -" d_LimitsA = (unsigned int*) (((char*)d_LimitsA)+ limAOff);" \ -" d_LimitsB = (unsigned int*) (((char*)d_LimitsB)+ limBOff);" \ -" __shared__ t_key s_key[2 * SAMPLE_STRIDE]; "\ -" const unsigned int intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); "\ -" const unsigned int segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; "\ -" d_SrcKey += segmentBase; "\ -" d_DstKey += segmentBase; "\ -" __shared__ unsigned int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; "\ -" if (threadIdx.x == 0) "\ -" { "\ -" unsigned int segmentElementsA = stride; "\ -" unsigned int segmentElementsB = min(stride, N - segmentBase - stride); "\ -" unsigned int segmentSamplesA = getSampleCount(segmentElementsA); "\ -" unsigned int segmentSamplesB = getSampleCount(segmentElementsB); "\ -" unsigned int segmentSamples = segmentSamplesA + segmentSamplesB; "\ -" startSrcA = d_LimitsA[blockIdx.x]; "\ -" startSrcB = d_LimitsB[blockIdx.x]; "\ -" unsigned int endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA; "\ -" unsigned int endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB; "\ -" lenSrcA = endSrcA - startSrcA; "\ -" lenSrcB = endSrcB - startSrcB; "\ -" startDstA = startSrcA + startSrcB; "\ -" startDstB = startDstA + lenSrcA; "\ -" } "\ -" __syncthreads(); "\ -" if (threadIdx.x < lenSrcA) "\ -" { "\ -" s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x]; "\ -" } "\ -" if (threadIdx.x < lenSrcB) "\ -" { "\ -" s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x]; "\ -" } "\ -" __syncthreads(); "\ -" merge( "\ -" s_key, "\ -" s_key + 0, "\ -" s_key + SAMPLE_STRIDE, "\ -" lenSrcA, SAMPLE_STRIDE, "\ -" lenSrcB, SAMPLE_STRIDE, "\ -" sortDir "\ -" ); "\ -" __syncthreads(); "\ -" if (threadIdx.x < lenSrcA) "\ -" { "\ -" d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x]; "\ -" } "\ -" if (threadIdx.x < lenSrcB) "\ -" { "\ -" d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; "\ -" } "\ +static const char *code_merge = \ +" template __device__ void merge( " \ +" T *dstKey, " \ +" T *srcAKey, " \ +" T *srcBKey, " \ +" unsigned int lenA, " \ +" unsigned int nPowTwoLenA, " \ +" unsigned int lenB, " \ +" unsigned int nPowTwoLenB, " \ +" unsigned int sortDir " \ +") " \ +"{ " \ +" T keyA, keyB; " \ +" unsigned int dstPosA , dstPosB;" \ +" if (threadIdx.x < lenA) { " \ +" keyA = srcAKey[threadIdx.x]; " \ +" dstPosA = binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB, sortDir) + threadIdx.x; " \ +" } " \ +" if (threadIdx.x < lenB) { " \ +" keyB = srcBKey[threadIdx.x]; " \ +" dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; " \ +" } " \ +" __syncthreads(); " \ +" if (threadIdx.x < lenA) { " \ +" dstKey[dstPosA] = keyA; " \ +" } " \ +" if (threadIdx.x < lenB) { " \ +" dstKey[dstPosB] = keyB; " \ +" } " \ +"} " \ +"extern \"C\" __global__ void mergeElementaryIntervalsKernel( " \ +" t_key *d_DstKey, " \ +" size_t dstOff," \ +" t_key *d_SrcKey, " \ +" size_t srcOff," \ +" unsigned int *d_LimitsA, " \ +" size_t limAOff," \ +" unsigned int *d_LimitsB, " \ +" size_t limBOff," \ +" unsigned int stride, " \ +" unsigned int N, " \ +" unsigned int sortDir" \ +") " \ +"{ " \ +" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ +" d_LimitsA = (unsigned int*) (((char*)d_LimitsA)+ limAOff);" \ +" 
d_LimitsB = (unsigned int*) (((char*)d_LimitsB)+ limBOff);" \ +" __shared__ t_key s_key[2 * SAMPLE_STRIDE]; " \ +" const unsigned int intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); " \ +" const unsigned int segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; " \ +" d_SrcKey += segmentBase; " \ +" d_DstKey += segmentBase; " \ +" __shared__ unsigned int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; " \ +" if (threadIdx.x == 0) { " \ +" unsigned int segmentElementsA = stride; " \ +" unsigned int segmentElementsB = min(stride, N - segmentBase - stride); " \ +" unsigned int segmentSamplesA = getSampleCount(segmentElementsA); " \ +" unsigned int segmentSamplesB = getSampleCount(segmentElementsB); " \ +" unsigned int segmentSamples = segmentSamplesA + segmentSamplesB; " \ +" startSrcA = d_LimitsA[blockIdx.x]; " \ +" startSrcB = d_LimitsB[blockIdx.x]; " \ +" unsigned int endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA; " \ +" unsigned int endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB; " \ +" lenSrcA = endSrcA - startSrcA; " \ +" lenSrcB = endSrcB - startSrcB; " \ +" startDstA = startSrcA + startSrcB; " \ +" startDstB = startDstA + lenSrcA; " \ +" } " \ +" __syncthreads(); " \ +" if (threadIdx.x < lenSrcA) { " \ +" s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x]; " \ +" } " \ +" if (threadIdx.x < lenSrcB) { " \ +" s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x]; " \ +" } " \ +" __syncthreads(); " \ +" merge( " \ +" s_key, " \ +" s_key + 0, " \ +" s_key + SAMPLE_STRIDE, " \ +" lenSrcA, SAMPLE_STRIDE, " \ +" lenSrcB, SAMPLE_STRIDE, " \ +" sortDir " \ +" ); " \ +" __syncthreads(); " \ +" if (threadIdx.x < lenSrcA) { " \ +" d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x]; " \ +" } " \ +" if (threadIdx.x < lenSrcB) { " \ +" d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; " \ +" } " \ "}\n"; static int mergeElementaryIntervals( GpuArray *d_DstKey, @@ -639,79 +612,76 @@ static int mergeElementaryIntervals( gs = mergePairs; err = GpuKernel_setarg(k_merge, p++, d_DstKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, d_LimitsA->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &d_LimitsA->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, d_LimitsB->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &d_LimitsB->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &stride); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &N); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = 
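For a feel of the grid being set up here: the merged output is cut into SAMPLE_STRIDE-wide elementary intervals and each interval becomes one block of work, so mergePairs is essentially the merged length divided by the stride. A standalone arithmetic check with example sizes (not taken from a real run):

    #include <stdio.h>

    /* Tile-count arithmetic for mergeElementaryIntervalsKernel: one block
       per SAMPLE_STRIDE-wide elementary interval of the merged output. */
    int main(void) {
      unsigned int N = 4096, stride = 1024, SAMPLE_STRIDE = 128;
      unsigned int last = N % (2 * stride);                 /* 0 here */
      unsigned int mergePairs = (last > stride)
          ? (N + SAMPLE_STRIDE - 1) / SAMPLE_STRIDE         /* getSampleCount(N) */
          : (N - last) / SAMPLE_STRIDE;                     /* 4096/128 = 32 */
      printf("merge tiles (grid size) = %u\n", mergePairs);
      return 0;
    }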
GpuKernel_setarg(k_merge, p++, &sortDir); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; - error_call_basic: return err; } #define NUMARGS_MERGE_GLB 8 const int type_args_merge_glb[NUMARGS_MERGE_GLB] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; -static const char *code_merge_glb = \ -"extern \"C\" __global__ void mergeGlobalMemKernel( "\ -" t_key *d_DstKey, "\ -" size_t dstOff, "\ -" t_key *d_SrcKey, "\ -" size_t srcOff, "\ -" unsigned int segmentSizeA, "\ -" unsigned int segmentSizeB, "\ -" unsigned int N, "\ -" unsigned int sortDir "\ -") "\ -"{ "\ -" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ -" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ -" unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; "\ -" t_key *segmentPtrA = d_SrcKey; "\ -" t_key *segmentPtrB = d_SrcKey + segmentSizeA; "\ -" unsigned int idxSegmentA = idx % segmentSizeA; "\ -" unsigned int idxSegmentB = idx - segmentSizeA; "\ -" if (idx >= N) "\ -" return; "\ -" t_key value = d_SrcKey[idx]; "\ -" unsigned int dstPos; "\ -" if (idx < segmentSizeA) "\ -" { "\ -" dstPos = binarySearchLowerBoundExclusive(value, segmentPtrB, 0, segmentSizeB, sortDir) + idxSegmentA; "\ -" } "\ -" else "\ -" { "\ -" dstPos = binarySearchLowerBoundInclusive(value, segmentPtrA, 0, segmentSizeA, sortDir) + idxSegmentB; "\ -" } "\ -" d_DstKey[dstPos] = value; "\ +static const char *code_merge_glb = \ +"extern \"C\" __global__ void mergeGlobalMemKernel( " \ +" t_key *d_DstKey, " \ +" size_t dstOff, " \ +" t_key *d_SrcKey, " \ +" size_t srcOff, " \ +" unsigned int segmentSizeA, " \ +" unsigned int segmentSizeB, " \ +" unsigned int N, " \ +" unsigned int sortDir " \ +") " \ +"{ " \ +" d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ +" d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ +" unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; " \ +" t_key *segmentPtrA = d_SrcKey; " \ +" t_key *segmentPtrB = d_SrcKey + segmentSizeA; " \ +" unsigned int idxSegmentA = idx % segmentSizeA; " \ +" unsigned int idxSegmentB = idx - segmentSizeA; " \ +" if (idx >= N) " \ +" return; " \ +" t_key value = d_SrcKey[idx]; " \ +" unsigned int dstPos; " \ +" if (idx < segmentSizeA) { " \ +" dstPos = binarySearchLowerBoundExclusive(value, segmentPtrB, 0, segmentSizeB, sortDir) + idxSegmentA;" \ +" } " \ +" else { " \ +" dstPos = binarySearchLowerBoundInclusive(value, segmentPtrA, 0, segmentSizeA, sortDir) + idxSegmentB;" \ +" } " \ +" d_DstKey[dstPos] = value; " \ "}\n"; static int mergeGlobalMem( @@ -733,74 +703,73 @@ static int mergeGlobalMem( gs = iDivUp(N, ls); err = GpuKernel_setarg(k_merge_global, p++, d_DstKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge_global, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge_global, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge_global, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeA); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = 
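mergeGlobalMemKernel is the single-pass merge used to fold the ragged tail into the 1024-aligned prefix: every element independently binary-searches the opposite segment for its final slot. A scalar C model of that placement rule, ascending direction only, with a plain lower-bound search standing in for the kernel's binarySearchLowerBoundExclusive/Inclusive pair:

    #include <stdio.h>

    /* One "thread" per element: each key finds its destination by counting
       how many keys of the other segment precede it. */
    static unsigned int lower_bound(const int *a, unsigned int n, int v, int incl) {
      unsigned int lo = 0, hi = n;
      while (lo < hi) {
        unsigned int mid = (lo + hi) / 2;
        if (incl ? a[mid] <= v : a[mid] < v) lo = mid + 1; else hi = mid;
      }
      return lo;
    }

    int main(void) {
      int src[] = {1, 4, 9, 11,  3, 5};  /* segment A (4 sorted) + B (2 sorted) */
      int dst[6];
      unsigned int szA = 4, szB = 2, N = 6, idx, i;
      for (idx = 0; idx < N; idx++) {
        int v = src[idx];
        unsigned int pos = (idx < szA)
            ? idx + lower_bound(src + szA, szB, v, 0)     /* rank in B, exclusive */
            : (idx - szA) + lower_bound(src, szA, v, 1);  /* rank in A, inclusive */
        dst[pos] = v;
      }
      for (i = 0; i < N; i++) printf("%d ", dst[i]);      /* 1 3 4 5 9 11 */
      printf("\n");
      return 0;
    }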
GpuKernel_setarg(k_merge_global, p++, &segmentSizeB); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge_global, p++, &N); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge_global, p++, &sortDir); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) goto error_call_basic; + if (err != GA_NO_ERROR) return err; - error_call_basic: return err; - } // Generate type specific GPU code -static void genMergeSortTypeCode(char *str, int typecode) +static int genMergeSortTypeCode(strb *str, int typecode) { - int nchars = 0; + int err = GA_NO_ERROR; // Generate typedef for the data type to be sorted - nchars = sprintf(str, "typedef %s t_key;\n", ctype(typecode)); + strb_appendf(str, "typedef %s t_key;\n", ctype(typecode)); // Generate macro for MIN and MAX value of a given data type switch (typecode){ case GA_UINT: - sprintf(str + nchars, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UINT_MAX, 0); + strb_appendf(str, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UINT_MAX, 0); break; case GA_INT: - sprintf(str + nchars, "#define MAX_NUM %d \n#define MIN_NUM %d \n", INT_MAX, INT_MIN); + strb_appendf(str, "#define MAX_NUM %d \n#define MIN_NUM %d \n", INT_MAX, INT_MIN); break; case GA_FLOAT: - sprintf(str + nchars, "#define MAX_NUM %f \n#define MIN_NUM %f \n", FLT_MAX, -FLT_MAX); + strb_appendf(str, "#define MAX_NUM %f \n#define MIN_NUM %f \n", FLT_MAX, -FLT_MAX); break; case GA_DOUBLE: - sprintf(str + nchars, "#define MAX_NUM %g \n#define MIN_NUM %g \n", DBL_MAX, -DBL_MAX); + strb_appendf(str, "#define MAX_NUM %g \n#define MIN_NUM %g \n", DBL_MAX, -DBL_MAX); break; case GA_UBYTE: - sprintf(str + nchars, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UCHAR_MAX, 0); + strb_appendf(str, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UCHAR_MAX, 0); break; case GA_BYTE: - sprintf(str + nchars, "#define MAX_NUM %d \n#define MIN_NUM %d \n", SCHAR_MAX, SCHAR_MIN); + strb_appendf(str, "#define MAX_NUM %d \n#define MIN_NUM %d \n", SCHAR_MAX, SCHAR_MIN); break; case GA_USHORT: - sprintf(str + nchars, "#define MAX_NUM %u \n#define MIN_NUM %u \n", USHRT_MAX, 0); + strb_appendf(str, "#define MAX_NUM %u \n#define MIN_NUM %u \n", USHRT_MAX, 0); break; case GA_SHORT: - sprintf(str + nchars, "#define MAX_NUM %d \n#define MIN_NUM %d \n", SHRT_MAX, SHRT_MIN); + strb_appendf(str, "#define MAX_NUM %d \n#define MIN_NUM %d \n", SHRT_MAX, SHRT_MIN); break; default: - fprintf(stderr, "Type %s not supported", ctype(typecode)); + return GA_IMPL_ERROR; break; } + return strb_error(&str); } #define NSTR_BITONIC 3 @@ -808,7 +777,7 @@ static void genMergeSortTypeCode(char *str, int typecode) #define NSTRINGS_RKS_IDX 4 #define NSTRINGS_MERGE 4 #define NSTRINGS_MERGE_GLB 4 -static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, +static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, GpuKernel *k_merge_global, gpucontext *ctx, int typecode) { char *err_str = NULL; @@ -826,13 +795,13 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * const char *codes_merge[NSTRINGS_MERGE] = {NULL, code_helper_funcs, code_bin_search, code_merge}; const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {NULL, code_helper_funcs, code_bin_search, code_merge_glb}; - char 
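The generator's output is just a short prelude prepended to every kernel source. For GA_INT, for example, it would read roughly:

    typedef ga_int t_key;
    #define MAX_NUM 2147483647
    #define MIN_NUM -2147483648

Moving from a fixed char[500] filled by sprintf to a strb also removes the silent-truncation risk: strb grows on demand, and strb_error() reports any accumulated allocation failure once, at the end.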
code_typecode[500]; - genMergeSortTypeCode(code_typecode, typecode); + strb sb = STRB_STATIC_INIT; + err = genMergeSortTypeCode(&sb, typecode); + if (err != GA_NO_ERROR) return err; // Compile Bitonic sort Kernel - lens_bitonic[0] = strlen(code_typecode); - codes_bitonic[0] = code_typecode; - + lens_bitonic[0] = sb.l; + codes_bitonic[0] = sb.s; err = GpuKernel_init( k_bitonic, ctx, NSTR_BITONIC, @@ -844,13 +813,15 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * flags, &err_str ); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + if (err != GA_NO_ERROR) { + printf("error kernel init: %s \n", gpuarray_error_str(err)); + printf("error backend: %s \n", err_str); + return err; + } // Compile ranks kernel - lens_ranks[0] = strlen(code_typecode); - codes_ranks[0] = code_typecode; - + lens_ranks[0] = sb.l; + codes_ranks[0] = sb.s; err = GpuKernel_init( k_ranks, ctx, NSTR_RANKS, @@ -862,13 +833,15 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * flags, &err_str ); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + if (err != GA_NO_ERROR) { + printf("error kernel init: %s \n", gpuarray_error_str(err)); + printf("error backend: %s \n", err_str); + return err; + } // Compile ranks and idxs kernel - lens_rks_idx[0] = strlen(code_typecode); - codes_rks_idx[0] = code_typecode; - + lens_rks_idx[0] = sb.l; + codes_rks_idx[0] = sb.s; err = GpuKernel_init( k_ranks_idxs, ctx, NSTRINGS_RKS_IDX, @@ -880,13 +853,15 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * flags, &err_str ); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + if (err != GA_NO_ERROR) { + printf("error kernel init: %s \n", gpuarray_error_str(err)); + printf("error backend: %s \n", err_str); + return err; + } // Compile merge kernel - lens_merge[0] = strlen(code_typecode); - codes_merge[0] = code_typecode; - + lens_merge[0] = sb.l; + codes_merge[0] = sb.s; err = GpuKernel_init( k_merge, ctx, NSTRINGS_MERGE, @@ -898,13 +873,15 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * flags, &err_str ); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + if (err != GA_NO_ERROR) { + printf("error kernel init: %s \n", gpuarray_error_str(err)); + printf("error backend: %s \n", err_str); + return err; + } // Compile merge global kernel - lens_merge_glb[0] = strlen(code_typecode); - codes_merge_glb[0] = code_typecode; - + lens_merge_glb[0] = sb.l; + codes_merge_glb[0] = sb.s; err = GpuKernel_init( k_merge_global, ctx, NSTRINGS_MERGE_GLB, @@ -916,8 +893,12 @@ static void compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel * flags, &err_str ); - if (err != GA_NO_ERROR) printf("error kernel init: %s \n", gpuarray_error_str(err)); - if (err != GA_NO_ERROR) printf("error backend: %s \n", err_str); + if (err != GA_NO_ERROR) { + printf("error kernel init: %s \n", gpuarray_error_str(err)); + printf("error backend: %s \n", err_str); + return err; + } + return err; } static int sort( @@ -945,7 +926,8 @@ static int sort( GpuArray *ikey, *okey, *t; GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, 
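Note the call shape used for all five kernels below: the source is passed as an array of strings that the backend concatenates, which is what lets the generated prelude (sb.s / sb.l) be shared as string zero everywhere. A sketch of the pattern, condensed from the calls in this hunk (not verbatim):

    const char *codes[NSTR_BITONIC] = { sb.s, code_helper_funcs, code_bitonic_smem };
    size_t lens[NSTR_BITONIC]       = { sb.l, strlen(code_helper_funcs),
                                        strlen(code_bitonic_smem) };
    err = GpuKernel_init(k_bitonic, ctx,
                         NSTR_BITONIC, codes, lens,   /* count, sources, lengths */
                         "bitonicSortSharedKernel",
                         NUMARGS_BITONIC_KERNEL, type_args_bitonic,
                         flags, &err_str);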
k_merge_global; - compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, typecode); + err = compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, typecode); + if (err != GA_NO_ERROR) return err; for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1, stageCount++); @@ -958,33 +940,38 @@ static int sort( okey = d_BufKey; } - // Bitonic sort for arrays <= 1024 elements + // Bitonic sort for short arrays if (N <= SHARED_SIZE_LIMIT) { - err = bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); + err = bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); + if (err != GA_NO_ERROR) return err; } // Merge - Bitonic sort for bigger arrays else { unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; unsigned int arrayLength = SHARED_SIZE_LIMIT; err = bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); + if (err != GA_NO_ERROR) return err; for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) { unsigned int lastSegmentElements = Nfloor % (2 * stride); //Find sample ranks and prepare for limiters merge - err = generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); + err = generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); + if (err != GA_NO_ERROR) return err; //Merge ranks and indices err = mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); + if (err != GA_NO_ERROR) return err; //Merge elementary intervals err = mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); + if (err != GA_NO_ERROR) return err; if (lastSegmentElements <= stride) { //Last merge segment consists of a single array which just needs to be passed through lstCopyOff = okey->offset + ((Nfloor - lastSegmentElements) * typeSize); err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * typeSize); - if (err != GA_NO_ERROR) printf("error move data\n"); + if (err != GA_NO_ERROR) return err; } // Swap pointers t = ikey; @@ -994,13 +981,18 @@ static int sort( // If the array is not multiple of 1024, sort the remaining and merge if (Nleft > 0) { err = bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); + if (err != GA_NO_ERROR) return err; // Copy the leftMost segment to the output array of which contains the first sorted sequence lstCopyOff = okey->offset + Nfloor * typeSize; err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, Nleft * typeSize); + if (err != GA_NO_ERROR) return err; err = mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); + if (err != GA_NO_ERROR) return err; + err = GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; } } return err; @@ -1011,13 +1003,13 @@ static int initMergeSort( GpuArray *d_RanksB, GpuArray *d_LimitsA, GpuArray *d_LimitsB, - unsigned int MAX_SAMPLE_COUNT, + unsigned int len, + unsigned int nd, gpucontext *ctx ) { int err = GA_NO_ERROR; - const unsigned int nd = 1; - const size_t dims = MAX_SAMPLE_COUNT * sizeof(unsigned int); + const size_t dims = len * sizeof(unsigned int); err = GpuArray_empty(d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); @@ -1051,60 +1043,57 @@ static void destroyMergeSort( int GpuArray_sort( - GpuArray *dst, - GpuArray 
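The stageCount loop below counts how many merge passes will run. Each pass ping-pongs ikey/okey between the scratch buffer and the destination, so the parity of that count decides which array receives the initial bitonic pass in order for the last merge to land in d_DstKey. A standalone check of the parity rule:

    #include <stdio.h>

    /* Each merge pass swaps ikey/okey; the final pass must write the
       destination, so the pass count's parity picks the starting buffer. */
    int main(void) {
      unsigned int sizes[3] = {1024, 4096, 8192}, s;
      for (s = 0; s < 3; s++) {
        unsigned int Nfloor = sizes[s], stageCount = 0, stride;
        for (stride = 1024; stride < Nfloor; stride <<= 1, stageCount++);
        printf("Nfloor=%u passes=%u start=%s\n", Nfloor, stageCount,
               (stageCount & 1) ? "BufKey" : "DstKey");
      }
      /* 1024 -> 0 passes, DstKey; 4096 -> 2, DstKey; 8192 -> 3, BufKey */
      return 0;
    }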
*src, + GpuArray *dstKey, + GpuArray *srcKey, unsigned int sortDir, - GpuArray *arg + GpuArray *dstArg, + GpuArray *srcArg ) { int err = GA_NO_ERROR; + + const size_t dims = srcKey->dimensions[0]; + const unsigned int Nfloor = roundDown(dims, SHARED_SIZE_LIMIT); + const int Nleft = dims - Nfloor; + + // Buffer data structure + GpuArray BufKey, BufArg; + // Device pointers - auxiiary data structure GpuArray d_RanksA, d_RanksB, d_LimitsA, d_LimitsB; + + gpucontext *ctx = GpuArray_context(srcKey); - //int type = src->typecode; - gpucontext *ctx = GpuArray_context(src); - - printf("Sorting GPU Array (%s -> %zu bytes)\n", gpuarray_get_type(src->typecode)->cluda_name, typesize(src->typecode)); + if (srcKey->nd > 1) return GA_IMPL_ERROR; + if (dstArg != NULL || srcArg != NULL) return GA_IMPL_ERROR; - if (arg != NULL) - { - // perform argsort - assert(arg != NULL); - } - else - { - const unsigned int nd = 1; - const size_t dims = src->dimensions[0]; - - const unsigned int Nfloor = roundDown(dims, SHARED_SIZE_LIMIT); - const int Nleft = dims - Nfloor; - - // Buffer data strucute - GpuArray BufKey; - err = GpuArray_empty(&BufKey, ctx, src->typecode, nd, &dims, GA_C_ORDER); - - // Auxiliary data structure for MergeSort - initMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, Nfloor / 128, ctx); - - // perform regular sort - err = sort( - dst, - &BufKey, - src, - &d_RanksA, - &d_RanksB, - &d_LimitsA, - &d_LimitsB, - dims, - Nfloor, - Nleft, - sortDir, - ctx - ); - - destroyMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, &BufKey); + if (dstArg != NULL || srcArg != NULL) { + err = GpuArray_empty(&BufArg, ctx, GA_UINT, srcKey->nd, &dims, GA_C_ORDER); } - return err; + err = GpuArray_empty(&BufKey, ctx, srcKey->typecode, srcKey->nd, &dims, GA_C_ORDER); + + // Auxiliary data structure for MergeSort + err = initMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, Nfloor / 128, srcKey->nd, ctx); + + // perform regular sort + err = sort( + dstKey, + &BufKey, + srcKey, + &d_RanksA, + &d_RanksB, + &d_LimitsA, + &d_LimitsB, + dims, + Nfloor, + Nleft, + sortDir, + ctx + ); + + // Destroy auxiliary data structures + destroyMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, &BufKey); -} + return err; +} \ No newline at end of file From b5cec2a59e54c5ccf7bfeda339ad0d1ee0b11570 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Mon, 24 Jul 2017 21:09:53 +0200 Subject: [PATCH 10/19] CMakeList restored --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa5defc384..d6a96e7339 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.0) PROJECT(libgpuarray C) From 879ff5219a6b2a2674afe7eb38fa3842a73358c1 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Mon, 24 Jul 2017 21:28:02 +0200 Subject: [PATCH 11/19] change for merge --- src/gpuarray/sort.h | 2 +- src/gpuarray_sort.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index 3752d37928..4874698936 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -19,7 +19,7 @@ extern "C" { #define SAMPLE_STRIDE 128 -int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg, GpuArray *srcArg); +int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir); #ifdef __cplusplus diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index ecc5ce3bbb..235785e7a4 100644 --- a/src/gpuarray_sort.c +++ 
b/src/gpuarray_sort.c @@ -1045,9 +1045,7 @@ static void destroyMergeSort( int GpuArray_sort( GpuArray *dstKey, GpuArray *srcKey, - unsigned int sortDir, - GpuArray *dstArg, - GpuArray *srcArg + unsigned int sortDir ) { int err = GA_NO_ERROR; @@ -1057,19 +1055,20 @@ int GpuArray_sort( const int Nleft = dims - Nfloor; // Buffer data structure - GpuArray BufKey, BufArg; - + GpuArray BufKey; + // Device pointers - auxiiary data structure GpuArray d_RanksA, d_RanksB, d_LimitsA, d_LimitsB; gpucontext *ctx = GpuArray_context(srcKey); if (srcKey->nd > 1) return GA_IMPL_ERROR; - if (dstArg != NULL || srcArg != NULL) return GA_IMPL_ERROR; + // if (dstArg != NULL || srcArg != NULL) return GA_IMPL_ERROR; + /* if (dstArg != NULL || srcArg != NULL) { err = GpuArray_empty(&BufArg, ctx, GA_UINT, srcKey->nd, &dims, GA_C_ORDER); - } + }*/ err = GpuArray_empty(&BufKey, ctx, srcKey->typecode, srcKey->nd, &dims, GA_C_ORDER); From bba0a2d45dedf2b06d1bbed0c44a6126e4a5d9ed Mon Sep 17 00:00:00 2001 From: vcampmany Date: Wed, 26 Jul 2017 16:47:44 +0200 Subject: [PATCH 12/19] error fix and refactor --- src/gpuarray/sort.h | 28 ++- src/gpuarray_sort.c | 544 ++++++++++++++++++++++++++++++-------------- 2 files changed, 400 insertions(+), 172 deletions(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index 4874698936..ee05803f68 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -18,8 +18,32 @@ extern "C" { #define SHARED_SIZE_LIMIT 1024U #define SAMPLE_STRIDE 128 - -int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir); +typedef struct _GpuSortData { + GpuArray BufKey; + GpuArray BufArg; + GpuArray d_RanksA; + GpuArray d_RanksB; + GpuArray d_LimitsA; + GpuArray d_LimitsB; +} GpuSortData; + +typedef struct _GpuSortConfig { + unsigned int dims; + unsigned int Nfloor; + int Nleft; + unsigned int sortDirFlg; + unsigned int argSortFlg; + int typecode; + size_t typesize; +} GpuSortConfig; + +typedef struct _GpuSortBuffers { + GpuArray BufKey; + GpuArray BufArg; +} GpuSortBuff; + + +int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg); #ifdef __cplusplus diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 235785e7a4..596c8f4cd3 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -161,13 +161,22 @@ static const char *code_bin_search = "}\n"; #define NUMARGS_BITONIC_KERNEL 8 -const int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +#define NUMARGS_BITONIC_KERNEL_ARG 12 +int type_args_bitonic_arg[NUMARGS_BITONIC_KERNEL_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, + GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_bitonic_smem = \ " extern \"C\" __global__ void bitonicSortSharedKernel( " \ " t_key *d_DstKey, " \ " size_t dstOff," \ " t_key *d_SrcKey, " \ -" size_t srcOff," \ +" size_t srcOff,"\ +"\n#ifdef ARGSORT\n" \ +" t_arg *d_DstArg, "\ +" size_t dstArgOff, "\ +" t_arg *d_SrcArg, "\ +" size_t srcArgOff, " \ +"\n#endif\n"\ " unsigned int batchSize, " \ " unsigned int arrayLength, " \ " unsigned int elemsOff, " \ @@ -176,6 +185,13 @@ static const char *code_bitonic_smem = " { " \ " d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ " d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg = (t_arg*) (((char*)d_DstArg)+ dstArgOff); "\ +" d_SrcArg = (t_arg*) 
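The GpuSortConfig introduced by this patch centralizes the size split the driver consults everywhere: Nfloor is the largest multiple of SHARED_SIZE_LIMIT in the input length, and Nleft the ragged remainder that gets bitonic-sorted separately. Assuming roundDown(n, v) computes n - n % v (its definition sits outside these hunks), the split for a 5000-element array:

    #include <stdio.h>

    /* The size split carried in GpuSortConfig, for an example 1-D array
       of 5000 keys; roundDown semantics assumed as n - n % v. */
    int main(void) {
      unsigned int dims = 5000, SHARED_SIZE_LIMIT = 1024;
      unsigned int Nfloor = dims - dims % SHARED_SIZE_LIMIT; /* 4096: merge-sort part */
      int Nleft = (int)(dims - Nfloor);                      /* 904: bitonic tail     */
      printf("Nfloor=%u Nleft=%d\n", Nfloor, Nleft);
      return 0;
    }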
(((char*)d_SrcArg)+ srcArgOff);"\ +" d_DstArg += elemsOff;"\ +" d_SrcArg += elemsOff;" \ +" __shared__ t_arg s_arg[SHARED_SIZE_LIMIT];" \ +"\n#endif\n"\ " d_DstKey += elemsOff;" \ " d_SrcKey += elemsOff;" \ " __shared__ t_key s_key[SHARED_SIZE_LIMIT]; " \ @@ -183,12 +199,24 @@ static const char *code_bitonic_smem = " blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ " arrayLength * batchSize, " \ " sortDir " \ -" ); " \ +" ); " \ " s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcKey, " \ " blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2),"\ " arrayLength * batchSize, " \ " sortDir " \ " ); " \ +"\n#ifdef ARGSORT\n" +" s_arg[threadIdx.x] = readArray( d_SrcArg, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ +" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcArg," \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2),"\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ +"\n#endif\n" \ " for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) { " \ " unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); " \ " for (unsigned int stride = size / 2; stride > 0; stride >>= 1) " \ @@ -196,10 +224,16 @@ static const char *code_bitonic_smem = " __syncthreads(); " \ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); " \ " t_key t; " \ +" t_arg t2;" \ " if ((s_key[pos] > s_key[pos + stride]) == ddd) { " \ " t = s_key[pos]; " \ " s_key[pos] = s_key[pos + stride]; " \ " s_key[pos + stride] = t; " \ +"\n#ifdef ARGSORT\n" \ +" t2 = s_arg[pos];"\ +" s_arg[pos] = s_arg[pos + stride];" \ +" s_arg[pos + stride] = t2;" \ +"\n#endif\n" \ " } " \ " } " \ " } " \ @@ -208,10 +242,16 @@ static const char *code_bitonic_smem = " __syncthreads(); " \ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); " \ " t_key t; " \ +" t_arg t2; " \ " if ((s_key[pos] > s_key[pos + stride]) == sortDir) {" \ " t = s_key[pos]; " \ " s_key[pos] = s_key[pos + stride]; " \ " s_key[pos + stride] = t; " \ +"\n#ifdef ARGSORT\n" \ +" t2 = s_arg[pos];"\ +" s_arg[pos] = s_arg[pos + stride];" \ +" s_arg[pos + stride] = t2;" \ +"\n#endif\n" \ " } " \ " } " \ " } " \ @@ -226,14 +266,29 @@ static const char *code_bitonic_smem = " s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], " \ " arrayLength * batchSize " \ " ); " \ -" }\n"; +"\n#ifdef ARGSORT\n" \ +" writeArray( d_DstArg, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ +" s_arg[threadIdx.x], " \ +" arrayLength * batchSize " \ +" ); " \ +" writeArray( d_DstArg, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), " \ +" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], " \ +" arrayLength * batchSize " \ +" ); " \ +"\n#endif\n "\ +"}\n"; static int bitonicSortShared( GpuArray *d_DstKey, GpuArray *d_SrcKey, + GpuArray *d_DstArg, + GpuArray *d_SrcArg, unsigned int batchSize, unsigned int arrayLength, unsigned int sortDir, unsigned int elemsOff, + unsigned int argSortFlg, GpuKernel *k_bitonic, gpucontext *ctx ) @@ -250,12 +305,27 @@ static int bitonicSortShared( err = GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset); if (err != GA_NO_ERROR) return err; - + err = GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_bitonic, p++, &d_SrcKey->offset); if (err != GA_NO_ERROR) return err; + + if (argSortFlg) { + printf("in params\n"); + err = GpuKernel_setarg(k_bitonic, p++, d_DstArg->data); + if (err != GA_NO_ERROR) return 
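The ARGSORT blocks above keep the index payload glued to its key by mirroring every compare-exchange, and a single preprocessor definition, prepended by genMergeSortTypeCode only when an argsort is requested, compiles the payload handling in or out of one source. The pattern in miniature, as self-contained C:

    #include <stdio.h>

    /* One macro switches the payload swap in or out; the kernel does the
       same inside its stride loops. */
    #define ARGSORT  /* in the patch this comes from the generated prelude */

    static void compare_exchange(int *key, unsigned int *arg,
                                 unsigned int i, unsigned int j) {
      if (key[i] > key[j]) {
        int t = key[i]; key[i] = key[j]; key[j] = t;
    #ifdef ARGSORT
        unsigned int t2 = arg[i]; arg[i] = arg[j]; arg[j] = t2;  /* mirror swap */
    #endif
      }
    }

    int main(void) {
      int key[2] = {5, 2};
      unsigned int arg[2] = {0, 1};
      compare_exchange(key, arg, 0, 1);
      printf("%d %d | %u %u\n", key[0], key[1], arg[0], arg[1]); /* 2 5 | 1 0 */
      return 0;
    }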
err; + + err = GpuKernel_setarg(k_bitonic, p++, &d_DstArg->offset); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_bitonic, p++, d_SrcArg->data); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_bitonic, p++, &d_SrcArg->offset); + if (err != GA_NO_ERROR) return err; + } err = GpuKernel_setarg(k_bitonic, p++, &batchSize); if (err != GA_NO_ERROR) return err; @@ -288,7 +358,8 @@ static int bitonicSortShared( } #define NUMARGS_SAMPLE_RANKS 10 -const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, + GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_sample_ranks = \ "extern \"C\" __global__ void generateSampleRanksKernel(" \ " unsigned int *d_RanksA," \ @@ -335,18 +406,18 @@ static const char *code_sample_ranks = " }" \ "}\n"; static int generateSampleRanks( - GpuArray *d_RanksA, - GpuArray *d_RanksB, + GpuSortData *msData, GpuArray *d_SrcKey, unsigned int stride, - unsigned int N, - unsigned int sortDir, + GpuSortConfig *msConfig, GpuKernel *k_ranks, gpucontext *ctx ) { - unsigned int lastSegmentElements = N % (2 * stride); - unsigned int threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + unsigned int lastSegmentElements = msConfig->Nfloor % (2 * stride); + unsigned int threadCount = (lastSegmentElements > stride) ? + (msConfig->Nfloor + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (msConfig->Nfloor - lastSegmentElements) / (2 * SAMPLE_STRIDE); size_t ls, gs; unsigned int p = 0; @@ -355,16 +426,16 @@ static int generateSampleRanks( ls = 256; gs = iDivUp(threadCount, 256); - err = GpuKernel_setarg(k_ranks, p++, d_RanksA->data); + err = GpuKernel_setarg(k_ranks, p++, msData->d_RanksA.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks, p++, &d_RanksA->offset); + err = GpuKernel_setarg(k_ranks, p++, &msData->d_RanksA.offset); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks, p++, d_RanksB->data); + err = GpuKernel_setarg(k_ranks, p++, msData->d_RanksB.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks, p++, &d_RanksB->offset); + err = GpuKernel_setarg(k_ranks, p++, &msData->d_RanksB.offset); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, d_SrcKey->data); @@ -376,13 +447,13 @@ static int generateSampleRanks( err = GpuKernel_setarg(k_ranks, p++, &stride); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks, p++, &N); + err = GpuKernel_setarg(k_ranks, p++, &msConfig->Nfloor); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks, p++, &threadCount); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks, p++, &sortDir); + err = GpuKernel_setarg(k_ranks, p++, &msConfig->sortDirFlg); if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_ranks, 1, &gs, &ls, 0, NULL); @@ -429,20 +500,17 @@ static const char *code_ranks_idxs = " } " \ "}\n"; static int mergeRanksAndIndices( - GpuArray *d_LimitsA, - GpuArray *d_LimitsB, - GpuArray *d_RanksA, - GpuArray *d_RanksB, + GpuSortData *msData, unsigned int stride, - unsigned int N, - unsigned int sortDir, + GpuSortConfig *msConfig, GpuKernel *k_ranks_idxs, gpucontext *ctx ) { - unsigned int lastSegmentElements = N % (2 * stride); + unsigned int 
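One detail to keep in mind while reading these launches: the kernels treat arrayLength * batchSize as the count of valid elements, and lanes past it read a direction-dependent sentinel (readArray, later readArray_arg), so a tail shorter than a full 1024-element tile still sorts correctly. A plain-C model of the padded read, with int standing in for t_key:

    #include <limits.h>
    #include <stdio.h>

    /* Positions past the valid length return the sort direction's identity
       value, so a short tail behaves as if padded to a full tile. */
    static int read_padded(const int *a, unsigned int pos, unsigned int len,
                           int sortDir) {
      return pos < len ? a[pos] : (sortDir ? INT_MAX : INT_MIN);
    }

    int main(void) {
      int a[3] = {4, 1, 3};
      unsigned int i;
      for (i = 0; i < 4; i++)                   /* tile of 4, only 3 valid */
        printf("%d ", read_padded(a, i, 3, 1)); /* 4 1 3 2147483647 */
      printf("\n");
      return 0;
    }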
lastSegmentElements = msConfig->Nfloor % (2 * stride); unsigned int threadCount = (lastSegmentElements > stride) ? - (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE); + (msConfig->Nfloor + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : + (msConfig->Nfloor - lastSegmentElements) / (2 * SAMPLE_STRIDE); size_t ls, gs; unsigned int p = 0; int err = GA_NO_ERROR; @@ -450,22 +518,22 @@ static int mergeRanksAndIndices( ls = 256U; gs = iDivUp(threadCount, 256U); - err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsA->data); + err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_LimitsA.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsA->offset); + err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_LimitsA.offset); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksA->data); + err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_RanksA.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksA->offset); + err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_RanksA.offset); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &stride); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, &N); + err = GpuKernel_setarg(k_ranks_idxs, p++, &msConfig->Nfloor); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_ranks_idxs, p++, &threadCount); @@ -476,16 +544,16 @@ static int mergeRanksAndIndices( p = 0; - err = GpuKernel_setarg(k_ranks_idxs, p++, d_LimitsB->data); + err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_LimitsB.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, &d_LimitsB->offset); + err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_LimitsB.offset); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, d_RanksB->data); + err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_RanksB.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_ranks_idxs, p++, &d_RanksB->offset); + err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_RanksB.offset); if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); @@ -495,12 +563,21 @@ static int mergeRanksAndIndices( } #define NUMARGS_MERGE 11 -const int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; +int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, + GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; +#define NUMARGS_MERGE_ARG 15 +int type_args_merge_arg[NUMARGS_MERGE_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, + GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; static const char *code_merge = \ " template __device__ void merge( " \ " T *dstKey, " \ " T *srcAKey, " \ " T *srcBKey, " \ +"\n#ifdef ARGSORT\n" \ +" t_arg *dstVal, " \ +" t_arg *srcAVal, " \ +" t_arg *srcBVal, " \ +"\n#endif\n" \ " unsigned int lenA, " \ " unsigned int nPowTwoLenA, " \ " unsigned int lenB, " \ @@ -509,21 +586,34 @@ static const char *code_merge = ") " \ "{ " \ " T keyA, keyB; " \ +" t_arg valA, valB; " \ " unsigned int dstPosA , dstPosB;" \ " if (threadIdx.x < lenA) { " \ " keyA = srcAKey[threadIdx.x]; " \ +"\n#ifdef ARGSORT\n" \ +" valA = srcAVal[threadIdx.x]; " \ +"\n#endif\n" \ " dstPosA = 
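The threadCount formula sizes the rank pass at one thread per SAMPLE_STRIDE-wide sample in each pair of segments being merged; the lastSegmentElements branch only fires when Nfloor is not a multiple of 2 * stride. Checking the arithmetic standalone (Nfloor = 5120 exercises both branches):

    #include <stdio.h>

    /* Thread-count arithmetic of generateSampleRanks across the merge
       passes of an example Nfloor. */
    int main(void) {
      unsigned int Nfloor = 5120, SAMPLE_STRIDE = 128, stride;
      for (stride = 1024; stride < Nfloor; stride <<= 1) {
        unsigned int last = Nfloor % (2 * stride);
        unsigned int threadCount = (last > stride)
            ? (Nfloor + 2 * stride - last) / (2 * SAMPLE_STRIDE)
            : (Nfloor - last) / (2 * SAMPLE_STRIDE);
        printf("stride=%u threads=%u\n", stride, threadCount);
      }
      /* stride=1024 -> 16, stride=2048 -> 16, stride=4096 -> 32 */
      return 0;
    }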
binarySearchExclusive(keyA, srcBKey, lenB, nPowTwoLenB, sortDir) + threadIdx.x; " \ " } " \ " if (threadIdx.x < lenB) { " \ " keyB = srcBKey[threadIdx.x]; " \ +"\n#ifdef ARGSORT\n" \ +" valB = srcBVal[threadIdx.x]; " \ +"\n#endif\n" \ " dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; " \ " } " \ " __syncthreads(); " \ " if (threadIdx.x < lenA) { " \ " dstKey[dstPosA] = keyA; " \ +"\n#ifdef ARGSORT\n" \ +" dstVal[dstPosA] = valA; " \ +"\n#endif\n" \ " } " \ " if (threadIdx.x < lenB) { " \ " dstKey[dstPosB] = keyB; " \ +"\n#ifdef ARGSORT\n" \ +" dstVal[dstPosB] = valB; " \ +"\n#endif\n" \ " } " \ "} " \ "extern \"C\" __global__ void mergeElementaryIntervalsKernel( " \ @@ -531,6 +621,12 @@ static const char *code_merge = " size_t dstOff," \ " t_key *d_SrcKey, " \ " size_t srcOff," \ +"\n#ifdef ARGSORT\n" \ +" t_arg *d_DstArg, " \ +" size_t dstArgOff, " \ +" t_arg *d_SrcArg, " \ +" size_t srcArgOff, " \ +"\n#endif\n" \ " unsigned int *d_LimitsA, " \ " size_t limAOff," \ " unsigned int *d_LimitsB, " \ @@ -544,11 +640,20 @@ static const char *code_merge = " d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ " d_LimitsA = (unsigned int*) (((char*)d_LimitsA)+ limAOff);" \ " d_LimitsB = (unsigned int*) (((char*)d_LimitsB)+ limBOff);" \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg = (t_arg*) (((char*)d_DstArg)+ dstArgOff); " \ +" d_SrcArg = (t_arg*) (((char*)d_SrcArg)+ srcArgOff);" \ +" __shared__ t_arg s_arg[2 * SAMPLE_STRIDE]; " \ +"\n#endif\n" \ " __shared__ t_key s_key[2 * SAMPLE_STRIDE]; " \ " const unsigned int intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1); " \ " const unsigned int segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE; " \ " d_SrcKey += segmentBase; " \ " d_DstKey += segmentBase; " \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg += segmentBase; " \ +" d_SrcArg += segmentBase; " \ +"\n#endif\n" \ " __shared__ unsigned int startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB; " \ " if (threadIdx.x == 0) { " \ " unsigned int segmentElementsA = stride; " \ @@ -568,15 +673,26 @@ static const char *code_merge = " __syncthreads(); " \ " if (threadIdx.x < lenSrcA) { " \ " s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x]; " \ +"\n#ifdef ARGSORT\n" \ +" s_arg[threadIdx.x + 0] = d_SrcArg[0 + startSrcA + threadIdx.x]; " \ +"\n#endif\n" \ " } " \ " if (threadIdx.x < lenSrcB) { " \ " s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x]; " \ +"\n#ifdef ARGSORT\n" \ +" s_arg[threadIdx.x + SAMPLE_STRIDE] = d_SrcArg[stride + startSrcB + threadIdx.x]; " \ +"\n#endif\n" \ " } " \ " __syncthreads(); " \ " merge( " \ " s_key, " \ " s_key + 0, " \ " s_key + SAMPLE_STRIDE, " \ +"\n#ifdef ARGSORT\n" \ +" s_arg, " \ +" s_arg + 0, " \ +" s_arg + SAMPLE_STRIDE, " \ +"\n#endif\n" \ " lenSrcA, SAMPLE_STRIDE, " \ " lenSrcB, SAMPLE_STRIDE, " \ " sortDir " \ @@ -584,25 +700,31 @@ static const char *code_merge = " __syncthreads(); " \ " if (threadIdx.x < lenSrcA) { " \ " d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x]; " \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg[startDstA + threadIdx.x] = s_arg[threadIdx.x];" \ +"\n#endif\n" \ " } " \ " if (threadIdx.x < lenSrcB) { " \ " d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x]; " \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg[startDstB + threadIdx.x] = s_arg[lenSrcA + threadIdx.x];" \ +"\n#endif\n" \ " } " \ "}\n"; static int mergeElementaryIntervals( GpuArray *d_DstKey, GpuArray *d_SrcKey, - GpuArray *d_LimitsA, - GpuArray *d_LimitsB, + GpuSortData *msData, unsigned int 
stride, - unsigned int N, - unsigned int sortDir, + GpuSortConfig *msConfig, GpuKernel *k_merge, gpucontext *ctx ) { - unsigned int lastSegmentElements = N % (2 * stride); - unsigned int mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE; + unsigned int lastSegmentElements = msConfig->Nfloor % (2 * stride); + unsigned int mergePairs = (lastSegmentElements > stride) ? + getSampleCount(msConfig->Nfloor) : + (msConfig->Nfloor - lastSegmentElements) / SAMPLE_STRIDE; size_t ls, gs; unsigned int p = 0; @@ -623,25 +745,25 @@ static int mergeElementaryIntervals( err = GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge, p++, d_LimitsA->data); + err = GpuKernel_setarg(k_merge, p++, msData->d_LimitsA.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge, p++, &d_LimitsA->offset); + err = GpuKernel_setarg(k_merge, p++, &msData->d_LimitsA.offset); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge, p++, d_LimitsB->data); + err = GpuKernel_setarg(k_merge, p++, msData->d_LimitsB.data); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge, p++, &d_LimitsB->offset); + err = GpuKernel_setarg(k_merge, p++, &msData->d_LimitsB.offset); if (err != GA_NO_ERROR) return err; err = GpuKernel_setarg(k_merge, p++, &stride); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge, p++, &N); + err = GpuKernel_setarg(k_merge, p++, &msConfig->Nfloor); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge, p++, &sortDir); + err = GpuKernel_setarg(k_merge, p++, &msConfig->sortDirFlg); if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL); @@ -651,13 +773,24 @@ static int mergeElementaryIntervals( } #define NUMARGS_MERGE_GLB 8 -const int type_args_merge_glb[NUMARGS_MERGE_GLB] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +int type_args_merge_glb[NUMARGS_MERGE_GLB] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, + GA_UINT, GA_UINT, GA_UINT, GA_UINT}; +#define NUMARGS_MERGE_GLB_ARG 12 +int type_args_merge_glb_arg[NUMARGS_MERGE_GLB_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, + GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; + static const char *code_merge_glb = \ "extern \"C\" __global__ void mergeGlobalMemKernel( " \ " t_key *d_DstKey, " \ " size_t dstOff, " \ " t_key *d_SrcKey, " \ " size_t srcOff, " \ +"\n#ifdef ARGSORT\n" \ +" t_arg *d_DstArg, " \ +" size_t dstArgOff, " \ +" t_arg *d_SrcArg, " \ +" size_t srcArgOff, " \ +"\n#endif\n" \ " unsigned int segmentSizeA, " \ " unsigned int segmentSizeB, " \ " unsigned int N, " \ @@ -666,6 +799,10 @@ static const char *code_merge_glb = "{ " \ " d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ " d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg = (t_arg*) (((char*)d_DstArg)+ dstArgOff); " \ +" d_SrcArg = (t_arg*) (((char*)d_SrcArg)+ srcArgOff);" \ +"\n#endif\n" \ " unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; " \ " t_key *segmentPtrA = d_SrcKey; " \ " t_key *segmentPtrB = d_SrcKey + segmentSizeA; " \ @@ -674,6 +811,9 @@ static const char *code_merge_glb = " if (idx >= N) " \ " return; " \ " t_key value = d_SrcKey[idx]; " \ +"\n#ifdef ARGSORT\n" \ +" t_arg arg = d_SrcArg[idx]; " \ +"\n#endif\n" \ " unsigned int dstPos; " \ " if (idx < segmentSizeA) { " \ " dstPos = binarySearchLowerBoundExclusive(value, segmentPtrB, 0, 
segmentSizeB, sortDir) + idxSegmentA;" \ @@ -682,15 +822,15 @@ static const char *code_merge_glb = " dstPos = binarySearchLowerBoundInclusive(value, segmentPtrA, 0, segmentSizeA, sortDir) + idxSegmentB;" \ " } " \ " d_DstKey[dstPos] = value; " \ +"\n#ifdef ARGSORT\n" \ +" d_DstArg[dstPos] = arg; " \ +"\n#endif\n" \ "}\n"; static int mergeGlobalMem( GpuArray *d_DstKey, GpuArray *d_SrcKey, - unsigned int segmentSizeA, - unsigned int segmentSizeB, - unsigned int N, - unsigned int sortDir, + GpuSortConfig *msConfig, GpuKernel *k_merge_global, gpucontext *ctx ) @@ -698,9 +838,10 @@ static int mergeGlobalMem( size_t ls, gs; unsigned int p = 0; int err = GA_NO_ERROR; + unsigned int NleftC = (unsigned int)msConfig->Nleft; ls = 256; - gs = iDivUp(N, ls); + gs = iDivUp(msConfig->dims, (unsigned int)ls); err = GpuKernel_setarg(k_merge_global, p++, d_DstKey->data); if (err != GA_NO_ERROR) return err; @@ -714,16 +855,16 @@ static int mergeGlobalMem( err = GpuKernel_setarg(k_merge_global, p++, &d_SrcKey->offset); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeA); + err = GpuKernel_setarg(k_merge_global, p++, &msConfig->Nfloor); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge_global, p++, &segmentSizeB); + err = GpuKernel_setarg(k_merge_global, p++, &NleftC); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge_global, p++, &N); + err = GpuKernel_setarg(k_merge_global, p++, &msConfig->dims); if (err != GA_NO_ERROR) return err; - err = GpuKernel_setarg(k_merge_global, p++, &sortDir); + err = GpuKernel_setarg(k_merge_global, p++, &msConfig->sortDirFlg); if (err != GA_NO_ERROR) return err; err = GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL); @@ -733,11 +874,14 @@ static int mergeGlobalMem( } // Generate type specific GPU code -static int genMergeSortTypeCode(strb *str, int typecode) +static int genMergeSortTypeCode(strb *str, int typecode, unsigned int argSort) { - int err = GA_NO_ERROR; + if (argSort) + strb_appends(str, "\n#define ARGSORT\n"); + // Generate typedef for the data type to be sorted strb_appendf(str, "typedef %s t_key;\n", ctype(typecode)); + strb_appendf(str, "typedef %s t_arg;\n", "ga_uint"); // Generate macro for MIN and MAX value of a given data type switch (typecode){ @@ -769,7 +913,7 @@ static int genMergeSortTypeCode(strb *str, int typecode) return GA_IMPL_ERROR; break; } - return strb_error(&str); + return strb_error(str); } #define NSTR_BITONIC 3 @@ -778,7 +922,7 @@ static int genMergeSortTypeCode(strb *str, int typecode) #define NSTRINGS_MERGE 4 #define NSTRINGS_MERGE_GLB 4 static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, - GpuKernel *k_merge_global, gpucontext *ctx, int typecode) + GpuKernel *k_merge_global, gpucontext *ctx, GpuSortConfig *msConfig) { char *err_str = NULL; int err = GA_NO_ERROR; @@ -795,21 +939,32 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k const char *codes_merge[NSTRINGS_MERGE] = {NULL, code_helper_funcs, code_bin_search, code_merge}; const char *codes_merge_glb[NSTRINGS_MERGE_GLB] = {NULL, code_helper_funcs, code_bin_search, code_merge_glb}; + unsigned int nargs; + int *types; + strb sb = STRB_STATIC_INIT; - err = genMergeSortTypeCode(&sb, typecode); + err = genMergeSortTypeCode(&sb, msConfig->typecode, msConfig->argSortFlg); if (err != GA_NO_ERROR) return err; // Compile Bitonic sort Kernel lens_bitonic[0] = sb.l; codes_bitonic[0] = sb.s; + if (msConfig->argSortFlg) { + 
nargs = NUMARGS_BITONIC_KERNEL_ARG; + types = type_args_bitonic_arg; + } + else { + nargs = NUMARGS_BITONIC_KERNEL; + types = type_args_bitonic; + } err = GpuKernel_init( k_bitonic, ctx, NSTR_BITONIC, codes_bitonic, lens_bitonic, "bitonicSortSharedKernel", - NUMARGS_BITONIC_KERNEL, - type_args_bitonic, + nargs, + types, flags, &err_str ); @@ -859,6 +1014,14 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k return err; } + if (msConfig->argSortFlg) { + nargs = NUMARGS_MERGE_ARG; + types = type_args_merge_arg; + } + else { + nargs = NUMARGS_MERGE; + types = type_args_merge; + } // Compile merge kernel lens_merge[0] = sb.l; codes_merge[0] = sb.s; @@ -868,8 +1031,8 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k codes_merge, lens_merge, "mergeElementaryIntervalsKernel", - NUMARGS_MERGE, - type_args_merge, + nargs, + types, flags, &err_str ); @@ -879,6 +1042,14 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k return err; } + if (msConfig->argSortFlg) { + nargs = NUMARGS_MERGE_GLB_ARG; + types = type_args_merge_glb_arg; + } + else { + nargs = NUMARGS_MERGE_GLB; + types = type_args_merge_glb; + } // Compile merge global kernel lens_merge_glb[0] = sb.l; codes_merge_glb[0] = sb.s; @@ -903,75 +1074,70 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k static int sort( GpuArray *d_DstKey, - GpuArray *d_BufKey, GpuArray *d_SrcKey, - GpuArray *d_RanksA, - GpuArray *d_RanksB, - GpuArray *d_LimitsA, - GpuArray *d_LimitsB, - unsigned int N, - unsigned int Nfloor, - int Nleft, - unsigned int sortDir, + GpuArray *d_DstArg, + GpuArray *d_SrcArg, + GpuSortBuff *msBuff, + GpuSortData *msData, + GpuSortConfig *msConfig, gpucontext *ctx ) { - int typecode = d_SrcKey->typecode; - size_t typeSize = typesize(typecode); - size_t lstCopyOff; int err = GA_NO_ERROR; - + size_t lstCopyOff; unsigned int stageCount = 0; unsigned int stride; - GpuArray *ikey, *okey, *t; + GpuArray *ikey, *okey, *iarg, *oarg, *t; GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, k_merge_global; - err = compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, typecode); + err = compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, msConfig); if (err != GA_NO_ERROR) return err; - for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1, stageCount++); + for (stride = SHARED_SIZE_LIMIT; stride < msConfig->Nfloor; stride <<= 1, stageCount++); if (stageCount & 1) { - ikey = d_BufKey; + ikey = &msBuff->BufKey; okey = d_DstKey; + iarg = &msBuff->BufArg; + oarg = d_DstArg; } else { ikey = d_DstKey; - okey = d_BufKey; + okey = &msBuff->BufKey; + iarg = d_DstArg; + oarg = &msBuff->BufArg; } // Bitonic sort for short arrays - if (N <= SHARED_SIZE_LIMIT) { - err = bitonicSortShared(d_DstKey, d_SrcKey, 1, N, sortDir, 0, &k_bitonic, ctx); - if (err != GA_NO_ERROR) return err; + if (msConfig->dims <= SHARED_SIZE_LIMIT) { + + err = bitonicSortShared(d_DstKey, d_SrcKey, d_DstArg, d_SrcArg, 1, (unsigned int)msConfig->dims, + msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx); } // Merge - Bitonic sort for bigger arrays else { - unsigned int batchSize = Nfloor / SHARED_SIZE_LIMIT; + unsigned int batchSize = msConfig->Nfloor / SHARED_SIZE_LIMIT; unsigned int arrayLength = SHARED_SIZE_LIMIT; - err = bitonicSortShared(ikey, d_SrcKey, batchSize, arrayLength, sortDir, 0, &k_bitonic, ctx); - if (err != GA_NO_ERROR) return err; + err = 
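Each kernel now exists in two signatures: the ARGSORT build appends two (buffer, offset) pairs for the destination and source index arrays, so the argument count and the type table must always switch together, as done here for the bitonic kernel (12 vs. 8 arguments) and below for the merge kernels (15 vs. 11, and 12 vs. 8). The selection boils down to:

    if (msConfig->argSortFlg) { nargs = NUMARGS_MERGE_ARG; types = type_args_merge_arg; }
    else                      { nargs = NUMARGS_MERGE;     types = type_args_merge;     }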
bitonicSortShared(ikey, d_SrcKey, iarg, d_SrcArg, batchSize, arrayLength, + msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx); - for (stride = SHARED_SIZE_LIMIT; stride < Nfloor; stride <<= 1) { - unsigned int lastSegmentElements = Nfloor % (2 * stride); + for (stride = SHARED_SIZE_LIMIT; stride < msConfig->Nfloor; stride <<= 1) { + unsigned int lastSegmentElements = msConfig->Nfloor % (2 * stride); //Find sample ranks and prepare for limiters merge - err = generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, Nfloor, sortDir, &k_ranks, ctx); - if (err != GA_NO_ERROR) return err; + err = generateSampleRanks(msData, ikey, stride, msConfig, &k_ranks, ctx); //Merge ranks and indices - err = mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, Nfloor, sortDir, &k_ranks_idxs, ctx); - if (err != GA_NO_ERROR) return err; + err = mergeRanksAndIndices(msData, stride, msConfig, &k_ranks_idxs, ctx); //Merge elementary intervals - err = mergeElementaryIntervals(okey, ikey, d_LimitsA, d_LimitsB, stride, Nfloor, sortDir, &k_merge, ctx); - if (err != GA_NO_ERROR) return err; + err = mergeElementaryIntervals(okey, ikey, msData, stride, msConfig, &k_merge, ctx); if (lastSegmentElements <= stride) { //Last merge segment consists of a single array which just needs to be passed through - lstCopyOff = okey->offset + ((Nfloor - lastSegmentElements) * typeSize); - err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, lastSegmentElements * typeSize); - if (err != GA_NO_ERROR) return err; + lstCopyOff = okey->offset + ((msConfig->Nfloor - lastSegmentElements) * msConfig->typesize); + err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, + lastSegmentElements * msConfig->typesize); } // Swap pointers t = ikey; @@ -979,120 +1145,158 @@ static int sort( okey = t; } // If the array is not multiple of 1024, sort the remaining and merge - if (Nleft > 0) { - err = bitonicSortShared(d_SrcKey, d_DstKey, 1, Nleft, sortDir, Nfloor, &k_bitonic, ctx); - if (err != GA_NO_ERROR) return err; + if (msConfig->Nleft > 0) { + err = bitonicSortShared(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, 1, msConfig->Nleft, + msConfig->sortDirFlg, msConfig->Nfloor, msConfig->argSortFlg, &k_bitonic, ctx); // Copy the leftMost segment to the output array of which contains the first sorted sequence - lstCopyOff = okey->offset + Nfloor * typeSize; - err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, Nleft * typeSize); - if (err != GA_NO_ERROR) return err; + lstCopyOff = okey->offset + msConfig->Nfloor * msConfig->typesize; + err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, + msConfig->Nleft * msConfig->typesize); - err = mergeGlobalMem(d_SrcKey, d_DstKey, Nfloor, (unsigned int)Nleft, N, sortDir, &k_merge_global, ctx); - if (err != GA_NO_ERROR) return err; + err = mergeGlobalMem(d_SrcKey, d_DstKey, msConfig, &k_merge_global, ctx); err = GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); - if (err != GA_NO_ERROR) return err; } } return err; } +static int initArgSort( + GpuArray *srcArg, + GpuArray *src, + gpucontext *ctx +) +{ + int err = GA_NO_ERROR; + const size_t dims = src->dimensions[0] * sizeof(unsigned long); + + unsigned long *tmp = (unsigned long*) malloc(dims); + unsigned long i; + for (i = 0; i < src->dimensions[0]; ++i) tmp[i] = i; + + err = GpuArray_empty(srcArg, ctx, GA_ULONG, src->nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; + + err = GpuArray_write(srcArg, tmp, dims); + if (err != GA_NO_ERROR) return err; + + free(tmp); 
+ return err; +} + static int initMergeSort( - GpuArray *d_RanksA, - GpuArray *d_RanksB, - GpuArray *d_LimitsA, - GpuArray *d_LimitsB, - unsigned int len, - unsigned int nd, + GpuSortData *msData, + GpuSortConfig *msConfig, + GpuArray *src, + GpuArray *srcArg, gpucontext *ctx ) { int err = GA_NO_ERROR; - const size_t dims = len * sizeof(unsigned int); + const size_t dims = (msConfig->Nfloor / 128) * sizeof(unsigned int); + unsigned int nd = src->nd; - err = GpuArray_empty(d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); + err = GpuArray_empty(&msData->d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; - err = GpuArray_empty(d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); + err = GpuArray_empty(&msData->d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; - err = GpuArray_empty(d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); + err = GpuArray_empty(&msData->d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; + + err = GpuArray_empty(&msData->d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; + + if (msConfig->sortDirFlg) { + initArgSort(srcArg, src, ctx); + } + return err; +} + +static void initMsConfig(GpuSortConfig *msConfig, GpuArray *src, unsigned int sortDir, unsigned int argSort) +{ + msConfig->dims = src->dimensions[0]; + msConfig->Nfloor = roundDown((unsigned int)msConfig->dims, SHARED_SIZE_LIMIT); + msConfig->Nleft = (unsigned int)msConfig->dims - msConfig->Nfloor; + msConfig->sortDirFlg = sortDir; + msConfig->argSortFlg = argSort; + msConfig->typecode = src->typecode; + msConfig->typesize = typesize(src->typecode); +} + +static int initMsBuff(GpuSortBuff *msBuff, GpuArray *src, gpucontext *ctx, unsigned int argSort) +{ + int err = GA_NO_ERROR; - err = GpuArray_empty(d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (err != GA_NO_ERROR) printf("error allocating aux structures %d\n", err); + err = GpuArray_empty(&msBuff->BufKey, ctx, src->typecode, src->nd, src->dimensions, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; + + if (argSort) { + err = GpuArray_empty(&msBuff->BufArg, ctx, GA_ULONG, src->nd, src->dimensions, GA_C_ORDER); + if (err != GA_NO_ERROR) return err; + } return err; } static void destroyMergeSort( - GpuArray *d_RanksA, - GpuArray *d_RanksB, - GpuArray *d_LimitsA, - GpuArray *d_LimitsB, - GpuArray *BufKey + GpuSortData *msData, + GpuSortBuff *msBuff, + GpuArray *srcArg, + unsigned int argSort ) { - GpuArray_clear(d_RanksA); - GpuArray_clear(d_RanksB); - GpuArray_clear(d_LimitsA); - GpuArray_clear(d_LimitsB); - GpuArray_clear(BufKey); + GpuArray_clear(&msData->d_RanksA); + GpuArray_clear(&msData->d_RanksB); + GpuArray_clear(&msData->d_LimitsA); + GpuArray_clear(&msData->d_LimitsB); + GpuArray_clear(&msBuff->BufKey); + if (argSort) { + GpuArray_clear(&msBuff->BufArg); + GpuArray_clear(srcArg); + } } - int GpuArray_sort( GpuArray *dstKey, GpuArray *srcKey, - unsigned int sortDir + unsigned int sortDir, + GpuArray *dstArg ) { int err = GA_NO_ERROR; - - const size_t dims = srcKey->dimensions[0]; - const unsigned int Nfloor = roundDown(dims, SHARED_SIZE_LIMIT); - const int Nleft = dims - Nfloor; - - // Buffer data structure - GpuArray BufKey; - - // Device pointers - auxiiary data structure - 
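initArgSort seeds the index payload with the identity permutation on the host and uploads it, so after the sort dstArg should hold each key's original position. (Two things a follow-up may want to look at: dims here is a byte count, src->dimensions[0] * sizeof(unsigned long), yet it is also passed to GpuArray_empty as the element count, and initMergeSort gates initArgSort on sortDirFlg where argSortFlg looks intended, matching the destroyMergeSort call at the bottom of the file.) For reference, the contract the ARGSORT path should satisfy, modeled host-side with qsort (helper names are made up):

    #include <stdio.h>
    #include <stdlib.h>

    /* dstArg[i] must be the original index of the i-th smallest key. */
    static const int *g_keys;
    static int cmp_idx(const void *pa, const void *pb) {
      unsigned int a = *(const unsigned int *)pa, b = *(const unsigned int *)pb;
      return (g_keys[a] > g_keys[b]) - (g_keys[a] < g_keys[b]);
    }

    int main(void) {
      int keys[5] = {30, 10, 50, 20, 40};
      unsigned int arg[5] = {0, 1, 2, 3, 4};  /* identity, as initArgSort writes */
      int i;
      g_keys = keys;
      qsort(arg, 5, sizeof arg[0], cmp_idx);
      for (i = 0; i < 5; i++) printf("%u ", arg[i]);  /* 1 3 0 4 2 */
      printf("\n");
      return 0;
    }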
GpuArray d_RanksA, d_RanksB, d_LimitsA, d_LimitsB; - gpucontext *ctx = GpuArray_context(srcKey); - if (srcKey->nd > 1) return GA_IMPL_ERROR; - // if (dstArg != NULL || srcArg != NULL) return GA_IMPL_ERROR; - - /* - if (dstArg != NULL || srcArg != NULL) { - err = GpuArray_empty(&BufArg, ctx, GA_UINT, srcKey->nd, &dims, GA_C_ORDER); - }*/ + GpuArray srcArg; + GpuSortConfig msConfig; + GpuSortBuff msBuff; + GpuSortData msData; - err = GpuArray_empty(&BufKey, ctx, srcKey->typecode, srcKey->nd, &dims, GA_C_ORDER); + initMsConfig(&msConfig, srcKey, sortDir, dstArg != NULL ? 1 : 0); - // Auxiliary data structure for MergeSort - err = initMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, Nfloor / 128, srcKey->nd, ctx); + err = initMsBuff(&msBuff, srcKey, ctx, msConfig.argSortFlg); + if (err != GA_NO_ERROR) return err; + err = initMergeSort(&msData, &msConfig, srcKey, &srcArg, ctx); + if (err != GA_NO_ERROR) return err; + + if (srcKey->nd > 1) return GA_IMPL_ERROR; + // perform regular sort err = sort( dstKey, - &BufKey, srcKey, - &d_RanksA, - &d_RanksB, - &d_LimitsA, - &d_LimitsB, - dims, - Nfloor, - Nleft, - sortDir, + dstArg, + &srcArg, + &msBuff, + &msData, + &msConfig, ctx ); - // Destroy auxiliary data structures - destroyMergeSort(&d_RanksA, &d_RanksB, &d_LimitsA, &d_LimitsB, &BufKey); + destroyMergeSort(&msData, &msBuff, &srcArg, msConfig.sortDirFlg); return err; } \ No newline at end of file From eb7d948fac5abf6b1831402c1aed0af9bc9f0c5b Mon Sep 17 00:00:00 2001 From: vcampmany Date: Thu, 27 Jul 2017 17:00:50 +0200 Subject: [PATCH 13/19] argsort --- src/gpuarray/sort.h | 16 +-- src/gpuarray_sort.c | 255 +++++++++++++++++++++++++++++--------------- 2 files changed, 179 insertions(+), 92 deletions(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index ee05803f68..76a5860b95 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -28,13 +28,15 @@ typedef struct _GpuSortData { } GpuSortData; typedef struct _GpuSortConfig { - unsigned int dims; - unsigned int Nfloor; - int Nleft; - unsigned int sortDirFlg; - unsigned int argSortFlg; - int typecode; - size_t typesize; + unsigned int dims; + unsigned int Nfloor; + int Nleft; + unsigned int sortDirFlg; + unsigned int argSortFlg; + int typecodeKey; + size_t typesizeKey; + int typecodeArg; + size_t typesizeArg; } GpuSortConfig; typedef struct _GpuSortBuffers { diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 596c8f4cd3..5c34bb9b32 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "util/strb.h" #include "private.h" @@ -26,6 +27,9 @@ * */ + //#define checkErr(x) checkErrors(x, __FILE__, __LINE__) +#define checkErr(err) if (err != GA_NO_ERROR) return err; + const int flags = GA_USE_CLUDA; static const char *code_helper_funcs = \ @@ -57,6 +61,19 @@ static const char *code_helper_funcs = " return a[pos]; " \ " } " \ " } " \ +"template __device__ T readArray_arg(T *a, unsigned int pos, unsigned int length, unsigned int sortDir){"\ +" if (pos >= length) { " \ +" if (sortDir) { " \ +" return MAX_NUM_ARG; " \ +" } " \ +" else { " \ +" return MIN_NUM_ARG; " \ +" } " \ +" } " \ +" else { " \ +" return a[pos]; " \ +" } " \ +" } " \ "template __device__ void writeArray(T *a, unsigned int pos, T value, unsigned int length) " \ " { " \ " if (pos >= length) " \ @@ -87,10 +104,6 @@ static inline const char *ctype(int typecode) { return gpuarray_get_type(typecode)->cluda_name; } -static inline size_t typesize(int typecode) { - return 
gpuarray_get_type(typecode)->size; -} - static const char *code_bin_search = \ "template __device__ unsigned int binarySearchInclusive(T val, T *data, unsigned int L, " \ " unsigned int stride, unsigned int sortDir){" \ @@ -206,16 +219,16 @@ static const char *code_bitonic_smem = " sortDir " \ " ); " \ "\n#ifdef ARGSORT\n" -" s_arg[threadIdx.x] = readArray( d_SrcArg, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ -" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray( d_SrcArg," \ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2),"\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ +" s_arg[threadIdx.x] = readArray_arg( d_SrcArg, "\ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ +" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray_arg( d_SrcArg," \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2),"\ +" arrayLength * batchSize, "\ +" sortDir "\ +" ); "\ "\n#endif\n" \ " for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) { " \ " unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); " \ @@ -224,13 +237,12 @@ static const char *code_bitonic_smem = " __syncthreads(); " \ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); " \ " t_key t; " \ -" t_arg t2;" \ " if ((s_key[pos] > s_key[pos + stride]) == ddd) { " \ " t = s_key[pos]; " \ " s_key[pos] = s_key[pos + stride]; " \ " s_key[pos + stride] = t; " \ "\n#ifdef ARGSORT\n" \ -" t2 = s_arg[pos];"\ +" t_arg t2 = s_arg[pos];"\ " s_arg[pos] = s_arg[pos + stride];" \ " s_arg[pos + stride] = t2;" \ "\n#endif\n" \ @@ -242,13 +254,12 @@ static const char *code_bitonic_smem = " __syncthreads(); " \ " unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); " \ " t_key t; " \ -" t_arg t2; " \ " if ((s_key[pos] > s_key[pos + stride]) == sortDir) {" \ " t = s_key[pos]; " \ " s_key[pos] = s_key[pos + stride]; " \ " s_key[pos + stride] = t; " \ "\n#ifdef ARGSORT\n" \ -" t2 = s_arg[pos];"\ +" t_arg t2 = s_arg[pos];"\ " s_arg[pos] = s_arg[pos + stride];" \ " s_arg[pos + stride] = t2;" \ "\n#endif\n" \ @@ -313,7 +324,6 @@ static int bitonicSortShared( if (err != GA_NO_ERROR) return err; if (argSortFlg) { - printf("in params\n"); err = GpuKernel_setarg(k_bitonic, p++, d_DstArg->data); if (err != GA_NO_ERROR) return err; @@ -586,7 +596,9 @@ static const char *code_merge = ") " \ "{ " \ " T keyA, keyB; " \ +"\n#ifdef ARGSORT\n" \ " t_arg valA, valB; " \ +"\n#endif\n" \ " unsigned int dstPosA , dstPosB;" \ " if (threadIdx.x < lenA) { " \ " keyA = srcAKey[threadIdx.x]; " \ @@ -598,7 +610,7 @@ static const char *code_merge = " if (threadIdx.x < lenB) { " \ " keyB = srcBKey[threadIdx.x]; " \ "\n#ifdef ARGSORT\n" \ -" valB = srcBVal[threadIdx.x]; " \ +" valB = srcBVal[threadIdx.x]; " \ "\n#endif\n" \ " dstPosB = binarySearchInclusive(keyB, srcAKey, lenA, nPowTwoLenA, sortDir) + threadIdx.x; " \ " } " \ @@ -714,6 +726,8 @@ static const char *code_merge = static int mergeElementaryIntervals( GpuArray *d_DstKey, GpuArray *d_SrcKey, + GpuArray *d_DstArg, + GpuArray *d_SrcArg, GpuSortData *msData, unsigned int stride, GpuSortConfig *msConfig, @@ -745,6 +759,20 @@ static int mergeElementaryIntervals( err = GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset); if (err != GA_NO_ERROR) return err; + if (msConfig->argSortFlg) { + err = GpuKernel_setarg(k_merge, p++, d_DstArg->data); + if (err != GA_NO_ERROR) return err; + + err = 
GpuKernel_setarg(k_merge, p++, &d_DstArg->offset); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_merge, p++, d_SrcArg->data); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_merge, p++, &d_SrcArg->offset); + if (err != GA_NO_ERROR) return err; + } + err = GpuKernel_setarg(k_merge, p++, msData->d_LimitsA.data); if (err != GA_NO_ERROR) return err; @@ -774,10 +802,10 @@ static int mergeElementaryIntervals( #define NUMARGS_MERGE_GLB 8 int type_args_merge_glb[NUMARGS_MERGE_GLB] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, - GA_UINT, GA_UINT, GA_UINT, GA_UINT}; + GA_UINT, GA_UINT, GA_UINT, GA_UINT}; #define NUMARGS_MERGE_GLB_ARG 12 int type_args_merge_glb_arg[NUMARGS_MERGE_GLB_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, - GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; + GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; static const char *code_merge_glb = \ "extern \"C\" __global__ void mergeGlobalMemKernel( " \ @@ -830,6 +858,8 @@ static const char *code_merge_glb = static int mergeGlobalMem( GpuArray *d_DstKey, GpuArray *d_SrcKey, + GpuArray *d_DstArg, + GpuArray *d_SrcArg, GpuSortConfig *msConfig, GpuKernel *k_merge_global, gpucontext *ctx @@ -855,6 +885,20 @@ static int mergeGlobalMem( err = GpuKernel_setarg(k_merge_global, p++, &d_SrcKey->offset); if (err != GA_NO_ERROR) return err; + if (msConfig->argSortFlg) { + err = GpuKernel_setarg(k_merge_global, p++, d_DstArg->data); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_merge_global, p++, &d_DstArg->offset); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_merge_global, p++, d_SrcArg->data); + if (err != GA_NO_ERROR) return err; + + err = GpuKernel_setarg(k_merge_global, p++, &d_SrcArg->offset); + if (err != GA_NO_ERROR) return err; + } + err = GpuKernel_setarg(k_merge_global, p++, &msConfig->Nfloor); if (err != GA_NO_ERROR) return err; @@ -874,17 +918,18 @@ static int mergeGlobalMem( } // Generate type specific GPU code -static int genMergeSortTypeCode(strb *str, int typecode, unsigned int argSort) +static int genMergeSortTypeCode(strb *str, GpuSortConfig *msConfig) { - if (argSort) + if (msConfig->argSortFlg) { strb_appends(str, "\n#define ARGSORT\n"); - + strb_appendf(str, "typedef %s t_arg;\n", ctype(msConfig->typecodeArg)); + strb_appendf(str, "#define MAX_NUM_ARG %u \n#define MIN_NUM_ARG %u \n", (msConfig->typecodeArg==GA_ULONG) ? 
ULONG_MAX : UINT_MAX, 0); + } // Generate typedef for the data type to be sorted - strb_appendf(str, "typedef %s t_key;\n", ctype(typecode)); - strb_appendf(str, "typedef %s t_arg;\n", "ga_uint"); + strb_appendf(str, "typedef %s t_key;\n", ctype(msConfig->typecodeKey)); // Generate macro for MIN and MAX value of a given data type - switch (typecode){ + switch (msConfig->typecodeKey){ case GA_UINT: strb_appendf(str, "#define MAX_NUM %u \n#define MIN_NUM %u \n", UINT_MAX, 0); break; @@ -943,7 +988,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k int *types; strb sb = STRB_STATIC_INIT; - err = genMergeSortTypeCode(&sb, msConfig->typecode, msConfig->argSortFlg); + err = genMergeSortTypeCode(&sb, msConfig); if (err != GA_NO_ERROR) return err; // Compile Bitonic sort Kernel @@ -1059,8 +1104,8 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k codes_merge_glb, lens_merge_glb, "mergeGlobalMemKernel", - NUMARGS_MERGE_GLB, - type_args_merge_glb, + nargs, + types, flags, &err_str ); @@ -1084,14 +1129,13 @@ static int sort( ) { int err = GA_NO_ERROR; - size_t lstCopyOff; + size_t lstCopyOffDst, lstCopyOffSrc; unsigned int stageCount = 0; unsigned int stride; - GpuArray *ikey, *okey, *iarg, *oarg, *t; + GpuArray *ikey, *okey, *iarg, *oarg, *t, *t2; GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, k_merge_global; - err = compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, msConfig); - if (err != GA_NO_ERROR) return err; + checkErr( compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, msConfig) ); for (stride = SHARED_SIZE_LIMIT; stride < msConfig->Nfloor; stride <<= 1, stageCount++); @@ -1111,52 +1155,85 @@ static int sort( // Bitonic sort for short arrays if (msConfig->dims <= SHARED_SIZE_LIMIT) { - err = bitonicSortShared(d_DstKey, d_SrcKey, d_DstArg, d_SrcArg, 1, (unsigned int)msConfig->dims, - msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx); + checkErr( bitonicSortShared(d_DstKey, d_SrcKey, d_DstArg, d_SrcArg, 1, (unsigned int)msConfig->dims, + msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx) + ); } // Merge - Bitonic sort for bigger arrays else { + unsigned int batchSize = msConfig->Nfloor / SHARED_SIZE_LIMIT; unsigned int arrayLength = SHARED_SIZE_LIMIT; - err = bitonicSortShared(ikey, d_SrcKey, iarg, d_SrcArg, batchSize, arrayLength, - msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx); + checkErr( bitonicSortShared(ikey, d_SrcKey, iarg, d_SrcArg, batchSize, arrayLength, + msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx) + ); for (stride = SHARED_SIZE_LIMIT; stride < msConfig->Nfloor; stride <<= 1) { unsigned int lastSegmentElements = msConfig->Nfloor % (2 * stride); //Find sample ranks and prepare for limiters merge - err = generateSampleRanks(msData, ikey, stride, msConfig, &k_ranks, ctx); + checkErr( generateSampleRanks(msData, ikey, stride, msConfig, &k_ranks, ctx) ); //Merge ranks and indices - err = mergeRanksAndIndices(msData, stride, msConfig, &k_ranks_idxs, ctx); + checkErr( mergeRanksAndIndices(msData, stride, msConfig, &k_ranks_idxs, ctx) ); //Merge elementary intervals - err = mergeElementaryIntervals(okey, ikey, msData, stride, msConfig, &k_merge, ctx); + checkErr( mergeElementaryIntervals(okey, ikey, oarg, iarg, msData, stride, msConfig, &k_merge, ctx) ); if (lastSegmentElements <= stride) { //Last merge segment consists of a single array which just needs to be passed through - lstCopyOff = 
okey->offset + ((msConfig->Nfloor - lastSegmentElements) * msConfig->typesize); - err = gpudata_move(okey->data, lstCopyOff, ikey->data, lstCopyOff, - lastSegmentElements * msConfig->typesize); + lstCopyOffDst = okey->offset + ((msConfig->Nfloor - lastSegmentElements) * msConfig->typesizeKey); + lstCopyOffSrc = ikey->offset + ((msConfig->Nfloor - lastSegmentElements) * msConfig->typesizeKey); + checkErr( gpudata_move(okey->data, lstCopyOffDst, ikey->data, lstCopyOffSrc, + lastSegmentElements * msConfig->typesizeKey) + ); + + if (msConfig->argSortFlg) { + lstCopyOffDst = oarg->offset + ((msConfig->Nfloor - lastSegmentElements) * msConfig->typesizeArg); + lstCopyOffSrc = iarg->offset + ((msConfig->Nfloor - lastSegmentElements) * msConfig->typesizeArg); + checkErr( gpudata_move(oarg->data, lstCopyOffDst, iarg->data, lstCopyOffSrc, + lastSegmentElements * msConfig->typesizeArg) + ); + } } // Swap pointers t = ikey; ikey = okey; okey = t; + if (msConfig->argSortFlg) { + t2 = iarg; + iarg = oarg; + oarg = t2; + } } // If the array is not multiple of 1024, sort the remaining and merge if (msConfig->Nleft > 0) { - err = bitonicSortShared(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, 1, msConfig->Nleft, - msConfig->sortDirFlg, msConfig->Nfloor, msConfig->argSortFlg, &k_bitonic, ctx); + + checkErr( bitonicSortShared(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, 1, msConfig->Nleft, + msConfig->sortDirFlg, msConfig->Nfloor, msConfig->argSortFlg, &k_bitonic, ctx) + ); // Copy the leftMost segment to the output array of which contains the first sorted sequence - lstCopyOff = okey->offset + msConfig->Nfloor * msConfig->typesize; - err = gpudata_move(d_DstKey->data, lstCopyOff, d_SrcKey->data, lstCopyOff, - msConfig->Nleft * msConfig->typesize); + lstCopyOffDst = d_DstKey->offset + (msConfig->Nfloor * msConfig->typesizeKey); + lstCopyOffSrc = d_SrcKey->offset + (msConfig->Nfloor * msConfig->typesizeKey); + checkErr( gpudata_move(d_DstKey->data, lstCopyOffDst, d_SrcKey->data, lstCopyOffSrc, + msConfig->Nleft * msConfig->typesizeKey) + ); + + if (msConfig->argSortFlg) { + lstCopyOffDst = d_DstArg->offset + (msConfig->Nfloor * msConfig->typesizeArg); + lstCopyOffSrc = d_SrcArg->offset + (msConfig->Nfloor * msConfig->typesizeArg); + checkErr( gpudata_move(d_DstArg->data, lstCopyOffDst, d_SrcArg->data, lstCopyOffSrc, + msConfig->Nleft * msConfig->typesizeArg) + ); + } + checkErr( mergeGlobalMem(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, msConfig, &k_merge_global, ctx) ); - err = mergeGlobalMem(d_SrcKey, d_DstKey, msConfig, &k_merge_global, ctx); + checkErr( GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER) ); - err = GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER); + if (msConfig->argSortFlg) { + checkErr( GpuArray_copy(d_DstArg, d_SrcArg, GA_C_ORDER) ); + } } } return err; @@ -1165,17 +1242,32 @@ static int sort( static int initArgSort( GpuArray *srcArg, GpuArray *src, + GpuSortConfig *msConfig, gpucontext *ctx ) { int err = GA_NO_ERROR; - const size_t dims = src->dimensions[0] * sizeof(unsigned long); - - unsigned long *tmp = (unsigned long*) malloc(dims); + size_t dims; + void *tmp; + unsigned long *lPtr; + unsigned int *iPtr; unsigned long i; - for (i = 0; i < src->dimensions[0]; ++i) tmp[i] = i; + + size_t typeSize = (msConfig->typecodeArg == GA_ULONG) ? 
sizeof(unsigned long) : sizeof(unsigned int); + dims = src->dimensions[0] * typeSize; + + tmp = malloc(dims); + lPtr = (unsigned long*)tmp; + iPtr = (unsigned int*)tmp; + + for (i = 0; i < src->dimensions[0]; ++i) { + if (msConfig->typecodeArg == GA_ULONG) + lPtr[i] = i; + else + iPtr[i] = (unsigned int)i; + } - err = GpuArray_empty(srcArg, ctx, GA_ULONG, src->nd, &dims, GA_C_ORDER); + err = GpuArray_empty(srcArg, ctx, msConfig->typecodeArg, src->nd, src->dimensions, GA_C_ORDER); if (err != GA_NO_ERROR) return err; err = GpuArray_write(srcArg, tmp, dims); @@ -1194,7 +1286,7 @@ static int initMergeSort( ) { int err = GA_NO_ERROR; - const size_t dims = (msConfig->Nfloor / 128) * sizeof(unsigned int); + const size_t dims = msConfig->Nfloor / 128; unsigned int nd = src->nd; err = GpuArray_empty(&msData->d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); @@ -1209,32 +1301,37 @@ static int initMergeSort( err = GpuArray_empty(&msData->d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER); if (err != GA_NO_ERROR) return err; - if (msConfig->sortDirFlg) { - initArgSort(srcArg, src, ctx); + if (msConfig->argSortFlg) { + initArgSort(srcArg, src, msConfig, ctx); } return err; } -static void initMsConfig(GpuSortConfig *msConfig, GpuArray *src, unsigned int sortDir, unsigned int argSort) +static void initMsConfig(GpuSortConfig *msConfig, GpuArray *src, GpuArray *arg, unsigned int sortDir, unsigned int argSort) { msConfig->dims = src->dimensions[0]; msConfig->Nfloor = roundDown((unsigned int)msConfig->dims, SHARED_SIZE_LIMIT); msConfig->Nleft = (unsigned int)msConfig->dims - msConfig->Nfloor; msConfig->sortDirFlg = sortDir; msConfig->argSortFlg = argSort; - msConfig->typecode = src->typecode; - msConfig->typesize = typesize(src->typecode); + msConfig->typecodeKey = src->typecode; + msConfig->typesizeKey = gpuarray_get_elsize(src->typecode); + if (argSort) { + assert(arg->typecode == GA_UINT || arg->typecode == GA_ULONG); + msConfig->typecodeArg = arg->typecode; + msConfig->typesizeArg = gpuarray_get_elsize(arg->typecode); + } } -static int initMsBuff(GpuSortBuff *msBuff, GpuArray *src, gpucontext *ctx, unsigned int argSort) +static int initMsBuff(GpuSortBuff *msBuff, GpuArray *src, gpucontext *ctx, GpuSortConfig *msConfig) { int err = GA_NO_ERROR; - err = GpuArray_empty(&msBuff->BufKey, ctx, src->typecode, src->nd, src->dimensions, GA_C_ORDER); + err = GpuArray_empty(&msBuff->BufKey, ctx, msConfig->typecodeKey, src->nd, src->dimensions, GA_C_ORDER); if (err != GA_NO_ERROR) return err; - if (argSort) { - err = GpuArray_empty(&msBuff->BufArg, ctx, GA_ULONG, src->nd, src->dimensions, GA_C_ORDER); + if (msConfig->argSortFlg) { + err = GpuArray_empty(&msBuff->BufArg, ctx, msConfig->typecodeArg, src->nd, src->dimensions, GA_C_ORDER); if (err != GA_NO_ERROR) return err; } @@ -1274,27 +1371,15 @@ int GpuArray_sort( GpuSortBuff msBuff; GpuSortData msData; - initMsConfig(&msConfig, srcKey, sortDir, dstArg != NULL ? 1 : 0); + if (srcKey->nd > 1) return GA_IMPL_ERROR; + + initMsConfig(&msConfig, srcKey, dstArg, sortDir, dstArg != NULL ? 
1 : 0); - err = initMsBuff(&msBuff, srcKey, ctx, msConfig.argSortFlg); - if (err != GA_NO_ERROR) return err; + checkErr( initMsBuff(&msBuff, srcKey, ctx, &msConfig) ); - err = initMergeSort(&msData, &msConfig, srcKey, &srcArg, ctx); - if (err != GA_NO_ERROR) return err; - - if (srcKey->nd > 1) return GA_IMPL_ERROR; + checkErr( initMergeSort(&msData, &msConfig, srcKey, &srcArg, ctx) ); - // perform regular sort - err = sort( - dstKey, - srcKey, - dstArg, - &srcArg, - &msBuff, - &msData, - &msConfig, - ctx - ); + checkErr( sort(dstKey, srcKey, dstArg, &srcArg, &msBuff, &msData, &msConfig, ctx) ); destroyMergeSort(&msData, &msBuff, &srcArg, msConfig.sortDirFlg); From 606ad4dfe7a449653f1fcbca603d3dd2c3e0afb0 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Thu, 27 Jul 2017 21:39:17 +0200 Subject: [PATCH 14/19] argsort and minor refactor --- src/gpuarray/sort.h | 43 +++-- src/gpuarray_sort.c | 409 ++++++++++++++++++-------------------------- 2 files changed, 192 insertions(+), 260 deletions(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index 76a5860b95..362211f538 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -6,6 +6,7 @@ #include #include +#include #ifdef __cplusplus @@ -19,31 +20,39 @@ extern "C" { #define SAMPLE_STRIDE 128 typedef struct _GpuSortData { - GpuArray BufKey; - GpuArray BufArg; - GpuArray d_RanksA; - GpuArray d_RanksB; - GpuArray d_LimitsA; - GpuArray d_LimitsB; + GpuArray BufKey; + GpuArray BufArg; + GpuArray d_RanksA; + GpuArray d_RanksB; + GpuArray d_LimitsA; + GpuArray d_LimitsB; } GpuSortData; typedef struct _GpuSortConfig { - unsigned int dims; - unsigned int Nfloor; - int Nleft; - unsigned int sortDirFlg; - unsigned int argSortFlg; - int typecodeKey; - size_t typesizeKey; - int typecodeArg; - size_t typesizeArg; + unsigned int dims; + unsigned int Nfloor; + int Nleft; + unsigned int sortDirFlg; + unsigned int argSortFlg; + int typecodeKey; + size_t typesizeKey; + int typecodeArg; + size_t typesizeArg; } GpuSortConfig; typedef struct _GpuSortBuffers { - GpuArray BufKey; - GpuArray BufArg; + GpuArray BufKey; + GpuArray BufArg; } GpuSortBuff; +typedef struct _GpuSortKernels { + GpuKernel k_bitonic; + GpuKernel k_ranks; + GpuKernel k_ranks_idxs; + GpuKernel k_merge; + GpuKernel k_merge_global; +} GpuSortKernels; + int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg); diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 5c34bb9b32..9fafa68771 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -4,7 +4,6 @@ #include #include -#include #include #include "util/strb.h" @@ -183,13 +182,13 @@ static const char *code_bitonic_smem = " t_key *d_DstKey, " \ " size_t dstOff," \ " t_key *d_SrcKey, " \ -" size_t srcOff,"\ -"\n#ifdef ARGSORT\n" \ -" t_arg *d_DstArg, "\ -" size_t dstArgOff, "\ -" t_arg *d_SrcArg, "\ -" size_t srcArgOff, " \ -"\n#endif\n"\ +" size_t srcOff," \ +"\n#ifdef ARGSORT\n" \ +" t_arg *d_DstArg, " \ +" size_t dstArgOff, " \ +" t_arg *d_SrcArg, " \ +" size_t srcArgOff, " \ +"\n#endif\n" \ " unsigned int batchSize, " \ " unsigned int arrayLength, " \ " unsigned int elemsOff, " \ @@ -198,13 +197,13 @@ static const char *code_bitonic_smem = " { " \ " d_DstKey = (t_key*) (((char*)d_DstKey)+ dstOff);" \ " d_SrcKey = (t_key*) (((char*)d_SrcKey)+ srcOff);" \ -"\n#ifdef ARGSORT\n" \ -" d_DstArg = (t_arg*) (((char*)d_DstArg)+ dstArgOff); "\ -" d_SrcArg = (t_arg*) (((char*)d_SrcArg)+ srcArgOff);"\ -" d_DstArg += elemsOff;"\ -" d_SrcArg += elemsOff;" \ -" __shared__ t_arg s_arg[SHARED_SIZE_LIMIT];" \ 
-"\n#endif\n"\ +"\n#ifdef ARGSORT\n" \ +" d_DstArg = (t_arg*) (((char*)d_DstArg)+ dstArgOff); " \ +" d_SrcArg = (t_arg*) (((char*)d_SrcArg)+ srcArgOff); " \ +" d_DstArg += elemsOff;" \ +" d_SrcArg += elemsOff;" \ +" __shared__ t_arg s_arg[SHARED_SIZE_LIMIT];" \ +"\n#endif\n" \ " d_DstKey += elemsOff;" \ " d_SrcKey += elemsOff;" \ " __shared__ t_key s_key[SHARED_SIZE_LIMIT]; " \ @@ -218,18 +217,18 @@ static const char *code_bitonic_smem = " arrayLength * batchSize, " \ " sortDir " \ " ); " \ -"\n#ifdef ARGSORT\n" -" s_arg[threadIdx.x] = readArray_arg( d_SrcArg, "\ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, "\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ -" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray_arg( d_SrcArg," \ +"\n#ifdef ARGSORT\n" \ +" s_arg[threadIdx.x] = readArray_arg( d_SrcArg, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ +" arrayLength * batchSize, " \ +" sortDir " \ +" ); " \ +" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = readArray_arg( d_SrcArg," \ " blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2),"\ -" arrayLength * batchSize, "\ -" sortDir "\ -" ); "\ -"\n#endif\n" \ +" arrayLength * batchSize, " \ +" sortDir " \ +" ); " \ +"\n#endif\n" \ " for (unsigned int size = 2; size < SHARED_SIZE_LIMIT; size <<= 1) { " \ " unsigned int ddd = sortDir ^ ((threadIdx.x & (size / 2)) != 0); " \ " for (unsigned int stride = size / 2; stride > 0; stride >>= 1) " \ @@ -241,11 +240,11 @@ static const char *code_bitonic_smem = " t = s_key[pos]; " \ " s_key[pos] = s_key[pos + stride]; " \ " s_key[pos + stride] = t; " \ -"\n#ifdef ARGSORT\n" \ -" t_arg t2 = s_arg[pos];"\ -" s_arg[pos] = s_arg[pos + stride];" \ -" s_arg[pos + stride] = t2;" \ -"\n#endif\n" \ +"\n#ifdef ARGSORT\n" \ +" t_arg t2 = s_arg[pos];" \ +" s_arg[pos] = s_arg[pos + stride];" \ +" s_arg[pos + stride] = t2;" \ +"\n#endif\n" \ " } " \ " } " \ " } " \ @@ -258,11 +257,11 @@ static const char *code_bitonic_smem = " t = s_key[pos]; " \ " s_key[pos] = s_key[pos + stride]; " \ " s_key[pos + stride] = t; " \ -"\n#ifdef ARGSORT\n" \ -" t_arg t2 = s_arg[pos];"\ -" s_arg[pos] = s_arg[pos + stride];" \ -" s_arg[pos + stride] = t2;" \ -"\n#endif\n" \ +"\n#ifdef ARGSORT\n" \ +" t_arg t2 = s_arg[pos];" \ +" s_arg[pos] = s_arg[pos + stride];" \ +" s_arg[pos + stride] = t2;" \ +"\n#endif\n" \ " } " \ " } " \ " } " \ @@ -277,18 +276,18 @@ static const char *code_bitonic_smem = " s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], " \ " arrayLength * batchSize " \ " ); " \ -"\n#ifdef ARGSORT\n" \ -" writeArray( d_DstArg, " \ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ -" s_arg[threadIdx.x], " \ -" arrayLength * batchSize " \ -" ); " \ -" writeArray( d_DstArg, " \ -" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), " \ -" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], " \ -" arrayLength * batchSize " \ -" ); " \ -"\n#endif\n "\ +"\n#ifdef ARGSORT\n" \ +" writeArray( d_DstArg, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x, " \ +" s_arg[threadIdx.x], " \ +" arrayLength * batchSize " \ +" ); " \ +" writeArray( d_DstArg, " \ +" blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2), " \ +" s_arg[threadIdx.x + (SHARED_SIZE_LIMIT / 2)], " \ +" arrayLength * batchSize " \ +" ); " \ +"\n#endif\n " \ "}\n"; static int bitonicSortShared( GpuArray *d_DstKey, @@ -311,60 +310,35 @@ static int bitonicSortShared( ls = SHARED_SIZE_LIMIT / 2; gs = batchSize; - err = GpuKernel_setarg(k_bitonic, p++, d_DstKey->data); - if (err != GA_NO_ERROR) return err; 
+ checkErr( GpuKernel_setarg(k_bitonic, p++, d_DstKey->data) ); - err = GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &d_DstKey->offset) ); - err = GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, d_SrcKey->data) ); - err = GpuKernel_setarg(k_bitonic, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &d_SrcKey->offset) ); if (argSortFlg) { - err = GpuKernel_setarg(k_bitonic, p++, d_DstArg->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, d_DstArg->data) ); - err = GpuKernel_setarg(k_bitonic, p++, &d_DstArg->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &d_DstArg->offset) ); - err = GpuKernel_setarg(k_bitonic, p++, d_SrcArg->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, d_SrcArg->data) ); - err = GpuKernel_setarg(k_bitonic, p++, &d_SrcArg->offset); - if (err != GA_NO_ERROR) return err; - } - - err = GpuKernel_setarg(k_bitonic, p++, &batchSize); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &d_SrcArg->offset) ); + + } + checkErr( GpuKernel_setarg(k_bitonic, p++, &batchSize) ); - err = GpuKernel_setarg(k_bitonic, p++, &arrayLength); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &arrayLength) ); - err = GpuKernel_setarg(k_bitonic, p++, &elemsOff); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &elemsOff) ); - err = GpuKernel_setarg(k_bitonic, p++, &sortDir); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_bitonic, p++, &sortDir) ); - err = GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_call(k_bitonic, 1, &gs, &ls, 0, NULL) ); return err; -/* - float *h_dst2 = (float *) malloc ( 16 * sizeof(float)); - err = GpuArray_read(h_dst2, 16 * sizeof(float), d_DstKey); - if (err != GA_NO_ERROR) printf("error reading \n"); - - - int i; - for (i = 0; i < 16; i++) - { - printf("%d afterbitonic %f \n", i, h_dst2[i]); - } - */ } #define NUMARGS_SAMPLE_RANKS 10 @@ -428,7 +402,6 @@ static int generateSampleRanks( unsigned int threadCount = (lastSegmentElements > stride) ? 
(msConfig->Nfloor + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE) : (msConfig->Nfloor - lastSegmentElements) / (2 * SAMPLE_STRIDE); - size_t ls, gs; unsigned int p = 0; int err = GA_NO_ERROR; @@ -436,39 +409,28 @@ static int generateSampleRanks( ls = 256; gs = iDivUp(threadCount, 256); - err = GpuKernel_setarg(k_ranks, p++, msData->d_RanksA.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, msData->d_RanksA.data) ); - err = GpuKernel_setarg(k_ranks, p++, &msData->d_RanksA.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &msData->d_RanksA.offset) ); - err = GpuKernel_setarg(k_ranks, p++, msData->d_RanksB.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, msData->d_RanksB.data) ); - err = GpuKernel_setarg(k_ranks, p++, &msData->d_RanksB.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &msData->d_RanksB.offset) ); - err = GpuKernel_setarg(k_ranks, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, d_SrcKey->data) ); - err = GpuKernel_setarg(k_ranks, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &d_SrcKey->offset) ); - err = GpuKernel_setarg(k_ranks, p++, &stride); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &stride) ); - err = GpuKernel_setarg(k_ranks, p++, &msConfig->Nfloor); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &msConfig->Nfloor) ); - err = GpuKernel_setarg(k_ranks, p++, &threadCount); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &threadCount) ); - err = GpuKernel_setarg(k_ranks, p++, &msConfig->sortDirFlg); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks, p++, &msConfig->sortDirFlg) ); + + checkErr( GpuKernel_call(k_ranks, 1, &gs, &ls, 0, NULL) ); - err = GpuKernel_call(k_ranks, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) return err; - return err; } @@ -528,56 +490,43 @@ static int mergeRanksAndIndices( ls = 256U; gs = iDivUp(threadCount, 256U); - err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_LimitsA.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, msData->d_LimitsA.data) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_LimitsA.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_LimitsA.offset) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_RanksA.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, msData->d_RanksA.data) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_RanksA.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_RanksA.offset) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &stride); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &stride) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &msConfig->Nfloor); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &msConfig->Nfloor) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &threadCount); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &threadCount) ); - err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) return err; + checkErr( 
GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL) ); p = 0; - err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_LimitsB.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, msData->d_LimitsB.data) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_LimitsB.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_LimitsB.offset) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, msData->d_RanksB.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, msData->d_RanksB.data) ); - err = GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_RanksB.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_ranks_idxs, p++, &msData->d_RanksB.offset) ); - err = GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_call(k_ranks_idxs, 1, &gs, &ls, 0, NULL) ); return err; } #define NUMARGS_MERGE 11 int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, - GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; + GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; #define NUMARGS_MERGE_ARG 15 int type_args_merge_arg[NUMARGS_MERGE_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, - GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; + GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; static const char *code_merge = \ " template __device__ void merge( " \ " T *dstKey, " \ @@ -747,55 +696,38 @@ static int mergeElementaryIntervals( ls = SAMPLE_STRIDE; gs = mergePairs; - err = GpuKernel_setarg(k_merge, p++, d_DstKey->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, d_DstKey->data) ); - err = GpuKernel_setarg(k_merge, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &d_DstKey->offset) ); - err = GpuKernel_setarg(k_merge, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, d_SrcKey->data) ); - err = GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &d_SrcKey->offset) ); if (msConfig->argSortFlg) { - err = GpuKernel_setarg(k_merge, p++, d_DstArg->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, d_DstArg->data) ); - err = GpuKernel_setarg(k_merge, p++, &d_DstArg->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &d_DstArg->offset) ); - err = GpuKernel_setarg(k_merge, p++, d_SrcArg->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, d_SrcArg->data) ); - err = GpuKernel_setarg(k_merge, p++, &d_SrcArg->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &d_SrcArg->offset) ); } + checkErr( GpuKernel_setarg(k_merge, p++, msData->d_LimitsA.data) ); - err = GpuKernel_setarg(k_merge, p++, msData->d_LimitsA.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &msData->d_LimitsA.offset) ); - err = GpuKernel_setarg(k_merge, p++, &msData->d_LimitsA.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, msData->d_LimitsB.data) ); - err = GpuKernel_setarg(k_merge, p++, msData->d_LimitsB.data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, 
&msData->d_LimitsB.offset) ); - err = GpuKernel_setarg(k_merge, p++, &msData->d_LimitsB.offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &stride) ); - err = GpuKernel_setarg(k_merge, p++, &stride); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &msConfig->Nfloor) ); - err = GpuKernel_setarg(k_merge, p++, &msConfig->Nfloor); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge, p++, &msConfig->sortDirFlg) ); - err = GpuKernel_setarg(k_merge, p++, &msConfig->sortDirFlg); - if (err != GA_NO_ERROR) return err; - - err = GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_call(k_merge, 1, &gs, &ls, 0, NULL) ); return err; } @@ -873,46 +805,32 @@ static int mergeGlobalMem( ls = 256; gs = iDivUp(msConfig->dims, (unsigned int)ls); - err = GpuKernel_setarg(k_merge_global, p++, d_DstKey->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, d_DstKey->data) ); - err = GpuKernel_setarg(k_merge_global, p++, &d_DstKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &d_DstKey->offset) ); - err = GpuKernel_setarg(k_merge_global, p++, d_SrcKey->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, d_SrcKey->data) ); - err = GpuKernel_setarg(k_merge_global, p++, &d_SrcKey->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &d_SrcKey->offset) ); if (msConfig->argSortFlg) { - err = GpuKernel_setarg(k_merge_global, p++, d_DstArg->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, d_DstArg->data) ); - err = GpuKernel_setarg(k_merge_global, p++, &d_DstArg->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &d_DstArg->offset) ); - err = GpuKernel_setarg(k_merge_global, p++, d_SrcArg->data); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, d_SrcArg->data) ); - err = GpuKernel_setarg(k_merge_global, p++, &d_SrcArg->offset); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &d_SrcArg->offset) ); } + checkErr( GpuKernel_setarg(k_merge_global, p++, &msConfig->Nfloor) ); - err = GpuKernel_setarg(k_merge_global, p++, &msConfig->Nfloor); - if (err != GA_NO_ERROR) return err; - - err = GpuKernel_setarg(k_merge_global, p++, &NleftC); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &NleftC) ); - err = GpuKernel_setarg(k_merge_global, p++, &msConfig->dims); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &msConfig->dims) ); - err = GpuKernel_setarg(k_merge_global, p++, &msConfig->sortDirFlg); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_setarg(k_merge_global, p++, &msConfig->sortDirFlg) ); - err = GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL); - if (err != GA_NO_ERROR) return err; + checkErr( GpuKernel_call(k_merge_global, 1, &gs, &ls, 0, NULL) ); return err; } @@ -966,8 +884,7 @@ static int genMergeSortTypeCode(strb *str, GpuSortConfig *msConfig) #define NSTRINGS_RKS_IDX 4 #define NSTRINGS_MERGE 4 #define NSTRINGS_MERGE_GLB 4 -static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k_ranks_idxs, GpuKernel *k_merge, - GpuKernel *k_merge_global, gpucontext *ctx, GpuSortConfig *msConfig) +static int 
compileKernels(GpuSortKernels *msKernels, gpucontext *ctx, GpuSortConfig *msConfig) { char *err_str = NULL; int err = GA_NO_ERROR; @@ -988,8 +905,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k int *types; strb sb = STRB_STATIC_INIT; - err = genMergeSortTypeCode(&sb, msConfig); - if (err != GA_NO_ERROR) return err; + checkErr( genMergeSortTypeCode(&sb, msConfig) ); // Compile Bitonic sort Kernel lens_bitonic[0] = sb.l; @@ -1002,7 +918,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k nargs = NUMARGS_BITONIC_KERNEL; types = type_args_bitonic; } - err = GpuKernel_init( k_bitonic, + err = GpuKernel_init( &msKernels->k_bitonic, ctx, NSTR_BITONIC, codes_bitonic, @@ -1022,7 +938,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k // Compile ranks kernel lens_ranks[0] = sb.l; codes_ranks[0] = sb.s; - err = GpuKernel_init( k_ranks, + err = GpuKernel_init( &msKernels->k_ranks, ctx, NSTR_RANKS, codes_ranks, @@ -1042,7 +958,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k // Compile ranks and idxs kernel lens_rks_idx[0] = sb.l; codes_rks_idx[0] = sb.s; - err = GpuKernel_init( k_ranks_idxs, + err = GpuKernel_init( &msKernels->k_ranks_idxs, ctx, NSTRINGS_RKS_IDX, codes_rks_idx, @@ -1070,7 +986,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k // Compile merge kernel lens_merge[0] = sb.l; codes_merge[0] = sb.s; - err = GpuKernel_init( k_merge, + err = GpuKernel_init( &msKernels->k_merge, ctx, NSTRINGS_MERGE, codes_merge, @@ -1098,7 +1014,7 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k // Compile merge global kernel lens_merge_glb[0] = sb.l; codes_merge_glb[0] = sb.s; - err = GpuKernel_init( k_merge_global, + err = GpuKernel_init( &msKernels->k_merge_global, ctx, NSTRINGS_MERGE_GLB, codes_merge_glb, @@ -1117,6 +1033,19 @@ static int compileKernels(GpuKernel *k_bitonic, GpuKernel *k_ranks, GpuKernel *k return err; } +static int copysrc2dst(GpuArray *dstKey, GpuArray *srcKey, GpuArray *dstArg, GpuArray *srcArg, unsigned int argSortFlg, int Nleft) +{ + int err = GA_NO_ERROR; + + if (Nleft > 0) { + checkErr( GpuArray_copy(dstKey, srcKey, GA_C_ORDER) ); + if (argSortFlg) { + checkErr( GpuArray_copy(dstArg, srcArg, GA_C_ORDER) ); + } + } + return err; +} + static int sort( GpuArray *d_DstKey, GpuArray *d_SrcKey, @@ -1132,10 +1061,12 @@ static int sort( size_t lstCopyOffDst, lstCopyOffSrc; unsigned int stageCount = 0; unsigned int stride; + unsigned int batchSize; + unsigned int arrayLength; GpuArray *ikey, *okey, *iarg, *oarg, *t, *t2; - GpuKernel k_bitonic, k_ranks, k_ranks_idxs, k_merge, k_merge_global; - checkErr( compileKernels(&k_bitonic, &k_ranks, &k_ranks_idxs, &k_merge, &k_merge_global, ctx, msConfig) ); + GpuSortKernels msKernels; + checkErr( compileKernels(&msKernels, ctx, msConfig) ); for (stride = SHARED_SIZE_LIMIT; stride < msConfig->Nfloor; stride <<= 1, stageCount++); @@ -1156,29 +1087,30 @@ static int sort( if (msConfig->dims <= SHARED_SIZE_LIMIT) { checkErr( bitonicSortShared(d_DstKey, d_SrcKey, d_DstArg, d_SrcArg, 1, (unsigned int)msConfig->dims, - msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx) + msConfig->sortDirFlg, 0, msConfig->argSortFlg, &msKernels.k_bitonic, ctx) ); } // Merge - Bitonic sort for bigger arrays else { + checkErr( copysrc2dst(d_DstKey, d_SrcKey, d_DstArg, d_SrcArg, msConfig->argSortFlg, msConfig->Nleft) ); - unsigned int batchSize = 
msConfig->Nfloor / SHARED_SIZE_LIMIT; - unsigned int arrayLength = SHARED_SIZE_LIMIT; + batchSize = msConfig->Nfloor / SHARED_SIZE_LIMIT; + arrayLength = SHARED_SIZE_LIMIT; checkErr( bitonicSortShared(ikey, d_SrcKey, iarg, d_SrcArg, batchSize, arrayLength, - msConfig->sortDirFlg, 0, msConfig->argSortFlg, &k_bitonic, ctx) + msConfig->sortDirFlg, 0, msConfig->argSortFlg, &msKernels.k_bitonic, ctx) ); for (stride = SHARED_SIZE_LIMIT; stride < msConfig->Nfloor; stride <<= 1) { unsigned int lastSegmentElements = msConfig->Nfloor % (2 * stride); //Find sample ranks and prepare for limiters merge - checkErr( generateSampleRanks(msData, ikey, stride, msConfig, &k_ranks, ctx) ); + checkErr( generateSampleRanks(msData, ikey, stride, msConfig, &msKernels.k_ranks, ctx) ); //Merge ranks and indices - checkErr( mergeRanksAndIndices(msData, stride, msConfig, &k_ranks_idxs, ctx) ); + checkErr( mergeRanksAndIndices(msData, stride, msConfig, &msKernels.k_ranks_idxs, ctx) ); //Merge elementary intervals - checkErr( mergeElementaryIntervals(okey, ikey, oarg, iarg, msData, stride, msConfig, &k_merge, ctx) ); + checkErr( mergeElementaryIntervals(okey, ikey, oarg, iarg, msData, stride, msConfig, &msKernels.k_merge, ctx) ); if (lastSegmentElements <= stride) { //Last merge segment consists of a single array which just needs to be passed through @@ -1195,8 +1127,7 @@ static int sort( lastSegmentElements * msConfig->typesizeArg) ); } - } - // Swap pointers + } t = ikey; ikey = okey; okey = t; @@ -1204,13 +1135,13 @@ static int sort( t2 = iarg; iarg = oarg; oarg = t2; - } + } } // If the array is not multiple of 1024, sort the remaining and merge if (msConfig->Nleft > 0) { - checkErr( bitonicSortShared(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, 1, msConfig->Nleft, - msConfig->sortDirFlg, msConfig->Nfloor, msConfig->argSortFlg, &k_bitonic, ctx) + checkErr( bitonicSortShared(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, 1, msConfig->Nleft, msConfig->sortDirFlg, + msConfig->Nfloor, msConfig->argSortFlg, &msKernels.k_bitonic, ctx) ); // Copy the leftMost segment to the output array of which contains the first sorted sequence @@ -1219,7 +1150,6 @@ static int sort( checkErr( gpudata_move(d_DstKey->data, lstCopyOffDst, d_SrcKey->data, lstCopyOffSrc, msConfig->Nleft * msConfig->typesizeKey) ); - if (msConfig->argSortFlg) { lstCopyOffDst = d_DstArg->offset + (msConfig->Nfloor * msConfig->typesizeArg); lstCopyOffSrc = d_SrcArg->offset + (msConfig->Nfloor * msConfig->typesizeArg); @@ -1227,7 +1157,7 @@ static int sort( msConfig->Nleft * msConfig->typesizeArg) ); } - checkErr( mergeGlobalMem(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, msConfig, &k_merge_global, ctx) ); + checkErr( mergeGlobalMem(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, msConfig, &msKernels.k_merge_global, ctx) ); checkErr( GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER) ); @@ -1267,11 +1197,9 @@ static int initArgSort( iPtr[i] = (unsigned int)i; } - err = GpuArray_empty(srcArg, ctx, msConfig->typecodeArg, src->nd, src->dimensions, GA_C_ORDER); - if (err != GA_NO_ERROR) return err; + checkErr( GpuArray_empty(srcArg, ctx, msConfig->typecodeArg, src->nd, src->dimensions, GA_C_ORDER) ); - err = GpuArray_write(srcArg, tmp, dims); - if (err != GA_NO_ERROR) return err; + checkErr( GpuArray_write(srcArg, tmp, dims) ); free(tmp); return err; @@ -1289,20 +1217,16 @@ static int initMergeSort( const size_t dims = msConfig->Nfloor / 128; unsigned int nd = src->nd; - err = GpuArray_empty(&msData->d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER); - if (err != GA_NO_ERROR) return err; + 
checkErr( GpuArray_empty(&msData->d_RanksA, ctx, GA_UINT, nd, &dims, GA_C_ORDER) );
 
-  err = GpuArray_empty(&msData->d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER);
-  if (err != GA_NO_ERROR) return err;
+  checkErr( GpuArray_empty(&msData->d_RanksB, ctx, GA_UINT, nd, &dims, GA_C_ORDER) );
 
-  err = GpuArray_empty(&msData->d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER);
-  if (err != GA_NO_ERROR) return err;
+  checkErr( GpuArray_empty(&msData->d_LimitsA, ctx, GA_UINT, nd, &dims, GA_C_ORDER) );
 
-  err = GpuArray_empty(&msData->d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER);
-  if (err != GA_NO_ERROR) return err;
+  checkErr( GpuArray_empty(&msData->d_LimitsB, ctx, GA_UINT, nd, &dims, GA_C_ORDER) );
 
   if (msConfig->argSortFlg) {
-    initArgSort(srcArg, src, msConfig, ctx);
+    checkErr( initArgSort(srcArg, src, msConfig, ctx) );
   }
   return err;
 }
@@ -1327,12 +1251,10 @@ static int initMsBuff(GpuSortBuff *msBuff, GpuArray *src, gpucontext *ctx, GpuSo
 {
   int err = GA_NO_ERROR;
 
-  err = GpuArray_empty(&msBuff->BufKey, ctx, msConfig->typecodeKey, src->nd, src->dimensions, GA_C_ORDER);
-  if (err != GA_NO_ERROR) return err;
+  checkErr( GpuArray_empty(&msBuff->BufKey, ctx, msConfig->typecodeKey, src->nd, src->dimensions, GA_C_ORDER) );
 
   if (msConfig->argSortFlg) {
-    err = GpuArray_empty(&msBuff->BufArg, ctx, msConfig->typecodeArg, src->nd, src->dimensions, GA_C_ORDER);
-    if (err != GA_NO_ERROR) return err;
+    checkErr( GpuArray_empty(&msBuff->BufArg, ctx, msConfig->typecodeArg, src->nd, src->dimensions, GA_C_ORDER) );
   }
 
   return err;
@@ -1356,6 +1278,7 @@ static void destroyMergeSort(
   }
 }
 
+
 int GpuArray_sort(
     GpuArray *dstKey,
     GpuArray *srcKey,
@@ -1381,7 +1304,7 @@ int GpuArray_sort(
 
   checkErr( sort(dstKey, srcKey, dstArg, &srcArg, &msBuff, &msData, &msConfig, ctx) );
 
-  destroyMergeSort(&msData, &msBuff, &srcArg, msConfig.sortDirFlg);
+  destroyMergeSort(&msData, &msBuff, &srcArg, msConfig.argSortFlg);
 
   return err;
 }
\ No newline at end of file
From f89d56f0ddeda6924c3037c3f3e6dbc0d6d8fbf0 Mon Sep 17 00:00:00 2001
From: vcampmany
Date: Tue, 1 Aug 2017 15:35:40 +0200
Subject: [PATCH 15/19] sort internal dtypes hidden and copyright added

---
 src/gpuarray/sort.h | 39 +------------------------------------
 src/gpuarray_sort.c | 24 +++++++++++++++++------
 src/private_sort.h  | 47 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 44 deletions(-)
 create mode 100644 src/private_sort.h

diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h
index 362211f538..56efe335ee 100644
--- a/src/gpuarray/sort.h
+++ b/src/gpuarray/sort.h
@@ -16,43 +16,6 @@ extern "C" {
 }
 #endif
 
-#define SHARED_SIZE_LIMIT 1024U
-#define SAMPLE_STRIDE 128
-
-typedef struct _GpuSortData {
-  GpuArray     BufKey;
-  GpuArray     BufArg;
-  GpuArray     d_RanksA;
-  GpuArray     d_RanksB;
-  GpuArray     d_LimitsA;
-  GpuArray     d_LimitsB;
-} GpuSortData;
-
-typedef struct _GpuSortConfig {
-  unsigned int dims;
-  unsigned int Nfloor;
-  int          Nleft;
-  unsigned int sortDirFlg;
-  unsigned int argSortFlg;
-  int          typecodeKey;
-  size_t       typesizeKey;
-  int          typecodeArg;
-  size_t       typesizeArg;
-} GpuSortConfig;
-
-typedef struct _GpuSortBuffers {
-  GpuArray     BufKey;
-  GpuArray     BufArg;
-} GpuSortBuff;
-
-typedef struct _GpuSortKernels {
-  GpuKernel k_bitonic;
-  GpuKernel k_ranks;
-  GpuKernel k_ranks_idxs;
-  GpuKernel k_merge;
-  GpuKernel k_merge_global;
-} GpuSortKernels;
-
 int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg);
 
 
@@ -61,4 +24,4 @@ int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstA
 }
 #endif
 
-#endif \ No
newline at end of file +#endif diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 9fafa68771..0481810f8d 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -8,6 +8,7 @@ #include "util/strb.h" #include "private.h" +#include "private_sort.h" /* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. @@ -20,17 +21,16 @@ * * This software contains source code provided by NVIDIA Corporation. * - * Read more at: http://docs.nvidia.com/cuda/eula/index.html#ixzz4lUbgXjsr - * Follow us: @GPUComputing on Twitter | NVIDIA on Facebook - * - * */ - //#define checkErr(x) checkErrors(x, __FILE__, __LINE__) #define checkErr(err) if (err != GA_NO_ERROR) return err; const int flags = GA_USE_CLUDA; +/* + * Functions iDivUp, getSampleCount and nextPowerOfTwo taken from + * Merge Sort implementation in NVIDIA CUDA 8.0 Samples + */ static const char *code_helper_funcs = \ "\n#define SAMPLE_STRIDE 128 \n" \ "\n#define SHARED_SIZE_LIMIT 1024U \n" \ @@ -103,6 +103,10 @@ static inline const char *ctype(int typecode) { return gpuarray_get_type(typecode)->cluda_name; } +/* + * Functions binarySearchInclusive and binarySearchExclusive taken + * from Merge Sort implementation in NVIDIA CUDA 8.0 Samples + */ static const char *code_bin_search = \ "template __device__ unsigned int binarySearchInclusive(T val, T *data, unsigned int L, " \ " unsigned int stride, unsigned int sortDir){" \ @@ -177,6 +181,8 @@ int type_args_bitonic[NUMARGS_BITONIC_KERNEL] = {GA_BUFFER, GA_SIZE, GA_BUFFER, #define NUMARGS_BITONIC_KERNEL_ARG 12 int type_args_bitonic_arg[NUMARGS_BITONIC_KERNEL_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; + +/* Code based on Bitonic Sort implementation in NVIDIA CUDA 8.0 Samples */ static const char *code_bitonic_smem = \ " extern \"C\" __global__ void bitonicSortSharedKernel( " \ " t_key *d_DstKey, " \ @@ -344,6 +350,8 @@ static int bitonicSortShared( #define NUMARGS_SAMPLE_RANKS 10 const int type_args_ranks[NUMARGS_SAMPLE_RANKS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT, GA_UINT}; + +/* Code taken from Merge Sort implementation in NVIDIA CUDA 8.0 Samples */ static const char *code_sample_ranks = \ "extern \"C\" __global__ void generateSampleRanksKernel(" \ " unsigned int *d_RanksA," \ @@ -436,6 +444,8 @@ static int generateSampleRanks( #define NUMARGS_RANKS_IDXS 7 const int type_args_ranks_idxs[NUMARGS_RANKS_IDXS] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; + +/* Code taken from Merge Sort implementation in NVIDIA CUDA 8.0 Samples */ static const char *code_ranks_idxs = \ "extern \"C\" __global__ void mergeRanksAndIndicesKernel( " \ " unsigned int *d_Limits, " \ @@ -527,6 +537,8 @@ int type_args_merge[NUMARGS_MERGE] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA #define NUMARGS_MERGE_ARG 15 int type_args_merge_arg[NUMARGS_MERGE_ARG] = {GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_BUFFER, GA_SIZE, GA_UINT, GA_UINT, GA_UINT}; + +/* Code based on Merge Sort implementation in NVIDIA CUDA 8.0 Samples */ static const char *code_merge = \ " template __device__ void merge( " \ " T *dstKey, " \ @@ -1307,4 +1319,4 @@ int GpuArray_sort( destroyMergeSort(&msData, &msBuff, &srcArg, msConfig.argSortFlg); return err; -} \ No newline at end of file +} diff --git a/src/private_sort.h b/src/private_sort.h new file mode 100644 index 0000000000..982afd357d --- /dev/null +++ b/src/private_sort.h 
@@ -0,0 +1,47 @@ +#ifndef PRIVATE_SORT_H +#define PRIVATE_SORT_H + +#define SHARED_SIZE_LIMIT 1024U +#define SAMPLE_STRIDE 128 + +typedef struct _GpuSortData { + + GpuArray d_RanksA; + GpuArray d_RanksB; + GpuArray d_LimitsA; + GpuArray d_LimitsB; + +} GpuSortData; + +typedef struct _GpuSortConfig { + + unsigned int dims; + unsigned int Nfloor; + int Nleft; + unsigned int sortDirFlg; + unsigned int argSortFlg; + int typecodeKey; + size_t typesizeKey; + int typecodeArg; + size_t typesizeArg; + +} GpuSortConfig; + +typedef struct _GpuSortBuffers { + + GpuArray BufKey; + GpuArray BufArg; + +} GpuSortBuff; + +typedef struct _GpuSortKernels { + + GpuKernel k_bitonic; + GpuKernel k_ranks; + GpuKernel k_ranks_idxs; + GpuKernel k_merge; + GpuKernel k_merge_global; + +} GpuSortKernels; + +#endif From 06e5406e4ff2dbbeab3f7a7918b620fc34ad42d4 Mon Sep 17 00:00:00 2001 From: vcampmany Date: Tue, 1 Aug 2017 20:38:50 +0200 Subject: [PATCH 16/19] EULA deleted --- src/gpuarray_sort.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 0481810f8d..928fbf43c7 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -13,12 +13,6 @@ /* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * * This software contains source code provided by NVIDIA Corporation. * */ From f9b0a479c362dd2e874a646382c16da7eff31b97 Mon Sep 17 00:00:00 2001 From: Victor Campmany Date: Thu, 10 Aug 2017 17:07:22 -0400 Subject: [PATCH 17/19] sort header fix --- src/gpuarray/sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray/sort.h b/src/gpuarray/sort.h index 56efe335ee..bd81cf9cbb 100644 --- a/src/gpuarray/sort.h +++ b/src/gpuarray/sort.h @@ -17,7 +17,7 @@ extern "C" { #endif -int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg); +GPUARRAY_PUBLIC int GpuArray_sort(GpuArray *r, GpuArray *a, unsigned int sortDir, GpuArray *dstArg); #ifdef __cplusplus From 90e7035c7fb1a1fcbfab2e138cecb9a734ae87a7 Mon Sep 17 00:00:00 2001 From: Victor Campmany Date: Mon, 28 Aug 2017 16:36:40 -0400 Subject: [PATCH 18/19] memory leak fix --- src/gpuarray_sort.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 928fbf43c7..9976d76f30 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -1044,9 +1044,9 @@ static int copysrc2dst(GpuArray *dstKey, GpuArray *srcKey, GpuArray *dstArg, Gpu int err = GA_NO_ERROR; if (Nleft > 0) { - checkErr( GpuArray_copy(dstKey, srcKey, GA_C_ORDER) ); + checkErr( GpuArray_move(dstKey, srcKey) ); if (argSortFlg) { - checkErr( GpuArray_copy(dstArg, srcArg, GA_C_ORDER) ); + checkErr( GpuArray_move(dstArg, srcArg) ); } } return err; @@ -1165,13 +1165,20 @@ static int sort( } checkErr( mergeGlobalMem(d_SrcKey, d_DstKey, d_SrcArg, d_DstArg, msConfig, &msKernels.k_merge_global, ctx) ); - checkErr( GpuArray_copy(d_DstKey, d_SrcKey, GA_C_ORDER) ); + checkErr( GpuArray_move(d_DstKey, d_SrcKey) ); if (msConfig->argSortFlg) { - checkErr( GpuArray_copy(d_DstArg, d_SrcArg, GA_C_ORDER) ); + checkErr( GpuArray_move(d_DstArg, d_SrcArg) ); } } } + + GpuKernel_clear(&msKernels.k_bitonic); + 
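/*
 * Context for this hunk: compileKernels calls GpuKernel_init on all
 * five kernels for every GpuArray_sort invocation, and before this
 * patch nothing released them, so each sort leaked the compiled
 * kernels.  The same cleanup, sketched as a single helper over the
 * GpuSortKernels struct from private_sort.h (a hypothetical
 * refactoring of the surrounding lines):
 */
static void clearSortKernels(GpuSortKernels *k) {
  GpuKernel_clear(&k->k_bitonic);   /* each pairs with a GpuKernel_init */
  GpuKernel_clear(&k->k_ranks);
  GpuKernel_clear(&k->k_ranks_idxs);
  GpuKernel_clear(&k->k_merge);
  GpuKernel_clear(&k->k_merge_global);
}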
GpuKernel_clear(&msKernels.k_ranks); + GpuKernel_clear(&msKernels.k_ranks_idxs); + GpuKernel_clear(&msKernels.k_merge); + GpuKernel_clear(&msKernels.k_merge_global); + return err; } @@ -1300,7 +1307,7 @@ int GpuArray_sort( GpuSortBuff msBuff; GpuSortData msData; - if (srcKey->nd > 1) return GA_IMPL_ERROR; + if (srcKey->nd > 1) return error_set(ctx->err, GA_IMPL_ERROR, "Only 1 dim supported"); initMsConfig(&msConfig, srcKey, dstArg, sortDir, dstArg != NULL ? 1 : 0); From 5687f64ab1de2bd9de653d83611491121e119047 Mon Sep 17 00:00:00 2001 From: Victor Campmany Date: Mon, 28 Aug 2017 18:16:33 -0400 Subject: [PATCH 19/19] cluda flag removed --- src/gpuarray_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_sort.c b/src/gpuarray_sort.c index 9976d76f30..abdb0b1392 100644 --- a/src/gpuarray_sort.c +++ b/src/gpuarray_sort.c @@ -19,7 +19,7 @@ #define checkErr(err) if (err != GA_NO_ERROR) return err; -const int flags = GA_USE_CLUDA; +const int flags = 0; /* * Functions iDivUp, getSampleCount and nextPowerOfTwo taken from
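/*
 * Where the series ends up: a usage sketch of the public entry point,
 * GPUARRAY_PUBLIC int GpuArray_sort(GpuArray *r, GpuArray *a,
 * unsigned int sortDir, GpuArray *dstArg).  Keys must be 1-D; pass
 * dstArg = NULL for a plain sort, or a caller-allocated GA_UINT or
 * GA_ULONG array of the same length to also receive the sorting
 * permutation.  The include paths and the helper name are assumptions
 * for illustration; error handling is abbreviated.
 */
#include <gpuarray/array.h>
#include <gpuarray/sort.h>

static int sort_with_indices(gpucontext *ctx, GpuArray *keys, size_t n,
                             GpuArray *sorted, GpuArray *indices) {
  int err;

  err = GpuArray_empty(sorted, ctx, keys->typecode, 1, &n, GA_C_ORDER);
  if (err != GA_NO_ERROR)
    return err;

  err = GpuArray_empty(indices, ctx, GA_ULONG, 1, &n, GA_C_ORDER);
  if (err != GA_NO_ERROR) {
    GpuArray_clear(sorted);
    return err;
  }

  /* sortDir = 1 sorts ascending in this implementation: the kernels pad
   * out-of-range slots with the key type's maximum when sortDir != 0. */
  err = GpuArray_sort(sorted, keys, 1, indices);
  if (err != GA_NO_ERROR) {
    GpuArray_clear(sorted);
    GpuArray_clear(indices);
  }
  return err;   /* on success the caller clears sorted and indices */
}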