From ba73ae744687e21ad6f4d6c9dc938b780d03a778 Mon Sep 17 00:00:00 2001 From: jintaoluo Date: Fri, 16 May 2014 18:06:03 -0400 Subject: [PATCH 1/5] GPUed accelseach based on PRESTO2 --- include/accelsearch_cmd.h | 5 + src/accel_utils.c | 332 ++++++++++++++++- src/accel_utils_gpu.cu | 762 ++++++++++++++++++++++++++++++++++++++ src/accelsearch.c | 309 +++++++++++++++- src/accelsearch_cmd.c | 16 + src/corr_routines.c | 301 +++++++++++++++ 6 files changed, 1723 insertions(+), 2 deletions(-) create mode 100644 src/accel_utils_gpu.cu diff --git a/include/accelsearch_cmd.h b/include/accelsearch_cmd.h index 0897fc7a5..5df8c554a 100644 --- a/include/accelsearch_cmd.h +++ b/include/accelsearch_cmd.h @@ -72,6 +72,11 @@ typedef struct s_Cmdline { /*@null*/char **argv; /***** the whole command line concatenated */ char *full_cmd_line; + /***** cuda device parameters */ + #ifdef USECUDA + char cudaP; + char cuda; + #endif } Cmdline; diff --git a/src/accel_utils.c b/src/accel_utils.c index eebdbd42b..eabf32e4e 100644 --- a/src/accel_utils.c +++ b/src/accel_utils.c @@ -22,6 +22,44 @@ /* Return 2**n */ #define index_to_twon(n) (1<numkern; + } else { + printf("You must call subharm_ffdot_plane() with numharm=1 and\n"); + printf("harnum=1 before you use other values! Exiting.\n\n"); + exit(0); + } + } + + harm_fract = (double) harmnum / (double) numharm; + drlo = calc_required_r(harm_fract, fullrlo); + drhi = calc_required_r(harm_fract, fullrhi); + rlo = (int) floor(drlo) ; + zlo = calc_required_z(harm_fract, obs->zlo) ; + + if(numharm==1 && harmnum==1){ + fundamental->rlo = rlo; + fundamental->zlo = zlo; + } + + + /*Initialize the z-lookup indices*/ + //printf("\ndebug, accel_utils.c line 1034, numzs_full= %d, numrs_full=%d, obs.zlo=%f, obs.zhi=%f\n", numzs_full, numrs_full, obs->zlo, obs->zhi); + unsigned short *zinds ; + zinds = (unsigned short *)malloc(numzs_full * sizeof(unsigned short)); + { + if (numharm > 1) { + int zz , subz, zind; + for (ii = 0; ii < numzs_full; ii++) { + zz = obs->zlo + ii * ACCEL_DZ;//ACCEL_DZ=2 + subz = calc_required_z(harm_fract, zz); + zind = index_from_z(subz, zlo); + zinds[ii]=zind; + } + } + } + + + + /* Initialize the r-lookup indices */ + if (numharm > 1) { + double rr, subr; + for (ii = 0; ii < numrs_full; ii++) { + rr = fullrlo + ii * ACCEL_DR; + subr = calc_required_r(harm_fract, rr); + shi->rinds[ii] = index_from_r(subr, rlo); + } + } + rinds = shi->rinds; + + numrs = (int) ((ceil(drhi) - floor(drlo)) + * ACCEL_RDR + DBLCORRECT) + 1; + if (numharm == 1 && harmnum == 1) { + numrs = ACCEL_USELEN; + } else { + if (numrs % ACCEL_RDR) { + numrs = (numrs / ACCEL_RDR + 1) * ACCEL_RDR; + } + } + + + numzs = shi->numkern ; + + binoffset = shi->kern[0].kern_half_width; + fftlen = shi->kern[0].fftlen; + lobin = rlo - binoffset; + hibin = (int) ceil(drhi) + binoffset; + numdata = hibin - lobin + 1; + nice_numdata = next2_to_n(numdata); // for FFTs + data = get_fourier_amplitudes(lobin, nice_numdata, obs); //lobin, nice_numdata, obs->lobin + + + if (!obs->mmap_file && !obs->dat_input && 0) + printf("This is newly malloc'd!\n"); + // Normalize the Fourier amplitudes + if (obs->nph > 0.0) { + // Use freq 0 normalization if requested (i.e. photons) + printf("\nDebug, accel_utils.c line 1073: Use freq 0 normalization\n"); + double norm = 1.0 / sqrt(obs->nph); + for (ii = 0; ii < numdata; ii++) { + data[ii].r *= norm; + data[ii].i *= norm; + } + } else if (obs->norm_type == 0) { + // old-style block median normalization + float *powers; + double norm; + + powers = gen_fvect(numdata); + for (ii = 0; ii < numdata; ii++) + powers[ii] = POWER(data[ii].r, data[ii].i); + norm = 1.0 / sqrt(median(powers, numdata)/log(2.0)); + vect_free(powers); + for (ii = 0; ii < numdata; ii++) { + data[ii].r *= norm; + data[ii].i *= norm; + } + } else { + // new-style running double-tophat local-power normalization + float *powers, *loc_powers; + + powers = gen_fvect(nice_numdata); + for (ii = 0; ii < nice_numdata; ii++) { + powers[ii] = POWER(data[ii].r, data[ii].i); + } + loc_powers = corr_loc_pow(powers, nice_numdata); + for (ii = 0; ii < numdata; ii++) { + float norm = invsqrt(loc_powers[ii]); + data[ii].r *= norm; + data[ii].i *= norm; + } + vect_free(powers); + vect_free(loc_powers); + } + + /* Perform the correlations */ + + datainf = RAW; + nrs = corr_complex_gpu(data, numdata, datainf, + kernel_vect_on_gpu, fftlen, FFT, + result, numrs, binoffset, + ACCEL_NUMBETWEEN, binoffset, CORR, + numzs, stage, numharm, harmnum, + offset_array, d_data, d_result, + zinds, rinds, numzs_full, numrs_full, + d_fundamental, + d_zinds, d_rinds, + obs->zlo, fullrlo, harm_fract, zlo, rlo); + + // Always free data + vect_free(data); + + free(zinds); +} +#endif +//----------------------------------------------------------------------------------------- + +//----------------------------------------------------------------------------------------- +//Initialize the fundamental ffdotpows +#ifdef USECUDA +ffdotpows *init_fundamental_ffdot(int numharm, int harmnum, + double fullrlo, double fullrhi, + subharminfo * shi, accelobs * obs, + int stage, int **offset_array, + fcomplex *kernel_vect_on_gpu, fcomplex *d_data, fcomplex *d_result + ) +{ + static int numrs_full = 0, numzs_full = 0; + float powargr, powargi; + double drlo, drhi, harm_fract; + ffdotpows *ffdot; + + /* numrs, numzs, rlo, zlo , rinds*/ + int numrs, numzs, rlo, zlo; + unsigned short *rinds ; + + if (numrs_full == 0) { + if (numharm == 1 && harmnum == 1) { + numrs_full = ACCEL_USELEN; + numzs_full = shi->numkern; + } else { + printf("You must call subharm_ffdot_plane() with numharm=1 and\n"); + printf("harnum=1 before you use other values! Exiting.\n\n"); + exit(0); + } + } + ffdot = (ffdotpows *) malloc(sizeof(ffdotpows)); + + /* Calculate and get the required amplitudes */ + + harm_fract = (double) harmnum / (double) numharm; + drlo = calc_required_r(harm_fract, fullrlo); + drhi = calc_required_r(harm_fract, fullrhi); + rlo = (int) floor(drlo) ; + ffdot->rlo = rlo; + zlo = calc_required_z(harm_fract, obs->zlo) ; + ffdot->zlo = zlo; + + numrs = (int) ((ceil(drhi) - floor(drlo)) + * ACCEL_RDR + DBLCORRECT) + 1; + ffdot-> numrs = numrs; + if (numharm == 1 && harmnum == 1) { + numrs = ACCEL_USELEN; + } else { + if (numrs % ACCEL_RDR) { + numrs = (numrs / ACCEL_RDR + 1) * ACCEL_RDR; + } + } + + ffdot-> numrs = numrs; + + numzs = shi->numkern ; + ffdot->numzs = numzs; + + if (!obs->mmap_file && !obs->dat_input && 0) + printf("This is newly malloc'd!\n"); + + ffdot->powers = gen_fmatrix(numzs, numrs); + + return ffdot; +} +#endif +//----------------------------------------------------------------------------------------- ffdotpows *copy_ffdotpows(ffdotpows * orig) { @@ -1172,6 +1439,51 @@ GSList *search_ffdotpows(ffdotpows * ffdot, int numharm, return cands; } +#ifdef USECUDA +GSList *search_ffdotpows_sort_gpu_result(ffdotpows * ffdot, int numharm, + accelobs * obs, GSList * cands, accel_cand_gpu *cand_gpu_cpu, int nof_cand, int numzs, int numrs) +{ + int ii, jj; + float powcut; + long long numindep; + + powcut = obs->powcut[twon_to_index(numharm)]; + numindep = obs->numindep[twon_to_index(numharm)]; + + float pow, sig; + double rr, zz; + int added = 0; + + int rind, zind; + + if(nof_cand > 0) + { + for(ii=0; ii< nof_cand; ii++) + { + added = 0; + + pow = cand_gpu_cpu[ii].pow; + sig = candidate_sigma(pow, numharm, numindep); + + zind = cand_gpu_cpu[ii].z_ind; + rind = cand_gpu_cpu[ii].r_ind; + + rr = (ffdot->rlo + rind * (double) ACCEL_DR) / (double) numharm; + + zz = (ffdot->zlo + zind * (double) ACCEL_DZ) / (double) numharm; + + cands = insert_new_accelcand(cands, pow, sig, numharm, rr, zz, &added); + if (added && !obs->dat_input) + fprintf(obs->workfile, + "%-7.2f %-7.4f %-2d %-14.4f %-14.9f %-10.4f\n", + pow, sig, numharm, rr, rr / obs->T, zz); + } + } + + return cands; +} +#endif + void deredden(fcomplex * fft, int numamps) /* Attempt to remove rednoise from a time series by using */ /* a median-filter of logarithmically increasing width. */ @@ -1518,10 +1830,28 @@ void create_accelobs(accelobs * obs, infodata * idata, Cmdline * cmd, int usemma memuse = sizeof(float) * (obs->highestbin + ACCEL_USELEN) \ * obs->numbetween * obs->numz; printf("Full f-fdot plane would need %.2f GB: ", (float)memuse / gb); - if (memuse < MAXRAMUSE || cmd->inmemP) { + if (memuse < MAXRAMUSE || cmd->inmemP) { + #ifdef USECUDA + if(cmd->cudaP == 1) + { + printf("using in-memory accelsearch is possible, but the on GPU in-memory is not ready yet.\n\n"); + printf("so on GPU, using standard accelsearch.\n\n"); + obs->inmem = 0; + obs->ffdotplane = NULL; + } + else + { printf("using in-memory accelsearch.\n\n"); obs->inmem = 1; obs->ffdotplane = gen_fvect(memuse / sizeof(float)); + } + #else + { + printf("using in-memory accelsearch.\n\n"); + obs->inmem = 1; + obs->ffdotplane = gen_fvect(memuse / sizeof(float)); + } + #endif } else { printf("using standard accelsearch.\n\n"); obs->inmem = 0; diff --git a/src/accel_utils_gpu.cu b/src/accel_utils_gpu.cu new file mode 100644 index 000000000..1cbdd2ad3 --- /dev/null +++ b/src/accel_utils_gpu.cu @@ -0,0 +1,762 @@ +//GPU functions for accelsearch +//by Jintao Luo, NRAO + +// includes, CUDA +#include +#include +#include +#include +#include + +#include "device_functions.h" + +#include "accel_utils_gpu.h" + +typedef float2 Complex; + +//define a texture memory +texture tex_d_kernel; +texture tex_d_data; + +texture tex_d_result; + +texture tex_d_result_2D; +cudaChannelFormatDesc channelDesc ; + +texture tex_d_zinds; +texture tex_d_rinds; + +texture tex_d_fundamental; + +static __device__ __host__ inline fcomplex ComplexScale(fcomplex, float); +static __device__ __host__ inline fcomplex ComplexMul(fcomplex, fcomplex); +static __device__ __host__ inline fcomplex ComplexMul_02(fcomplex, fcomplex); + +static __global__ void ComplexPointwiseMulAndScale_one_loop(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset); + +static __global__ void ComplexPointwiseMulAndScale_one_loop_02(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset); + +static __global__ void Complex_Pow_and_Chop(float *d_pow, fcomplex *d_result, int fftlen, int numkern_in_array, int chopbins, int numtocopy); + +static __global__ void add_ffdotpows_on_gpu(float *d_fundamental, fcomplex *d_result, int numzs_full, int numrs_full, unsigned short *zinds, unsigned short *rinds, int fftlen, int chopbins, double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo); + + +extern "C" +extern int search_ffdotpows_gpu(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, accel_cand_gpu * cand_array_sort_gpu, int numzs, int numrs, accel_cand_gpu *cand_gpu_cpu); + +static __global__ void search_ffdotpows_kernel(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, int numzs, int numrs, int *d_addr_jluo); + +extern "C" +fcomplex * prep_data_on_gpu(subharminfo **subharminfs, int numharmstages); + +extern "C" +fcomplex * prep_result_on_gpu(subharminfo **subharminfs, int numharmstages); + +extern "C" +fcomplex * cp_kernel_array_to_gpu(subharminfo **subharminfs, int numharmstages, int **offset_array); + +extern "C" +fcomplex * cp_input_to_gpu(fcomplex *input_vect_on_cpu, long long numbins, long long N); + +extern "C" +unsigned short *prep_rz_inds_on_gpu(int size_inds); + +extern "C" +float * prep_float_vect_on_gpu( int size); + +extern "C" +accel_cand_gpu *prep_cand_array(int size); + +extern "C" +void cudaFree_kernel_vect(fcomplex *in); + +extern "C" +void select_cuda_dev(int cuda_inds); + +extern "C" +void complex_corr_conv_gpu(fcomplex * data, fcomplex * kernel_vect_on_gpu, + int numdata, + int numkern_in_array, + int stage, int harmtosum, int harmnum, + int ** offset_array, fcomplex *d_data, fcomplex *d_result, + int chopbins, int numtocopy, + unsigned short *zinds, unsigned short *rinds, + int numzs_full, int numrs_full, + float *d_fundamental, + unsigned short *d_zinds, unsigned short *d_rinds, + int datainf_flag, + presto_ffts ffts, presto_optype type, + double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo); + + +extern "C" +void init_cuFFT_plans(subharminfo **subharminfs, int numharmstages); + +extern "C" +void destroy_cuFFT_plans(subharminfo **subharminfs, int numharmstages); + +cufftHandle plan_data_array[16][16]; +cufftHandle plan_result_array[16][16]; + +/******************************************** complex_corr_conv ********************************************************************/ +void complex_corr_conv_gpu(fcomplex * data, fcomplex * kernel_vect_on_gpu, + int numdata, + int numkern_in_array, + int stage, int harmtosum, int harmnum, + int ** offset_array, fcomplex *d_data, fcomplex *d_result, + int chopbins, int numtocopy, + unsigned short *zinds, unsigned short *rinds, + int numzs_full, int numrs_full, + float *d_fundamental, + unsigned short *d_zinds, unsigned short *d_rinds, + int datainf_flag, + presto_ffts ffts, presto_optype type, + double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo) +{ + + + int fftlen = numdata; + int kernel_array_offset; + + if (ffts > 3) { + printf("\nIllegal 'ffts' option (%d) in complex_corr_conv().\n", ffts); + printf("Exiting.\n\n"); + exit(1); + } + if (type > 3) { + printf("\nIllegal 'type' option (%d) in complex_corr_conv().\n", type); + printf("Exiting.\n\n"); + exit(1); + } + + if(harmtosum==1 && harmnum==1){ + kernel_array_offset = offset_array[0][0]; + } + if(harmtosum > 1){ + kernel_array_offset = offset_array[stage][harmnum-1]; + } + + //copy data to GPU memory + checkCudaErrors(cudaMemcpy(d_data, data, sizeof(fcomplex) * fftlen, cudaMemcpyHostToDevice)); + //FFT data on GPU + if(datainf_flag == 1) + { + if(harmtosum==1 && harmnum==1){ + checkCudaErrors(cufftExecC2C(plan_data_array[0][0], (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD)); + } + if(harmtosum > 1){ + checkCudaErrors(cufftExecC2C(plan_data_array[stage][harmnum-1], (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD)); + } + + } + + //Bind data and kernel to Texture Memory + cudaBindTexture(NULL, tex_d_kernel, kernel_vect_on_gpu, sizeof(fcomplex) * ( kernel_array_offset + 1 + numkern_in_array * fftlen ) ); + cudaBindTexture(NULL, tex_d_data, d_data, sizeof(fcomplex) * fftlen); + //Mul the FFTed data with Kernels + if (type == CORR || type == INPLACE_CORR) { + ComplexPointwiseMulAndScale_one_loop<<<512, 512>>>(d_result, d_data, kernel_vect_on_gpu, numkern_in_array, fftlen, 1.0/fftlen, kernel_array_offset); + } + else { + ComplexPointwiseMulAndScale_one_loop_02<<<512, 512>>>(d_result, d_data, kernel_vect_on_gpu, numkern_in_array, fftlen, 1.0/fftlen, kernel_array_offset); + } + //unbind the data and kenerl from Texture memory + cudaUnbindTexture(tex_d_kernel); + cudaUnbindTexture(tex_d_data); + + + //Inverse FFT + if(harmtosum==1 && harmnum==1){ + checkCudaErrors(cufftExecC2C(plan_result_array[0][0], (cufftComplex *)d_result, (cufftComplex *)d_result, CUFFT_INVERSE)); + } + if(harmtosum > 1){ + checkCudaErrors(cufftExecC2C(plan_result_array[stage][harmnum-1], (cufftComplex *)d_result, (cufftComplex *)d_result, CUFFT_INVERSE)); + } + + //bind the FFTed result + //cudaBindTexture(NULL, tex_d_result, d_result, sizeof(fcomplex) * fftlen * numkern_in_array ); + //bind d_result to its 2D texture memory + checkCudaErrors(cudaBindTexture2D(NULL, tex_d_result_2D, d_result, channelDesc, fftlen, numkern_in_array, sizeof(fcomplex) * fftlen )); + //sum harmonics + if(harmtosum==1 && harmnum==1){//if fundamental + Complex_Pow_and_Chop<<<512, 512>>>(d_fundamental, d_result, fftlen, numkern_in_array, chopbins, numtocopy); + } + if(harmtosum > 1){ //if harmonics + //move zinds and rinds to GPU + checkCudaErrors(cudaMemcpy(d_zinds, zinds, sizeof(unsigned short) * numzs_full, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_rinds, rinds, sizeof(unsigned short) * numrs_full, cudaMemcpyHostToDevice)); + //bind zinds and rinds to Texture Memory + cudaBindTexture(NULL, tex_d_zinds, d_zinds, sizeof(unsigned short) * numzs_full ); + cudaBindTexture(NULL, tex_d_rinds, d_rinds, sizeof(unsigned short) * numrs_full ); + //add_ffdotpows_on_gpu + add_ffdotpows_on_gpu<<<512, 512>>>(d_fundamental, d_result, numzs_full, numrs_full, d_zinds, d_rinds, fftlen, chopbins, obs_zlo, fullrlo, harm_fract, zlo, rlo); + //Unbind zinds and rinds from Texture Memory + cudaUnbindTexture(tex_d_zinds); + cudaUnbindTexture(tex_d_rinds); + } + + //cudaUnbindTexture(tex_d_result); + checkCudaErrors(cudaUnbindTexture(tex_d_result_2D)); + +} + +/******************************************** add fftdot pows on GPU, choping included ************************************************/ +static __global__ void add_ffdotpows_on_gpu(float *d_fundamental, fcomplex *d_result, int numzs_full, int numrs_full, unsigned short *zinds, unsigned short *rinds, int fftlen, int chopbins, double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo) +{ + + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; + + //int addr_z, addr_r, addr_result, addr_fundamental ; + int addr_z, addr_r ; + int addr_result ; + + Complex buf; + + int z_index, r_index; + + for (int i = threadID; i < numrs_full * numzs_full; i += numThreads){ + + z_index = i/numrs_full ; + addr_z = tex1Dfetch(tex_d_zinds, z_index) ; + + r_index = i - z_index * numrs_full ; + addr_r = tex1Dfetch(tex_d_rinds, r_index) ; + + //addr_result = addr_z * fftlen + chopbins + addr_r ; + //buf = tex1Dfetch(tex_d_result, addr_result) ; + + buf = tex2D(tex_d_result_2D, chopbins + addr_r, addr_z); + + d_fundamental[i] += buf.x * buf.x + buf.y * buf.y ; + + } + +} + + + + +/********************************************* Calculate complex ffdot pows, choping included *****************************************/ +static __global__ void Complex_Pow_and_Chop(float *d_pow, fcomplex *d_result, int fftlen, int numkern_in_array, int chopbins, int numtocopy) +/* + calaulate POW of data in d_result + d_pw: where the result stored + d_result: the array contains data to Pow_and_chop + fftlen: length of FFT used in the complex_corr_conv_gpu + numkern_in_array: num of kernels in array in complex_corr_conv_gpu + chopbins: num of bins that should be discarded in the head and tail of the result vectors + numtocopy: num of data should be kept in each vector of d_result, after the first chopbins points +*/ +{ + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; + + int addr_result; + + Complex buf; + + int z_ind, r_ind ; + + for (int i = threadID; i < numkern_in_array*numtocopy; i += numThreads){ + + z_ind = i/numtocopy; + r_ind = i - z_ind * numtocopy; + + //addr_result = z_ind * fftlen + chopbins + r_ind; + + //buf = tex1Dfetch(tex_d_result, addr_result) ; + + buf = tex2D(tex_d_result_2D, chopbins + r_ind, z_ind); + + d_pow[i] = buf.x * buf.x + buf.y * buf.y ; + } + +} + + +/**********************************************************************************************************************************************************/ +//complex operations + +// Complex scale +static __device__ __host__ inline fcomplex ComplexScale(fcomplex a, float s) +{ + fcomplex c; + c.r = s * a.r; + c.i = s * a.i; + return c; +} + +// Complex multiplication +static __device__ __host__ inline fcomplex ComplexMul(fcomplex a, fcomplex b) +{ + fcomplex c; + c.r = a.r * b.r + a.i * b.i; + c.i = a.i * b.r - a.r * b.i; + return c; +} + +static __device__ __host__ inline fcomplex ComplexMul_02(fcomplex a, fcomplex b) +{ + fcomplex c; + c.r = a.r * b.r - a.i * b.i; + c.i = a.i * b.r + a.r * b.i; + return c; +} + + +//Use one loop to realize Complex Point Mul and Scale +static __global__ void ComplexPointwiseMulAndScale_one_loop(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset) +{ + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; + + fcomplex a_buf ; + fcomplex b_buf ; + Complex buf_buf_a, buf_buf_b; + int i; + + int a_index, b_index; + + int total_num = data_size*numkern_in_array; + + for (i = threadID; i < total_num; i += numThreads) + { + a_index = i/data_size; + a_index = i - a_index * data_size ; + buf_buf_a = tex1Dfetch(tex_d_data, a_index); + a_buf.r = buf_buf_a.x; + a_buf.i = buf_buf_a.y; + + b_index = i + kernel_array_offset; + buf_buf_b = tex1Dfetch(tex_d_kernel, b_index) ; + + b_buf.r = buf_buf_b.x; + b_buf.i = buf_buf_b.y; + c[i] = ComplexScale(ComplexMul(a_buf, b_buf), scale); + + + } + +} + +static __global__ void ComplexPointwiseMulAndScale_one_loop_02(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset) +{ + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; + + fcomplex a_buf ; + fcomplex b_buf ; + Complex buf_buf_a, buf_buf_b; + int i; + + int a_index, b_index; + + int total_num = data_size*numkern_in_array; + + for (i = threadID; i < total_num; i += numThreads) + { + a_index = i/data_size; + a_index = i - a_index * data_size ; + buf_buf_a = tex1Dfetch(tex_d_data, a_index); + a_buf.r = buf_buf_a.x; + a_buf.i = buf_buf_a.y; + + b_index = i + kernel_array_offset; + buf_buf_b = tex1Dfetch(tex_d_kernel, b_index) ; + + b_buf.r = buf_buf_b.x; + b_buf.i = buf_buf_b.y; + c[i] = ComplexScale(ComplexMul_02(a_buf, b_buf), scale); + + + } + +} +/**********************************************************************************************************************************************************/ + +//-------------------------Prepare search_ffdotpows cand array +accel_cand_gpu *prep_cand_array(int size) +{ + + accel_cand_gpu * cand_on_gpu; + checkCudaErrors(cudaMalloc((void **)&cand_on_gpu, size * sizeof(accel_cand_gpu))); + + return cand_on_gpu ; + +} + +int search_ffdotpows_gpu(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, accel_cand_gpu * cand_array_sort_gpu, int numzs, int numrs, accel_cand_gpu *cand_gpu_cpu) +{ + + int *d_addr; + int h_addr; + + checkCudaErrors(cudaMalloc((void **)&d_addr, sizeof(int) * 1)); + checkCudaErrors(cudaMemset(d_addr, 0, sizeof(int))); // set d_addr to 0 + + //bind d_fundamental to texture + cudaBindTexture(NULL, tex_d_fundamental, d_fundamental, sizeof(float) * numzs * numrs ); + + //search ffdot_pow + search_ffdotpows_kernel<<<512, 512>>>(powcut, d_fundamental, cand_array_search_gpu, numzs, numrs, d_addr); + + //get nof_cand + checkCudaErrors(cudaMemcpy(&h_addr, d_addr, sizeof(int) * 1, cudaMemcpyDeviceToHost)); + int nof_cand ; + nof_cand = h_addr; + + //get the candicates + checkCudaErrors(cudaMemcpy(cand_gpu_cpu, cand_array_search_gpu, sizeof(accel_cand_gpu) * nof_cand, cudaMemcpyDeviceToHost)); + + cudaFree(d_addr); + + cudaUnbindTexture(tex_d_fundamental); + + return nof_cand ; +} + +static __global__ void search_ffdotpows_kernel(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, int numzs, int numrs, int *d_addr_jluo) +{ + + const int numThreads = blockDim.x * gridDim.x; + const int threadID = blockIdx.x * blockDim.x + threadIdx.x; + + int i ; + int nof_cand = 0; + + float pow ; + + int addr_search=0; + + accel_cand_gpu cand_tmp ; + + int z_ind, r_ind; + + for (i = threadID; i < numzs*numrs; i += numThreads) + { + + nof_cand = 0; + + pow = tex1Dfetch(tex_d_fundamental, i); + + if(pow > powcut) + { + cand_tmp.pow = pow ; + nof_cand += 1 ; + + cand_tmp.nof_cand = nof_cand ; + + z_ind = (int)(i/numrs); + cand_tmp.z_ind = z_ind; + + r_ind = i - z_ind * numrs ; + cand_tmp.r_ind = r_ind; + + addr_search = atomicAdd(&d_addr_jluo[0], 1); + + cand_array_search_gpu[ addr_search ] = cand_tmp ; + } + } +} + +//-------------------------Prepare vectors for zinds and rinds +unsigned short *prep_rz_inds_on_gpu(int size_inds) +{ + + unsigned short *inds; + checkCudaErrors(cudaMalloc((void **)&inds, size_inds * sizeof(unsigned short))); + + return inds; + +} + +//-------------------------Prepare a float-point vectors +float * prep_float_vect_on_gpu( int size) +{ + + float * f_vect; + checkCudaErrors(cudaMalloc((void **)&f_vect, size * sizeof(float))); + + return f_vect; + +} + +//----------Copy input data to GPU--------------------------------- +fcomplex * cp_input_to_gpu(fcomplex *input_vect_on_cpu, long long numbins, long long N) +{ + fcomplex *input_vect_on_gpu; + + checkCudaErrors(cudaMalloc((void **)&input_vect_on_gpu, sizeof(fcomplex) * numbins)); + + //check some parameters + printf("\nNum of data points: %d\n", N); + printf("\nNum of bins : %d\n", numbins); + + checkCudaErrors(cudaMemcpy( input_vect_on_gpu, input_vect_on_cpu, sizeof(fcomplex) * numbins, cudaMemcpyHostToDevice) ); + + return input_vect_on_gpu; +} + +//----------Prepare d_data on GPU--------------------- +fcomplex * prep_data_on_gpu(subharminfo **subharminfs, int numharmstages) +//cudaMalloc d_data for the whole search process +{ + + int harm, harmtosum, stage; + int fftlen; + int size_d_data; + + fcomplex *d_data; + + size_d_data = 0; + + fftlen = subharminfs[0][0].kern[0].fftlen; + + size_d_data = fftlen ; + +if (numharmstages > 1) { + + for(stage=1; stage 1) { + + for(stage=1; stage 1) { + for(stage=1; stage 1) { + + for(stage=1; stage 1) { + + for(stage=1; stage 1) { + n >>= 1; + x++; + } + return x; +} + int main(int argc, char *argv[]) { - int ii; + int ii, jj; double ttim, utim, stim, tott; struct tms runtimes; subharminfo **subharminfs; @@ -73,6 +136,7 @@ int main(int argc, char *argv[]) printf("\n\n"); printf(" Fourier-Domain Acceleration Search Routine\n"); printf(" by Scott M. Ransom\n\n"); + printf(" GPU version by Jintao Luo\n\n"); /* Create the accelobs structure */ create_accelobs(&obs, &idata, cmd, 1); @@ -119,8 +183,249 @@ int main(int argc, char *argv[]) obs.workfilenm); } + int use_cuda_flag = 0; + #ifdef USECUDA + if(cmd->cudaP == 1) + { + use_cuda_flag = 1 ; + } + #endif + +#ifdef USECUDA +if(cmd->cudaP == 1){/****************************************call GPU to run the search***************************************************/ +//if( use_cuda_flag == 1 ){ + + printf("\nTry to use a GPU to run the search\n"); + + #ifdef USECUDA + printf( "\nUSECUDA\n" ); + #endif + + select_cuda_dev(cmd->cuda); + + //prepare d_data and d_result on GPU + fcomplex *d_data; + fcomplex *d_result; + d_data = prep_data_on_gpu(subharminfs, obs.numharmstages); + d_result = prep_result_on_gpu(subharminfs, obs.numharmstages); + + //prepare the fundamental on GPU + float *d_fundamental; + d_fundamental = prep_float_vect_on_gpu(subharminfs[0][0].numkern * subharminfs[0][0].kern[0].fftlen ); + + //inds lookup tables for add_ffdot_pow + unsigned short *d_zinds, *d_rinds; + d_zinds = prep_rz_inds_on_gpu(subharminfs[0][0].numkern); + d_rinds = prep_rz_inds_on_gpu(subharminfs[0][0].kern[0].fftlen);//In fact fftlen >= size_d_rinds, but that's ok to have a bit waste + + //Generate the kernel offset_array for stages + int **offset_array; + fcomplex *kernel_vect_on_gpu; + offset_array = (int **)malloc(obs.numharmstages * sizeof(int *)); + offset_array[0] = (int *)malloc( 1 * sizeof(int) ); + for(ii=1; iinumzs, my_fundamental->numrs, cand_gpu_cpu); + cands = search_ffdotpows_sort_gpu_result(my_fundamental, 1, &obs, cands, cand_gpu_cpu, nof_cand, my_fundamental->numzs, my_fundamental->numrs); + + + if (obs.numharmstages > 1) { /* Search the subharmonics */ + int stage, harmtosum, harm; + + for (stage = 1; stage < obs.numharmstages; stage++) { + harmtosum = 1 << stage; + for (harm = 1; harm < harmtosum; harm += 2) { + subharm_ffdot_plane_gpu(harmtosum, harm, startr, lastr, + &subharminfs[stage][harm - 1], &obs, + stage, offset_array, + kernel_vect_on_gpu, d_data, d_result, + my_fundamental, + d_fundamental, + d_zinds, d_rinds); + } + nof_cand = search_ffdotpows_gpu(obs.powcut[twon_to_index(harmtosum)], d_fundamental, cand_array_search_gpu, cand_array_sort_gpu, my_fundamental->numzs, my_fundamental->numrs, cand_gpu_cpu); + cands = search_ffdotpows_sort_gpu_result(my_fundamental, harmtosum, &obs, cands, cand_gpu_cpu, nof_cand, my_fundamental->numzs, my_fundamental->numrs); + } + } + startr = nextr; + } + free_ffdotpows(my_fundamental); + print_percent_complete(obs.highestbin - obs.rlo, + obs.highestbin - obs.rlo, "search", 0); + } + + printf("\n\nDone searching. Now optimizing each candidate.\n\n"); + + destroy_cuFFT_plans( subharminfs, obs.numharmstages ); //destroy cuFFT plans + + printf("\ndebug:.......................\n"); + + //free_subharminfos(obs.numharmstages, subharminfs); //from presto1 + free_subharminfos(&obs, subharminfs); //from presto2 + + { /* Candidate list trimming and optimization */ + int numcands; + GSList *listptr; + accelcand *cand; + fourierprops *props; + + + numcands = g_slist_length(cands); + + if (numcands) { + + /* Sort the candidates according to the optimized sigmas */ + + cands = sort_accelcands(cands); + + /* Eliminate (most of) the harmonically related candidates */ + if ((cmd->numharm > 1) && !(cmd->noharmremoveP)) + eliminate_harmonics(cands, &numcands); + + /* Now optimize each candidate and its harmonics */ + + print_percent_complete(0, 0, NULL, 1); + listptr = cands; + for (ii = 0; ii < numcands; ii++) { + print_percent_complete(ii, numcands, "optimization", 0); + cand = (accelcand *) (listptr->data); + optimize_accelcand(cand, &obs); + listptr = listptr->next; + } + print_percent_complete(ii, numcands, "optimization", 0); + + /* Calculate the properties of the fundamentals */ + + props = (fourierprops *) malloc(sizeof(fourierprops) * numcands); + listptr = cands; + for (ii = 0; ii < numcands; ii++) { + cand = (accelcand *) (listptr->data); + /* In case the fundamental harmonic is not significant, */ + /* send the originally determined r and z from the */ + /* harmonic sum in the search. Note that the derivs are */ + /* not used for the computations with the fundamental. */ + calc_props(cand->derivs[0], cand->r, cand->z, 0.0, props + ii); + /* Override the error estimates based on power */ + props[ii].rerr = (float) (ACCEL_DR) / cand->numharm; + props[ii].zerr = (float) (ACCEL_DZ) / cand->numharm; + listptr = listptr->next; + } + + /* Write the fundamentals to the output text file */ + + output_fundamentals(props, cands, &obs, &idata); + + /* Write the harmonics to the output text file */ + + output_harmonics(cands, &obs, &idata); + + /* Write the fundamental fourierprops to the cand file */ + obs.workfile = chkfopen(obs.candnm, "wb"); + chkfwrite(props, sizeof(fourierprops), numcands, obs.workfile); + fclose(obs.workfile); + free(props); + printf("\n\n"); + } else { + printf("No candidates above sigma = %.2f were found.\n\n", obs.sigma); + } + } + + /* Finish up */ + + printf("Searched the following approx numbers of independent points:\n"); + printf(" %d harmonic: %9lld\n", 1, obs.numindep[0]); + for (ii = 1; ii < obs.numharmstages; ii++) + printf(" %d harmonics: %9lld\n", 1 << ii, obs.numindep[ii]); + + printf("\nTiming summary:\n"); + tott = times(&runtimes) / (double) CLK_TCK - tott; + utim = runtimes.tms_utime / (double) CLK_TCK; + stim = runtimes.tms_stime / (double) CLK_TCK; + ttim = utim + stim; + printf(" CPU time: %.3f sec (User: %.3f sec, System: %.3f sec)\n", + ttim, utim, stim); + printf(" Total time: %.3f sec\n\n", tott); + + printf("Final candidates in binary format are in '%s'.\n", obs.candnm); + printf("Final Candidates in a text format are in '%s'.\n\n", obs.accelnm); + + free_accelobs(&obs); + g_slist_foreach(cands, free_accelcand, NULL); + g_slist_free(cands); + + free(offset_array); + cudaFree_kernel_vect(d_data); + cudaFree_kernel_vect(d_result); + cudaFree_kernel_vect(d_fundamental); + cudaFree_kernel_vect(kernel_vect_on_gpu); + cudaFree_kernel_vect(d_zinds); + cudaFree_kernel_vect(d_rinds); + cudaFree_kernel_vect(cand_array_search_gpu); + cudaFree_kernel_vect(cand_array_sort_gpu); + //cudaFree_kernel_vect(input_vect_on_gpu); + + + free(cand_gpu_cpu); + + return (0); + + +} +#endif + +/****************************************call CPU to run the search***************************************************/ +//else{ +if( use_cuda_flag == 0 ){ + printf("\nUse CPU to run the search\n"); + /* Start the main search loop */ { double startr = obs.rlo, lastr = 0, nextr = 0; ffdotpows *fundamental; @@ -270,3 +575,5 @@ int main(int argc, char *argv[]) g_slist_free(cands); return (0); } + +} diff --git a/src/accelsearch_cmd.c b/src/accelsearch_cmd.c index 1aa0ac91e..e222bb149 100644 --- a/src/accelsearch_cmd.c +++ b/src/accelsearch_cmd.c @@ -1019,6 +1019,9 @@ usage(void) fprintf(stderr,"%s"," -noharmremove: Do not remove harmonically related candidates (never removed for numharm = 1)\n"); fprintf(stderr,"%s"," infile: Input file name of the floating point .fft or .[s]dat file. A '.inf' file of the same name must also exist\n"); fprintf(stderr,"%s"," 1 value\n"); + #ifdef USECUDA + fprintf(stderr,"%s"," -cuda: to run accel_search on GPU, indicate the index of cuda device to be used, 0 means the 1st cuda device \n"); + #endif fprintf(stderr,"%s"," version: 10Nov13\n"); fprintf(stderr,"%s"," "); exit(EXIT_FAILURE); @@ -1174,6 +1177,19 @@ parseCmdline(int argc, char **argv) continue; } + #ifdef USECUDA + if( 0==strcmp("-cuda", argv[i]) ) { + int keep = i; + //cmd.numharmP = 1; + cmd.cudaP = 1; + i = getIntOpt(argc, argv, i, &cmd.cuda, 1); + //cmd.numharmC = i-keep; + //checkIntLower("-numharm", &cmd.numharm, cmd.numharmC, 16); + //checkIntHigher("-numharm", &cmd.numharm, cmd.numharmC, 1); + continue; + }/* -cuda */ + #endif + if( argv[i][0]=='-' ) { fprintf(stderr, "\n%s: unknown option `%s'\n\n", Program, argv[i]); diff --git a/src/corr_routines.c b/src/corr_routines.c index ff17bcd74..5d5aa5bf0 100644 --- a/src/corr_routines.c +++ b/src/corr_routines.c @@ -1,7 +1,308 @@ #include "presto.h" +#ifdef USECUDA +extern void * complex_corr_conv_gpu(fcomplex * data, fcomplex * kernel_vect_on_gpu, + int numdata, + int numkern_in_array, + int stage, int harmtosum, int harmnum, + int ** offset_array, fcomplex *d_data, fcomplex *d_result, + int chopbins, int numtocopy, + unsigned short *zinds, unsigned short *rinds, + int numzs_full, int numrs_full, + fcomplex *d_fundamental, + unsigned short *d_zinds, unsigned short *d_rinds, + int datainf_flag, + presto_ffts ffts, presto_optype type, + double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo); +#endif + /* define DEBUGPRINT */ + +//-------------------------------------------------------------------------- +#ifdef USECUDA +int corr_complex_gpu(fcomplex * data, int numdata, presto_datainf datainf, + fcomplex * kernel_vect_on_gpu, int numkern, presto_datainf kerninf, + fcomplex ** result, int numresult, int lobin, + int numbetween, int kern_half_width, presto_optype optype, + int numkern_in_array, int stage, int harmtosum, int harmnum, + int ** offset_array, fcomplex *d_data, fcomplex *d_result, + unsigned short *zinds, unsigned short *rinds, + int numzs_full, int numrs_full, + //fcomplex *d_fundamental, + float *d_fundamental, + unsigned short *d_zinds, unsigned short *d_rinds, + double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo) + /* This routine is a general correlation or convolution routine */ + /* for complex data. It can perform convolutions or correlations */ + /* on raw complex data, data that is prepared for a convolution/ */ + /* correlation but not FFTd, or already FFTd data. The kernel */ + /* that it uses can also be raw, prepped, or FFTd. If you call */ + /* the routine multiple times with either the same kernel or data */ + /* array, it uses a saved version of the array from the previous */ + /* call to cut down on many processing steps. The return value */ + /* tells how many usable (i.e. non-contaminated) points were */ + /* returned in the result array (the first value will be that of */ + /* 'lobin'). This routine will _not_ perform in-place */ + /* correlations or convolutions (i.e. it ignores those choices */ + /* for 'optype'). */ + /* Arguments: */ + /* 'data' is a complex array of the data to be interpolated. */ + /* 'numdata' is the number of complex points in 'data'. */ + /* 'datainf' is one of the following that describes the data: */ + /* RAW = Normal un-altered complex data. */ + /* PREPPED = Data has been padded and spread based */ + /* on 'kern_half_width' and 'numbetween' */ + /* and is ready to be FFTd. */ + /* FFT = Data has already been prepared and FFTd. */ + /* SAME = Data is the same as the previous call. */ + /* The routine uses its saved data. */ + /* 'kern' is the correlation kernel. */ + /* 'numkern' is the number of complex points in 'kern'. */ + /* 'kerninf' is one of the same choices as 'datainf' above. */ + /* 'result' is the resulting complex array (must already exist). */ + /* 'numresult' is the number of complex points in 'result'. */ + /* 'lobin' is the lowest fourier bin to convolve/correlate. */ + /* 'numbetween' is the number of bins to spread the data points. */ + /* 'kern_half_width' is half the width (bins) of the raw kernel. */ + /* 'optype' is either CORR or CONV (correlation or convolution). */ + /* Notes: */ + /* If either 'datainf' or 'kerninf' are of type PREPPED or FFT, */ + /* then the length of the FFTs used in the correlation/ */ + /* convolution calculations will be of length 'numdata' or */ + /* 'numkern'. If both 'datainf' and 'kerninf' are of type */ + /* PREPPED or FFT then 'numdata' and 'numkern' must have the */ + /* same value. In order for SAME values of 'datainf' and */ + /* 'kerninf' to help out, the routine must be called with the */ + /* same values for 'kern_half_width' and 'numbetween' as well. */ +{ + static fcomplex *kernarray, *dataarray; + static int firsttime = 1, oldnumbetween, oldkern_half_width; + static int oldfftlen, oldnumdata, oldlobin; + fcomplex *tmpdata = NULL, zeros = { 0.0, 0.0 }; + int ii, jj, fftlen = 1, numbins, beginbin, endbin, padlo, padhi; + fcomplex *h_result; + + /* Check entry arguments for validity */ + + if (numdata <= 0) { + printf("\n 'numdata' = %d (out of bounds) in corr_complex().\n\n", numdata); + exit(1); + } + if (numkern <= 0 || numkern > MAXREALFFT / 4) { + printf("\n 'numkern' = %d (out of bounds) in corr_complex().\n\n", numkern); + exit(1); + } + if (numresult <= 0 || numresult > MAXREALFFT / 4) { + printf("\n 'numresult' = %d (out of bounds) in corr_complex().\n\n", + numresult); + exit(1); + } + if (numbetween < 0 || numbetween > 20000) { + printf("\n 'numbetween' = %d (out of bounds) in corr_complex().\n\n", + numbetween); + exit(1); + } + if (numresult % numbetween != 0) { + printf("\n 'numresult' = %d must be a multiple of ", numresult); + printf("'numbetween' in corr_complex().\n\n"); + exit(1); + } + if (lobin < 0 || lobin >= numdata) { + printf("\n 'lobin' = %d (out of bounds) in corr_complex().\n\n", lobin); + exit(1); + } + if ((datainf == SAME || kerninf == SAME) && firsttime) { + printf("\n Can't call corr_complex() with 'datainf' or 'kerninf'\n"); + printf(" being SAME if this is the 1st time calling the routine.\n\n"); + } + if (datainf == SAME && kerninf == SAME && lobin == oldlobin) { + printf("\n Doesn't make sense to call corr_complex() with SAME for\n"); + printf(" both 'datainf' and 'kerninf' if 'lobin' hasn't changed.\n\n"); + exit(1); + } + if (datainf == PREPPED || datainf == FFT || kerninf == PREPPED || kerninf == FFT) { + if (datainf == PREPPED || datainf == FFT) + fftlen = numdata; + if (kerninf == PREPPED || kerninf == FFT) + fftlen = numkern; + if ((datainf == PREPPED || datainf == FFT) && + (kerninf == PREPPED || kerninf == FFT)) { + if (numdata != numkern) { + printf("\n 'numdata' must equal 'numkern' if the data array and\n"); + printf(" the kernel are either PREPPED or FFT in corr_complex().\n\n"); + exit(1); + } + } + } else if (datainf == SAME || kerninf == SAME) { + fftlen = oldfftlen; + } else { +#ifdef DEBUGPRINT + printf("Yikes!!\n"); +#endif + fftlen = next2_to_n(numresult + 2 * kern_half_width * numbetween); + } + if (fftlen > MAXREALFFT / 4) { + printf("\n 'fftlen' = %d (out of bounds) in corr_complex().\n\n", fftlen); + exit(1); + } + if (kerninf == RAW || kerninf == PREPPED) { + if (2 * kern_half_width * numbetween > numkern) { + printf("\n 'numkern = %d (out of bounds in corr_complex().\n", numkern); + printf(" If 'kerninf' == RAW or PREPPED, 'numkern' must be >=\n"); + printf(" to 2 * 'kern_half_width' * 'numbetween'.\n\n"); + exit(1); + } + } else if (kerninf == SAME) { + if (lobin != oldlobin || kern_half_width != oldkern_half_width) { + printf("\n When 'kerninf' = SAME, 'lobin' and 'kern_half_width'\n"); + printf(" must match their previous values in corr_complex().\n\n"); + exit(1); + } + } +#ifdef DEBUGPRINT + printf("numdata = %d fftlen = %d\n", numdata, fftlen); +#endif + + /* Prep the data array */ + + if (firsttime || !(datainf == SAME && + lobin == oldlobin && + fftlen == oldfftlen && + numdata == oldnumdata && numbetween == oldnumbetween)) { + + numbins = fftlen / numbetween; + beginbin = lobin - kern_half_width; + endbin = beginbin + numbins; + tmpdata = data + beginbin; + + if (datainf == RAW) { + +#ifdef DEBUGPRINT + printf("Prepping, spreading, and FFTing the data...\n"); +#endif + /* Zero-pad if necessary (when looking at the beginning or */ + /* the end of the data array) */ + + if ((beginbin < 0) || (endbin > numdata)) { + tmpdata = gen_cvect(numbins); + for (ii = 0; ii < numbins; ii++) + tmpdata[ii] = zeros; + padlo = (beginbin < 0) ? abs(beginbin) : 0; + padhi = ((endbin - numdata) > 0) ? endbin - numdata : 0; + memcpy(tmpdata + padlo, data + beginbin + padlo, + sizeof(fcomplex) * (numbins - padhi - padlo)); + } + + /* Spread the data */ + + if (!firsttime) + vect_free(dataarray); + dataarray = gen_cvect(fftlen); + spread_no_pad(tmpdata, numbins, dataarray, fftlen, numbetween); + + if ((beginbin < 0) || (endbin > numdata)) + vect_free(tmpdata); + + /* FFT the Data */ + + //COMPLEXFFT(dataarray, fftlen, -1);//debug, default it is comment out + + } else if (datainf == PREPPED) { + +#ifdef DEBUGPRINT + printf("FFTing and copying the data...\n"); +#endif + + if (!firsttime) + vect_free(dataarray); + dataarray = gen_cvect(fftlen); + memcpy(dataarray, data, sizeof(fcomplex) * fftlen); + + /* FFT the Data */ + + //COMPLEXFFT(dataarray, fftlen, -1);//debug, default it is comment out + + } else if (datainf == FFT) { + +#ifdef DEBUGPRINT + printf("Just copying the data...\n"); +#endif + + if (!firsttime) + vect_free(dataarray); + dataarray = gen_cvect(fftlen); + memcpy(dataarray, data, sizeof(fcomplex) * fftlen); + + } + } + + /* Prep the kernel array */ + + /*calculate chop parameters*/ + int chopbins = kern_half_width * numbetween ; + if (fftlen < 2 * chopbins) { + printf("\n 'chopbins' is too large in chop_complex_ends()\n\n"); + exit(1); + } + + int numtocopy = fftlen - 2 * chopbins; + if (numresult < numtocopy) + numtocopy = numresult;// use chopbins and numtocopy to do chop on GPU + + if( datainf == FFT )//No fft in the conv + { + complex_corr_conv_gpu(dataarray, kernel_vect_on_gpu, + fftlen, + numkern_in_array, + stage, harmtosum, harmnum, + offset_array, d_data, d_result, + chopbins, numtocopy, + zinds, rinds, + numzs_full, numrs_full, + d_fundamental, + d_zinds, d_rinds, + 0, + NOFFTS, optype, + obs_zlo, fullrlo, harm_fract, zlo, rlo); + } + + if( datainf == RAW )//implement fft in the conv + { + complex_corr_conv_gpu(dataarray, kernel_vect_on_gpu, + fftlen, + numkern_in_array, + stage, harmtosum, harmnum, + offset_array, d_data, d_result, + chopbins, numtocopy, + zinds, rinds, + numzs_full, numrs_full, + d_fundamental, + d_zinds, d_rinds, + 1, + NOFFTS, optype, + obs_zlo, fullrlo, harm_fract, zlo, rlo); + } + + + /* Set variables for next time... */ + + if (firsttime) + firsttime = 0; + oldlobin = lobin; + oldfftlen = fftlen; + oldnumdata = numdata; + oldnumbetween = numbetween; + oldkern_half_width = kern_half_width; + if (numresult < fftlen - 2 * numbetween * kern_half_width) + return numresult; + else + return fftlen - 2 * numbetween * kern_half_width; +} +#endif +//-------------------------------------------------------------------------- + + int corr_complex(fcomplex * data, int numdata, presto_datainf datainf, fcomplex * kern, int numkern, presto_datainf kerninf, fcomplex * result, int numresult, int lobin, From 74d6de8ecc34e72bde53a2388ad26ba7eabeff48 Mon Sep 17 00:00:00 2001 From: jintaoluo Date: Fri, 16 May 2014 18:10:22 -0400 Subject: [PATCH 2/5] Makefile & Howto on the GPUed PRESTO2 --- Howto_GPU.md | 23 ++++++++++++ src/Makefile | 103 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 Howto_GPU.md diff --git a/Howto_GPU.md b/Howto_GPU.md new file mode 100644 index 000000000..6ea4fb79c --- /dev/null +++ b/Howto_GPU.md @@ -0,0 +1,23 @@ +#GPUed PRESTO + +The program of accelsearch in PRESTO was accelerated with CUDA by Jintao Luo. Any questions on how to compile and use the GPU routine may go to jluo@nrao.edu + +##How to compile + +###compile for CPU-only +By default PRESTO will be compiled for CPU-only. This is controlled by line 22 in $PRESTO/src/Makefile. For CPU-only, this line should be as: +>use_cuda = no# yes/no + +###compile with GPU routines +Open the $PRESTO/src/Makefile + +1. Make sure line 22 is: +>use_cuda = yes# yes/no + +2. Check the CUDA-related variables and flags, and modify them if necessary. + +##How to use +To run accelsearch on GPU, use the -cuda option. Or the program will run on CPU. For example: +>accelsearch -numharm 16 -zmax 256 ur_data.dat -cuda 0 + +0 means using the 1st GPU in your machine. \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index aad332b51..790fb6d2a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,6 @@ # Makefile for PRESTO # by Scott M. Ransom +# GPU-related sections by Jintao Luo # OS type OS = Linux @@ -8,13 +9,60 @@ OS = Linux # Linux is the first choice ifeq ($(OS),Linux) LIBSUFFIX = .so - LIBCMD = -shared + LIBCMD = -shared # else assume Darwin (i.e. OSX) else LIBSUFFIX = .dylib LIBCMD = -dynamiclib endif + +#------------------------------------------------------------------------------------------ +#determine if compile with CUDA +use_cuda = no# yes/no + +ifeq ($(use_cuda),yes) +$(info Compile with CUDA) +#****************************************************************** +# for GUP version, CUDA +# CUDA compile option +# CUDA code generation flags + GENCODE_SM10 := -gencode arch=compute_10,code=sm_10 + GENCODE_SM10 := -gencode arch=compute_11,code=sm_11 + GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 + GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 + GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30) +# Location of the CUDA Toolkit binaries and libraries +#modify it if necessary +#CUDA_PATH ?= /usr/local/cuda-5.0 + CUDA_PATH ?= /usr/local/cuda + +#modify it if necessary + CUDA_INC_PATH ?= -I $(CUDA_PATH)/include -I. -I.. -I $(CUDA_PATH)/samples/common/inc/ -I$(PRESTO)/include + + CUDA_BIN_PATH ?= $(CUDA_PATH)/bin + NVCC ?= $(CUDA_BIN_PATH)/nvcc + +#modify it if necessary, for example, if 32-bit machine, -m32 + NVCCFLAGS ?= -m64 -ccbin g++ + + EXTRA_NVCCFLAGS ?= -Xcompiler -fPIC -O2 + +#modify it if necessary, -m64 to -m32 for 32-bit machine + LINKCOMMAND_CUDA = g++ -m64 -O2 -o + +#modify it if necessary, maybe lib64 should be lib32? + CUDA_LINK_EXTRA_FLAGS = -L $(CUDA_PATH)/lib64 -lcufft -lcudart +#****************************************************************** +endif + +ifeq ($(use_cuda),no) +$(info Compile without CUDA) +endif +#------------------------------------------------------------------------------------------ + + + # How to link with some needed libraries of PGPLOT X11LINK := $(shell pkg-config --libs x11) PNGLINK := $(shell pkg-config --libs libpng) @@ -37,7 +85,14 @@ CFITSIOINC := $(shell pkg-config --cflags cfitsio) CFITSIOLINK := $(shell pkg-config --libs cfitsio) # The standard PRESTO libraries to link into executables +#PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK) +ifeq ($(use_cuda),no) PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK) +endif + +ifeq ($(use_cuda),yes) +PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK) $(CUDA_LINK_EXTRA_FLAGS) -L $(SYSDIR)/lib/x86_64-linux-gnu -lstdc++ +endif CC = gcc FC = gfortran @@ -46,6 +101,11 @@ FC = gfortran CFLAGS = -I$(PRESTO)/include $(GLIBINC) $(CFITSIOINC) $(PGPLOTINC) $(FFTINC) \ -DUSEFFTW -DUSEMMAP -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 \ -g -O3 -ffast-math -Wall -W -fPIC + +ifeq ($(use_cuda),yes) + CFLAGS += -DUSECUDA +endif + # -g -Wall -W -fPIC CLINKFLAGS = $(CFLAGS) # NOTE: Be careful of upping the optimization on the @@ -69,6 +129,20 @@ CLIG = clig mv ../clig/$*_cmd.c . cp ../clig/$*.1 ../docs/ +ifeq ($(use_cuda),yes) +PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\ + characteristics.o cldj.o chkio.o corr_prep.o corr_routines.o\ + correlations.o database.o dcdflib.o dispersion.o\ + fastffts.o fftcalls.o fminbr.o fold.o fresnl.o ioinf.o\ + get_candidates.o iomak.o ipmpar.o maximize_r.o maximize_rz.o\ + median.o minifft.o misc_utils.o clipping.o\ + orbint.o output.o read_fft.o responses.o\ + rzinterp.o rzwinterp.o select.o sorter.o swapendian.o\ + transpose.o twopass.o twopass_real_fwd.o\ + twopass_real_inv.o vectors.o multifiles.o mask.o\ + fitsfile.o hget.o hput.o imio.o djcl.o accel_utils_gpu.o +endif +ifeq ($(use_cuda),no) PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\ characteristics.o cldj.o chkio.o corr_prep.o corr_routines.o\ correlations.o database.o dcdflib.o dispersion.o\ @@ -79,7 +153,8 @@ PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\ rzinterp.o rzwinterp.o select.o sorter.o swapendian.o\ transpose.o twopass.o twopass_real_fwd.o\ twopass_real_inv.o vectors.o multifiles.o mask.o\ - fitsfile.o hget.o hput.o imio.o djcl.o + fitsfile.o hget.o hput.o imio.o djcl.o +endif INSTRUMENTOBJS = backend_common.o zerodm.o sigproc_fb.o psrfits.o @@ -112,6 +187,14 @@ indent: prep: touch *_cmd.c +ifeq ($(use_cuda),yes) +accel_utils_gpu.o: accel_utils_gpu.cu + $(NVCC) $(EXTRA_NVCCFLAGS) $(NVCCFLAGS) $(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $< +endif +ifeq ($(use_cuda),no) +$(info accel_utils_gpu.o will not be compiled) +endif + makewisdom: $(CC) $(CLINKFLAGS) -o $@ makewisdom.c $(FFTLINK) ./makewisdom @@ -138,6 +221,17 @@ libsla$(LIBSUFFIX): binaries: $(BINARIES) +ifeq ($(use_cuda),yes) +$(info link accelsearch with CUDA) +accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o accel_utils_gpu.o + $(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o accel_utils_gpu.o $(PRESTOLINK) -lglib-2.0 -lm $(CUDA_LINK_EXTRA_FLAGS) +endif +ifeq ($(use_cuda),no) +$(info link accelsearch without CUDA) +accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o + $(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm +endif + mpi: mpiprepsubband mpiprepsubband_utils.o: mpiprepsubband_utils.c @@ -149,8 +243,8 @@ mpiprepsubband.o: mpiprepsubband.c mpiprepsubband: mpiprepsubband_cmd.c mpiprepsubband_cmd.o mpiprepsubband_utils.o mpiprepsubband.o $(INSTRUMENTOBJS) mpicc $(CLINKFLAGS) -o $(PRESTO)/bin/$@ mpiprepsubband_cmd.o mpiprepsubband_utils.o mpiprepsubband.o $(INSTRUMENTOBJS) $(PRESTOLINK) -lcfitsio -lm -accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o - $(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm +#accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o +# $(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm bary: bary.o $(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ bary.o $(PRESTOLINK) -lm @@ -305,3 +399,4 @@ squeaky: cleaner cd $(PRESTO)/docs ; rm -f *# *~ cd $(PRESTO)/python ; rm -f *# *~ *.o *.pyc *.pyo cd $(PRESTO)/include ; rm -f *# *~ + From 4ed25456ae83c13e55ff71b159cca9a60fab50d9 Mon Sep 17 00:00:00 2001 From: jintaoluo Date: Fri, 23 May 2014 14:40:52 -0400 Subject: [PATCH 3/5] head file for accelsearch_on_gpu --- include/accel_utils_gpu.h | 70 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 include/accel_utils_gpu.h diff --git a/include/accel_utils_gpu.h b/include/accel_utils_gpu.h new file mode 100644 index 000000000..2553ef421 --- /dev/null +++ b/include/accel_utils_gpu.h @@ -0,0 +1,70 @@ +/* ACCEL_USELEN must be less than 65536 since we */ +/* use unsigned short ints to index our arrays... */ +/* #define ACCEL_USELEN 32160 */ +/* #define ACCEL_USELEN 16000 */ +#define ACCEL_USELEN 7560 +#define ACCEL_NUMBETWEEN 2 +/* Stepsize in Fourier Freq */ +#define ACCEL_DR 0.5 +/* Reciprocal of ACCEL_DR */ +#define ACCEL_RDR 2 +/* Stepsize in Fourier F-dot */ +#define ACCEL_DZ 2 +/* Reciprocal of ACCEL_DZ */ +#define ACCEL_RDZ 0.5 +/* Closest candidates we will accept as independent */ +#define ACCEL_CLOSEST_R 15.0 +/* Padding for .dat file reading so that we don't SEGFAULT */ +#define ACCEL_PADDING 2000 + + +#ifndef DBLCORRECT +#define DBLCORRECT 1e-14 +#endif + +#ifndef _ACCEL_UTILS_GPU_ +#define _ACCEL_UTILS_GPU_ + +typedef struct accel_cand_gpu{ + float pow; /*pow of selected candidate*/ + int nof_cand; /*number of candidates in sub_array/plane */ + int z_ind; /*z_index of the selected candidate*/ + int r_ind; /*r_index of the selected candidate*/ +} accel_cand_gpu; + +#ifndef _FCOMPLEX_DECLARED_ +typedef struct fcomplex { + float r, i; +} fcomplex; +#define _FCOMPLEX_DECLARED_ +#endif /* _FCOMPLEX_DECLARED_ */ + +typedef struct kernel{ + int z; /* The fourier f-dot of the kernel */ + int fftlen; /* Number of complex points in the kernel */ + int numgoodbins; /* The number of good points you can get back */ + int numbetween; /* Fourier freq resolution (2=interbin) */ + int kern_half_width; /* Half width (bins) of the raw kernel. */ + fcomplex *data; /* The FFTd kernel itself */ +} kernel; + +typedef struct subharminfo{ + int numharm; /* The number of sub-harmonics */ + int harmnum; /* The sub-harmonic number (fundamental = numharm) */ + int zmax; /* The maximum Fourier f-dot for this harmonic */ + int numkern; /* Number of kernels in the vector */ + kernel *kern; /* The kernels themselves */ + unsigned short *rinds; /* Table of indices for Fourier Freqs */ +} subharminfo; + + +/* Constants used in the correlation/convolution routines */ +typedef enum { + CONV, CORR, INPLACE_CONV, INPLACE_CORR +} presto_optype; + +typedef enum { + FFTDK, FFTD, FFTK, NOFFTS +} presto_ffts; + +#endif From 6a743bd8b420759c2044a1223f50a9182d63406d Mon Sep 17 00:00:00 2001 From: jintaoluo Date: Fri, 5 Sep 2014 15:42:37 -0400 Subject: [PATCH 4/5] fix python module presto add stdc++ and cuda options to the libpresto linking --- src/Makefile | 14 ++++++++++++-- src/accelsearch.c | 8 +------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/Makefile b/src/Makefile index 790fb6d2a..a75e5cde6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -19,7 +19,7 @@ endif #------------------------------------------------------------------------------------------ #determine if compile with CUDA -use_cuda = no# yes/no +use_cuda = yes# yes/no ifeq ($(use_cuda),yes) $(info Compile with CUDA) @@ -207,8 +207,18 @@ timetest: libpresto: libpresto$(LIBSUFFIX) +ifeq ($(use_cuda),yes) libpresto$(LIBSUFFIX): $(PRESTOOBJS) - $(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) + $(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) $(CUDA_LINK_EXTRA_FLAGS) -L $(SYSDIR)/lib/x86_64-linux-gnu -lstdc++ +endif + +ifeq ($(use_cuda),no) +libpresto$(LIBSUFFIX): $(PRESTOOBJS) + $(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) +endif + +#libpresto$(LIBSUFFIX): $(PRESTOOBJS) + #$(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) slalib: libsla$(LIBSUFFIX) cd slalib ; $(FC) -o sla_test sla_test.f -fno-second-underscore -L$(PRESTO)/lib -lsla diff --git a/src/accelsearch.c b/src/accelsearch.c index 4693bccfb..79cfd5897 100644 --- a/src/accelsearch.c +++ b/src/accelsearch.c @@ -196,11 +196,7 @@ if(cmd->cudaP == 1){/****************************************call GPU to run the //if( use_cuda_flag == 1 ){ printf("\nTry to use a GPU to run the search\n"); - - #ifdef USECUDA - printf( "\nUSECUDA\n" ); - #endif - + select_cuda_dev(cmd->cuda); //prepare d_data and d_result on GPU @@ -305,8 +301,6 @@ if(cmd->cudaP == 1){/****************************************call GPU to run the destroy_cuFFT_plans( subharminfs, obs.numharmstages ); //destroy cuFFT plans - printf("\ndebug:.......................\n"); - //free_subharminfos(obs.numharmstages, subharminfs); //from presto1 free_subharminfos(&obs, subharminfs); //from presto2 From d1bb087479e7e51c05ed067d650acc7c107fc831 Mon Sep 17 00:00:00 2001 From: jintaoluo Date: Thu, 25 Jun 2015 15:44:00 -0400 Subject: [PATCH 5/5] Apply a patch from Scott's code Fixed a bug where we could potentially read beyond the end of the FFT --- src/accel_utils.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/accel_utils.c b/src/accel_utils.c index eabf32e4e..4e0515e25 100644 --- a/src/accel_utils.c +++ b/src/accel_utils.c @@ -870,25 +870,35 @@ fcomplex *get_fourier_amplitudes(int lobin, int numbins, accelobs * obs) { if (obs->mmap_file || obs->dat_input) { fcomplex *tmpdata = gen_cvect(numbins); - int offset = 0; + size_t ii, offset = 0, firstbin, newnumbins; + fcomplex zeros = {0.0, 0.0}; + // zero-pad if we try to read before the beginning of the FFT if (lobin - obs->lobin < 0) { - fcomplex zeros = {0.0, 0.0}; - int ii; offset = abs(lobin - obs->lobin); for (ii = 0 ; ii < offset ; ii++) tmpdata[ii] = zeros; - } - memcpy(tmpdata + offset, - (fcomplex *) (obs->fft + (lobin - obs->lobin) + offset), - sizeof(fcomplex) * (numbins - offset)); + } + firstbin = (lobin - obs->lobin) + offset; + newnumbins = numbins - offset; + + // zero-pad if we try to read beyond the end of the FFT + if (firstbin + newnumbins > obs->numbins) { + size_t numpad = firstbin + newnumbins - obs->numbins; + newnumbins = newnumbins - numpad; + for (ii = numbins - numpad ; ii < numbins ; ii++) + tmpdata[ii] = zeros; + } + + // Now grab the data we need + memcpy(tmpdata + offset, obs->fft + firstbin, + sizeof(fcomplex) * newnumbins); return tmpdata; } else { return read_fcomplex_file(obs->fftfile, lobin - obs->lobin, numbins); } } - ffdotpows *subharm_ffdot_plane(int numharm, int harmnum, double fullrlo, double fullrhi, subharminfo * shi, accelobs * obs)