From ba73ae744687e21ad6f4d6c9dc938b780d03a778 Mon Sep 17 00:00:00 2001
From: jintaoluo <luojintao@cv-nat-a-56.cv.nrao.edu>
Date: Fri, 16 May 2014 18:06:03 -0400
Subject: [PATCH 1/5] GPUed accelseach based on PRESTO2

---
 include/accelsearch_cmd.h |   5 +
 src/accel_utils.c         | 332 ++++++++++++++++-
 src/accel_utils_gpu.cu    | 762 ++++++++++++++++++++++++++++++++++++++
 src/accelsearch.c         | 309 +++++++++++++++-
 src/accelsearch_cmd.c     |  16 +
 src/corr_routines.c       | 301 +++++++++++++++
 6 files changed, 1723 insertions(+), 2 deletions(-)
 create mode 100644 src/accel_utils_gpu.cu

diff --git a/include/accelsearch_cmd.h b/include/accelsearch_cmd.h
index 0897fc7a5..5df8c554a 100644
--- a/include/accelsearch_cmd.h
+++ b/include/accelsearch_cmd.h
@@ -72,6 +72,11 @@ typedef struct s_Cmdline {
   /*@null*/char **argv;
   /***** the whole command line concatenated */
   char *full_cmd_line;
+  /***** cuda device parameters */
+  #ifdef USECUDA
+  char cudaP;
+  char cuda;  
+  #endif
 } Cmdline;
 
 
diff --git a/src/accel_utils.c b/src/accel_utils.c
index eebdbd42b..eabf32e4e 100644
--- a/src/accel_utils.c
+++ b/src/accel_utils.c
@@ -22,6 +22,44 @@
 /* Return 2**n */
 #define index_to_twon(n) (1<<n)
 
+#ifdef USECUDA
+typedef struct accel_cand_gpu{
+	float pow;					/*pow of selected candidate*/
+	int		nof_cand;			/*number of candidates in sub_array/plane */
+	int		z_ind;				/*z_index of the selected candidate*/
+	int		r_ind;				/*r_index of the selected candidate*/
+}	accel_cand_gpu;
+#endif
+
+#ifdef USECUDA
+extern GSList *search_ffdotpows_sort_gpu_result(ffdotpows * ffdot, int numharm,
+                         accelobs * obs, GSList * cands, accel_cand_gpu *cand_gpu_cpu, int nof_cand, int numzs, int numrs);
+#endif
+
+#ifdef USECUDA
+extern int corr_complex_gpu(fcomplex * data, int numdata, presto_datainf datainf,
+                 fcomplex * kernel_vect_on_gpu, int numkern, presto_datainf kerninf,
+                 fcomplex ** result, int numresult, int lobin,
+                 int numbetween, int kern_half_width, presto_optype optype,
+                 int numkern_in_array, int stage, int harmtosum, int harmnum,
+                 int ** offset_array, fcomplex *d_data, fcomplex *d_result,
+                 unsigned short *zinds, unsigned short *rinds,
+                 int numzs_full, int numrs_full,
+                 float *d_fundamental,
+                 unsigned short *d_zinds, unsigned short *d_rinds,
+                 double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo
+                 );
+#endif
+
+#ifdef USECUDA
+extern ffdotpows *init_fundamental_ffdot(int numharm, int harmnum,
+                               double fullrlo, double fullrhi,
+                               subharminfo * shi, accelobs * obs,
+                               int stage, int **offset_array,
+                               fcomplex *kernel_vect_on_gpu, fcomplex *d_data, fcomplex *d_result
+                               );
+#endif
+
 /* Return x such that 2**x = n */
 static inline int twon_to_index(int n)
 {
@@ -978,6 +1016,235 @@ ffdotpows *subharm_ffdot_plane(int numharm, int harmnum,
    return ffdot;
 }
 
+//-----------------------------------------------------------------------------------------
+#ifdef USECUDA
+void subharm_ffdot_plane_gpu(int numharm, int harmnum,
+                               double fullrlo, double fullrhi,
+                               subharminfo * shi, accelobs * obs,
+                               int stage, int **offset_array,
+                               fcomplex *kernel_vect_on_gpu, fcomplex *d_data, fcomplex *d_result,
+                               ffdotpows *fundamental,
+                               float *d_fundamental,
+                               unsigned short *d_zinds, unsigned short *d_rinds
+                               )
+{
+   int ii, lobin, hibin, numdata, nice_numdata, nrs, fftlen, binoffset;
+   static int numrs_full = 0, numzs_full = 0;
+   float powargr, powargi;
+   double drlo, drhi, harm_fract;
+   ffdotpows *ffdot;
+   fcomplex *data, **result;
+   presto_datainf datainf;
+
+	/* numrs, numzs, rlo, zlo , rinds*/
+	int numrs, numzs, rlo, zlo;
+	unsigned short *rinds ;
+
+   if (numrs_full == 0) {
+      if (numharm == 1 && harmnum == 1) {
+         numrs_full = ACCEL_USELEN;
+         numzs_full = shi->numkern;
+      } else {
+         printf("You must call subharm_ffdot_plane() with numharm=1 and\n");
+         printf("harnum=1 before you use other values!  Exiting.\n\n");
+         exit(0);
+      }
+   }
+
+   harm_fract = (double) harmnum / (double) numharm;
+   drlo = calc_required_r(harm_fract, fullrlo);
+   drhi = calc_required_r(harm_fract, fullrhi);
+   rlo = (int) floor(drlo) ;
+   zlo = calc_required_z(harm_fract, obs->zlo) ;
+
+	if(numharm==1 && harmnum==1){
+		fundamental->rlo = rlo;
+		fundamental->zlo = zlo;
+	}
+
+
+	/*Initialize the z-lookup indices*/
+		//printf("\ndebug, accel_utils.c line 1034, numzs_full= %d, numrs_full=%d, obs.zlo=%f, obs.zhi=%f\n", numzs_full, numrs_full, obs->zlo, obs->zhi);
+		unsigned short *zinds ;
+		zinds = (unsigned short *)malloc(numzs_full * sizeof(unsigned short));
+		{
+		if (numharm > 1) {
+			int zz , subz, zind;
+			for (ii = 0; ii < numzs_full; ii++) {			
+				zz = obs->zlo + ii * ACCEL_DZ;//ACCEL_DZ=2
+				subz = calc_required_z(harm_fract, zz);				
+  	    zind = index_from_z(subz, zlo);
+  	    zinds[ii]=zind;  	    
+			}
+		}
+		}
+
+	
+
+   /* Initialize the r-lookup indices */
+   if (numharm > 1) {
+      double rr, subr;
+      for (ii = 0; ii < numrs_full; ii++) {
+         rr = fullrlo + ii * ACCEL_DR;
+         subr = calc_required_r(harm_fract, rr);
+         shi->rinds[ii] = index_from_r(subr, rlo);
+      }
+   }
+   rinds = shi->rinds;
+   
+   numrs = (int) ((ceil(drhi) - floor(drlo))
+                         * ACCEL_RDR + DBLCORRECT) + 1;
+   if (numharm == 1 && harmnum == 1) {
+      numrs = ACCEL_USELEN;
+   } else {
+      if (numrs % ACCEL_RDR) {
+         numrs = (numrs / ACCEL_RDR + 1) * ACCEL_RDR;
+      }
+   }
+   
+   
+   numzs = shi->numkern ;
+
+   binoffset = shi->kern[0].kern_half_width;
+   fftlen = shi->kern[0].fftlen;
+   lobin = rlo - binoffset;
+   hibin = (int) ceil(drhi) + binoffset;
+   numdata = hibin - lobin + 1;
+   nice_numdata = next2_to_n(numdata);  // for FFTs
+   data = get_fourier_amplitudes(lobin, nice_numdata, obs); //lobin, nice_numdata, obs->lobin
+
+   
+   if (!obs->mmap_file && !obs->dat_input && 0)
+       printf("This is newly malloc'd!\n");
+   // Normalize the Fourier amplitudes
+   if (obs->nph > 0.0) {
+       //  Use freq 0 normalization if requested (i.e. photons)
+       printf("\nDebug, accel_utils.c line 1073: Use freq 0 normalization\n");
+       double norm = 1.0 / sqrt(obs->nph);
+       for (ii = 0; ii < numdata; ii++) {
+           data[ii].r *= norm;
+           data[ii].i *= norm;
+       }
+   } else if (obs->norm_type == 0) {
+       //  old-style block median normalization
+       float *powers;
+       double norm;
+
+       powers = gen_fvect(numdata);
+       for (ii = 0; ii < numdata; ii++)
+           powers[ii] = POWER(data[ii].r, data[ii].i);
+       norm = 1.0 / sqrt(median(powers, numdata)/log(2.0));
+       vect_free(powers);
+       for (ii = 0; ii < numdata; ii++) {
+           data[ii].r *= norm;
+           data[ii].i *= norm;
+       }
+   } else {
+       //  new-style running double-tophat local-power normalization
+       float *powers, *loc_powers;
+
+       powers = gen_fvect(nice_numdata);
+       for (ii = 0; ii < nice_numdata; ii++) {
+           powers[ii] = POWER(data[ii].r, data[ii].i);
+       }
+       loc_powers = corr_loc_pow(powers, nice_numdata);
+       for (ii = 0; ii < numdata; ii++) {
+           float norm = invsqrt(loc_powers[ii]);
+           data[ii].r *= norm;
+           data[ii].i *= norm;
+       }
+       vect_free(powers);
+       vect_free(loc_powers);
+   }
+
+   /* Perform the correlations */
+
+   datainf = RAW;   
+   nrs = corr_complex_gpu(data, numdata, datainf,
+				                 kernel_vect_on_gpu, fftlen, FFT,
+				                 result, numrs, binoffset,
+					               ACCEL_NUMBETWEEN, binoffset, CORR,
+				                 numzs, stage, numharm, harmnum,
+				                 offset_array, d_data, d_result, 
+				                 zinds, rinds, numzs_full, numrs_full,
+				                 d_fundamental,
+				                 d_zinds, d_rinds,
+				                 obs->zlo, fullrlo, harm_fract, zlo, rlo);
+
+   // Always free data
+   vect_free(data);
+   
+   free(zinds);
+}
+#endif
+//-----------------------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------------------
+//Initialize the fundamental ffdotpows
+#ifdef USECUDA
+ffdotpows *init_fundamental_ffdot(int numharm, int harmnum,
+                               double fullrlo, double fullrhi,
+                               subharminfo * shi, accelobs * obs,
+                               int stage, int **offset_array,
+                               fcomplex *kernel_vect_on_gpu, fcomplex *d_data, fcomplex *d_result
+                               )
+{
+   static int numrs_full = 0, numzs_full = 0;
+   float powargr, powargi;
+   double drlo, drhi, harm_fract;
+   ffdotpows *ffdot;   
+
+	/* numrs, numzs, rlo, zlo , rinds*/
+	int numrs, numzs, rlo, zlo;
+	unsigned short *rinds ;
+
+   if (numrs_full == 0) {
+      if (numharm == 1 && harmnum == 1) {
+         numrs_full = ACCEL_USELEN;
+         numzs_full = shi->numkern;
+      } else {
+         printf("You must call subharm_ffdot_plane() with numharm=1 and\n");
+         printf("harnum=1 before you use other values!  Exiting.\n\n");
+         exit(0);
+      }
+   }
+   ffdot = (ffdotpows *) malloc(sizeof(ffdotpows));
+
+   /* Calculate and get the required amplitudes */
+
+   harm_fract = (double) harmnum / (double) numharm;
+   drlo = calc_required_r(harm_fract, fullrlo);
+   drhi = calc_required_r(harm_fract, fullrhi);
+   rlo = (int) floor(drlo) ;
+   ffdot->rlo = rlo;
+   zlo = calc_required_z(harm_fract, obs->zlo) ;
+   ffdot->zlo = zlo;
+   
+   numrs = (int) ((ceil(drhi) - floor(drlo))
+                         * ACCEL_RDR + DBLCORRECT) + 1;
+   ffdot-> numrs = numrs;
+   if (numharm == 1 && harmnum == 1) {
+      numrs = ACCEL_USELEN;
+   } else {
+      if (numrs % ACCEL_RDR) {
+         numrs = (numrs / ACCEL_RDR + 1) * ACCEL_RDR;
+      }
+   }
+   
+	 ffdot-> numrs = numrs;
+   
+   numzs = shi->numkern ;
+   ffdot->numzs = numzs;     
+   
+   if (!obs->mmap_file && !obs->dat_input && 0)
+       printf("This is newly malloc'd!\n");
+       
+   ffdot->powers = gen_fmatrix(numzs, numrs);
+   
+   return ffdot;
+}
+#endif
+//-----------------------------------------------------------------------------------------
 
 ffdotpows *copy_ffdotpows(ffdotpows * orig)
 {
@@ -1172,6 +1439,51 @@ GSList *search_ffdotpows(ffdotpows * ffdot, int numharm,
    return cands;
 }
 
+#ifdef USECUDA
+GSList *search_ffdotpows_sort_gpu_result(ffdotpows * ffdot, int numharm,
+                         accelobs * obs, GSList * cands, accel_cand_gpu *cand_gpu_cpu, int nof_cand, int numzs, int numrs)
+{
+   int ii, jj;
+   float powcut;
+   long long numindep;
+
+   powcut = obs->powcut[twon_to_index(numharm)];
+   numindep = obs->numindep[twon_to_index(numharm)];	
+   
+   float pow, sig;
+   double rr, zz;
+   int added = 0;   
+   
+   int rind, zind;
+   
+   if(nof_cand > 0)
+   {
+   		for(ii=0; ii< nof_cand; ii++)
+   		{   			
+   			added = 0;      			
+   			
+   			pow = cand_gpu_cpu[ii].pow; 
+   			sig = candidate_sigma(pow, numharm, numindep);   						
+   			
+   			zind = cand_gpu_cpu[ii].z_ind;
+   			rind = cand_gpu_cpu[ii].r_ind;     			
+
+   			rr = (ffdot->rlo + rind * (double) ACCEL_DR) / (double) numharm;   			
+
+   			zz = (ffdot->zlo + zind * (double) ACCEL_DZ) / (double) numharm;
+   			
+   			cands = insert_new_accelcand(cands, pow, sig, numharm, rr, zz, &added);
+   			if (added && !obs->dat_input)
+               fprintf(obs->workfile,
+                       "%-7.2f  %-7.4f  %-2d  %-14.4f  %-14.9f  %-10.4f\n",
+                       pow, sig, numharm, rr, rr / obs->T, zz);
+   		}  		   		
+   }
+   
+   return cands;
+}
+#endif
+
 void deredden(fcomplex * fft, int numamps)
 /* Attempt to remove rednoise from a time series by using   */
 /* a median-filter of logarithmically increasing width.     */
@@ -1518,10 +1830,28 @@ void create_accelobs(accelobs * obs, infodata * idata, Cmdline * cmd, int usemma
        memuse = sizeof(float) * (obs->highestbin + ACCEL_USELEN) \
            * obs->numbetween * obs->numz;
        printf("Full f-fdot plane would need %.2f GB: ", (float)memuse / gb);
-       if (memuse < MAXRAMUSE || cmd->inmemP) {
+       if (memuse < MAXRAMUSE || cmd->inmemP) {       		
+           #ifdef USECUDA
+           if(cmd->cudaP == 1)
+           {
+           		printf("using in-memory accelsearch is possible, but the on GPU in-memory is not ready yet.\n\n");
+           		printf("so on GPU, using standard accelsearch.\n\n");
+	           obs->inmem = 0;
+	           obs->ffdotplane = NULL;
+           }                
+           else           
+           {
            printf("using in-memory accelsearch.\n\n");
            obs->inmem = 1;
            obs->ffdotplane = gen_fvect(memuse / sizeof(float));
+           }
+           #else
+           {
+           printf("using in-memory accelsearch.\n\n");
+           obs->inmem = 1;
+           obs->ffdotplane = gen_fvect(memuse / sizeof(float));
+           }
+           #endif
        } else {
            printf("using standard accelsearch.\n\n");
            obs->inmem = 0;
diff --git a/src/accel_utils_gpu.cu b/src/accel_utils_gpu.cu
new file mode 100644
index 000000000..1cbdd2ad3
--- /dev/null
+++ b/src/accel_utils_gpu.cu
@@ -0,0 +1,762 @@
+//GPU functions for accelsearch
+//by Jintao Luo, NRAO
+
+// includes, CUDA
+#include <cuda_runtime.h>
+#include <cufft.h>
+#include <helper_functions.h>
+#include <helper_cuda.h>
+#include <stdio.h>
+
+#include "device_functions.h"
+
+#include "accel_utils_gpu.h"
+
+typedef float2 Complex;
+
+//define a texture memory
+texture<Complex> tex_d_kernel;
+texture<Complex> tex_d_data;
+
+texture<Complex> tex_d_result;
+
+texture<Complex, 2> tex_d_result_2D;
+cudaChannelFormatDesc channelDesc ;
+
+texture<unsigned short> tex_d_zinds;
+texture<unsigned short> tex_d_rinds;
+
+texture<float> tex_d_fundamental;
+
+static __device__ __host__ inline fcomplex ComplexScale(fcomplex, float);
+static __device__ __host__ inline fcomplex ComplexMul(fcomplex, fcomplex);
+static __device__ __host__ inline fcomplex ComplexMul_02(fcomplex, fcomplex);
+
+static __global__ void ComplexPointwiseMulAndScale_one_loop(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset);
+
+static __global__ void ComplexPointwiseMulAndScale_one_loop_02(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset);
+
+static __global__ void Complex_Pow_and_Chop(float *d_pow, fcomplex *d_result, int fftlen, int numkern_in_array, int chopbins, int numtocopy);
+
+static __global__ void add_ffdotpows_on_gpu(float *d_fundamental, fcomplex *d_result, int numzs_full, int numrs_full, unsigned short *zinds, unsigned short *rinds, int fftlen, int chopbins, double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo);
+
+
+extern "C"
+extern int  search_ffdotpows_gpu(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, accel_cand_gpu * cand_array_sort_gpu, int numzs, int numrs, accel_cand_gpu *cand_gpu_cpu);
+
+static __global__ void  search_ffdotpows_kernel(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, int numzs, int numrs, int *d_addr_jluo);
+
+extern "C"
+fcomplex * prep_data_on_gpu(subharminfo **subharminfs, int numharmstages);
+
+extern "C"
+fcomplex * prep_result_on_gpu(subharminfo **subharminfs, int numharmstages);
+
+extern "C"
+fcomplex * cp_kernel_array_to_gpu(subharminfo **subharminfs, int numharmstages, int **offset_array);
+
+extern "C"
+fcomplex * cp_input_to_gpu(fcomplex *input_vect_on_cpu, long long numbins, long long N);
+
+extern "C"
+unsigned short *prep_rz_inds_on_gpu(int size_inds);
+
+extern "C"
+float * prep_float_vect_on_gpu( int size);
+
+extern "C"
+accel_cand_gpu *prep_cand_array(int size);
+
+extern "C"
+void cudaFree_kernel_vect(fcomplex *in);
+
+extern "C"
+void select_cuda_dev(int cuda_inds);
+
+extern "C"
+void complex_corr_conv_gpu(fcomplex * data, fcomplex * kernel_vect_on_gpu,
+                       int numdata, 
+                       int numkern_in_array, 
+                       int stage, int harmtosum, int harmnum, 
+                       int ** offset_array, fcomplex *d_data, fcomplex *d_result,
+                       int chopbins, int numtocopy,
+                       unsigned short *zinds, unsigned short *rinds,
+                       int numzs_full, int numrs_full,
+                       float *d_fundamental,
+                       unsigned short *d_zinds, unsigned short *d_rinds,
+                       int datainf_flag,
+                       presto_ffts ffts, presto_optype type,
+                       double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo);
+
+
+extern "C"
+void init_cuFFT_plans(subharminfo **subharminfs, int numharmstages);
+
+extern "C"
+void destroy_cuFFT_plans(subharminfo **subharminfs, int numharmstages);
+
+cufftHandle plan_data_array[16][16];		
+cufftHandle plan_result_array[16][16];		
+
+/******************************************** complex_corr_conv ********************************************************************/
+void complex_corr_conv_gpu(fcomplex * data, fcomplex * kernel_vect_on_gpu,
+                       int numdata, 
+                       int numkern_in_array, 
+                       int stage, int harmtosum, int harmnum, 
+                       int ** offset_array, fcomplex *d_data, fcomplex *d_result,
+                       int chopbins, int numtocopy,
+                       unsigned short *zinds, unsigned short *rinds,
+                       int numzs_full, int numrs_full,
+                       float *d_fundamental,
+                       unsigned short *d_zinds, unsigned short *d_rinds,
+                       int datainf_flag,
+                       presto_ffts ffts, presto_optype type,
+                       double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo)
+{
+
+
+	int fftlen = numdata;
+	int kernel_array_offset;
+
+   if (ffts > 3) {
+      printf("\nIllegal 'ffts' option (%d) in complex_corr_conv().\n", ffts);
+      printf("Exiting.\n\n");
+      exit(1);
+   }
+   if (type > 3) {
+      printf("\nIllegal 'type' option (%d) in complex_corr_conv().\n", type);
+      printf("Exiting.\n\n");
+      exit(1);
+   }
+	
+	if(harmtosum==1 && harmnum==1){
+		kernel_array_offset = offset_array[0][0];
+	}
+	if(harmtosum > 1){
+    kernel_array_offset = offset_array[stage][harmnum-1];
+	}
+	 
+	//copy data to GPU memory
+	checkCudaErrors(cudaMemcpy(d_data, data, sizeof(fcomplex) * fftlen, cudaMemcpyHostToDevice));
+	//FFT data on GPU
+	if(datainf_flag == 1)
+	{					
+		if(harmtosum==1 && harmnum==1){
+			checkCudaErrors(cufftExecC2C(plan_data_array[0][0], (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD));
+		}
+		if(harmtosum > 1){
+  	  checkCudaErrors(cufftExecC2C(plan_data_array[stage][harmnum-1], (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD));
+		}
+
+	}
+
+	//Bind data and kernel to Texture Memory
+	cudaBindTexture(NULL, tex_d_kernel, kernel_vect_on_gpu, sizeof(fcomplex) * ( kernel_array_offset + 1 + numkern_in_array * fftlen ) );
+	cudaBindTexture(NULL, tex_d_data, d_data, sizeof(fcomplex) * fftlen);
+	//Mul the FFTed data with Kernels
+	if (type == CORR || type == INPLACE_CORR) {
+		ComplexPointwiseMulAndScale_one_loop<<<512, 512>>>(d_result, d_data, kernel_vect_on_gpu, numkern_in_array, fftlen, 1.0/fftlen, kernel_array_offset);
+	}
+	else {
+		ComplexPointwiseMulAndScale_one_loop_02<<<512, 512>>>(d_result, d_data, kernel_vect_on_gpu, numkern_in_array, fftlen, 1.0/fftlen, kernel_array_offset);
+	}	
+	//unbind the data and kenerl from Texture memory
+	cudaUnbindTexture(tex_d_kernel);
+	cudaUnbindTexture(tex_d_data);  	
+
+   
+  //Inverse FFT   
+	if(harmtosum==1 && harmnum==1){
+		checkCudaErrors(cufftExecC2C(plan_result_array[0][0], (cufftComplex *)d_result, (cufftComplex *)d_result, CUFFT_INVERSE));	
+	}
+	if(harmtosum > 1){
+ 	  checkCudaErrors(cufftExecC2C(plan_result_array[stage][harmnum-1], (cufftComplex *)d_result, (cufftComplex *)d_result, CUFFT_INVERSE));	
+	} 
+
+	//bind the FFTed result
+		//cudaBindTexture(NULL, tex_d_result, d_result, sizeof(fcomplex) * fftlen * numkern_in_array );	
+		//bind d_result to its 2D texture memory
+		checkCudaErrors(cudaBindTexture2D(NULL, tex_d_result_2D, d_result, channelDesc, fftlen, numkern_in_array, sizeof(fcomplex) * fftlen ));
+	//sum harmonics
+	if(harmtosum==1 && harmnum==1){//if fundamental
+		Complex_Pow_and_Chop<<<512, 512>>>(d_fundamental, d_result, fftlen, numkern_in_array, chopbins, numtocopy);
+	}
+	if(harmtosum > 1){	//if harmonics
+		//move zinds and rinds to GPU
+		checkCudaErrors(cudaMemcpy(d_zinds, zinds, sizeof(unsigned short) * numzs_full, cudaMemcpyHostToDevice));
+		checkCudaErrors(cudaMemcpy(d_rinds, rinds, sizeof(unsigned short) * numrs_full, cudaMemcpyHostToDevice));		
+		//bind zinds and rinds to Texture Memory
+		cudaBindTexture(NULL, tex_d_zinds, d_zinds, sizeof(unsigned short) * numzs_full );
+		cudaBindTexture(NULL, tex_d_rinds, d_rinds, sizeof(unsigned short) * numrs_full );		
+		//add_ffdotpows_on_gpu
+		add_ffdotpows_on_gpu<<<512, 512>>>(d_fundamental, d_result, numzs_full, numrs_full, d_zinds, d_rinds, fftlen, chopbins, obs_zlo, fullrlo, harm_fract, zlo, rlo);
+		//Unbind zinds and rinds from Texture Memory		
+		cudaUnbindTexture(tex_d_zinds);    
+		cudaUnbindTexture(tex_d_rinds);    
+	}
+ 
+	//cudaUnbindTexture(tex_d_result);
+	checkCudaErrors(cudaUnbindTexture(tex_d_result_2D));
+  
+}                   
+
+/******************************************** add fftdot pows on GPU, choping included ************************************************/
+static __global__ void add_ffdotpows_on_gpu(float *d_fundamental, fcomplex *d_result, int numzs_full, int numrs_full, unsigned short *zinds, unsigned short *rinds, int fftlen, int chopbins, double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo)
+{
+
+    const int numThreads = blockDim.x * gridDim.x;
+    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;		
+    
+    //int addr_z, addr_r, addr_result, addr_fundamental ;
+    int addr_z, addr_r ;				
+    int addr_result ;
+		
+		Complex buf;
+		
+		int z_index, r_index;
+		
+		for (int i = threadID; i < numrs_full * numzs_full; i += numThreads){    		
+						
+				z_index = i/numrs_full ;
+				addr_z = tex1Dfetch(tex_d_zinds, z_index) ;	
+				
+				r_index = i -  z_index * numrs_full ;				
+				addr_r = tex1Dfetch(tex_d_rinds, r_index) ;
+				
+				//addr_result = addr_z * fftlen + chopbins + addr_r ;
+				//buf = tex1Dfetch(tex_d_result, addr_result) ;    		
+				
+				buf = tex2D(tex_d_result_2D, chopbins + addr_r, addr_z);
+				
+				d_fundamental[i] += buf.x * buf.x + buf.y * buf.y ;
+				
+		}
+
+}
+
+
+
+
+/********************************************* Calculate complex ffdot pows, choping included *****************************************/
+static __global__ void Complex_Pow_and_Chop(float *d_pow, fcomplex *d_result, int fftlen, int numkern_in_array, int chopbins, int numtocopy)
+/*
+	calaulate POW of data in d_result
+	d_pw:				where the result stored
+	d_result:		the array contains data to Pow_and_chop
+	fftlen:			length of FFT used in the complex_corr_conv_gpu
+	numkern_in_array:	num of kernels in array in complex_corr_conv_gpu
+	chopbins:		num of bins that should be discarded in the head and tail of the result vectors
+	numtocopy:	num of data should be kept in each vector of d_result, after the first chopbins points	
+*/
+{
+    const int numThreads = blockDim.x * gridDim.x;
+    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;		
+    
+    int addr_result;
+    
+    Complex buf;
+    
+    int z_ind, r_ind ;
+    
+    for (int i = threadID; i < numkern_in_array*numtocopy; i += numThreads){
+    	
+    	z_ind = i/numtocopy;
+    	r_ind = i - z_ind * numtocopy;
+    	
+    	//addr_result = z_ind * fftlen + chopbins +  r_ind;
+    	
+    	//buf = tex1Dfetch(tex_d_result, addr_result) ;
+    
+    	buf = tex2D(tex_d_result_2D, chopbins + r_ind, z_ind);
+    
+    	d_pow[i] = buf.x * buf.x + buf.y * buf.y ;
+    }   
+
+}
+
+
+/**********************************************************************************************************************************************************/
+//complex operations
+
+// Complex scale
+static __device__ __host__ inline fcomplex ComplexScale(fcomplex a, float s)
+{
+    fcomplex c;
+    c.r = s * a.r;
+    c.i = s * a.i;
+    return c;
+}
+
+// Complex multiplication
+static __device__ __host__ inline fcomplex ComplexMul(fcomplex a, fcomplex b)
+{
+    fcomplex c;
+    c.r = a.r * b.r + a.i * b.i;
+    c.i = a.i * b.r - a.r * b.i;
+    return c;
+}
+
+static __device__ __host__ inline fcomplex ComplexMul_02(fcomplex a, fcomplex b)
+{
+    fcomplex c;
+    c.r = a.r * b.r - a.i * b.i;
+    c.i = a.i * b.r + a.r * b.i;
+    return c;
+}
+
+
+//Use one loop to realize Complex Point Mul and Scale 
+static __global__ void ComplexPointwiseMulAndScale_one_loop(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset)
+{
+    const int numThreads = blockDim.x * gridDim.x;
+    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;		
+		
+		fcomplex a_buf ;
+		fcomplex b_buf ;
+		Complex buf_buf_a, buf_buf_b;
+		int i;
+		
+		int a_index, b_index;
+		
+		int total_num = data_size*numkern_in_array;
+		
+		for (i = threadID; i < total_num; i += numThreads)
+		{
+				a_index = i/data_size;
+				a_index = i - a_index * data_size ;
+				buf_buf_a = tex1Dfetch(tex_d_data, a_index);
+    		a_buf.r = buf_buf_a.x;
+    		a_buf.i = buf_buf_a.y;
+    		
+    		b_index = i + kernel_array_offset;
+    		buf_buf_b = tex1Dfetch(tex_d_kernel, b_index) ;
+				
+				b_buf.r = buf_buf_b.x;
+	    	b_buf.i = buf_buf_b.y;
+        c[i] = ComplexScale(ComplexMul(a_buf, b_buf), scale);                                
+        	
+        
+		}
+
+}
+
+static __global__ void ComplexPointwiseMulAndScale_one_loop_02(fcomplex *c, fcomplex *a, const fcomplex *b, int numkern_in_array, int data_size, float scale, int kernel_array_offset)
+{
+    const int numThreads = blockDim.x * gridDim.x;
+    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;		
+		
+		fcomplex a_buf ;
+		fcomplex b_buf ;
+		Complex buf_buf_a, buf_buf_b;
+		int i;
+
+		int a_index, b_index;
+		
+		int total_num = data_size*numkern_in_array;
+		
+		for (i = threadID; i < total_num; i += numThreads)
+		{
+				a_index = i/data_size;
+				a_index = i - a_index * data_size ;
+				buf_buf_a = tex1Dfetch(tex_d_data, a_index);
+    		a_buf.r = buf_buf_a.x;
+    		a_buf.i = buf_buf_a.y;
+    		
+    		b_index = i + kernel_array_offset;
+    		buf_buf_b = tex1Dfetch(tex_d_kernel, b_index) ;
+				
+				b_buf.r = buf_buf_b.x;
+	    	b_buf.i = buf_buf_b.y;
+        c[i] = ComplexScale(ComplexMul_02(a_buf, b_buf), scale);                                
+        	
+        
+		}		
+
+}
+/**********************************************************************************************************************************************************/
+
+//-------------------------Prepare search_ffdotpows cand array
+accel_cand_gpu *prep_cand_array(int size)
+{
+	
+	accel_cand_gpu * cand_on_gpu;
+	checkCudaErrors(cudaMalloc((void **)&cand_on_gpu, size * sizeof(accel_cand_gpu)));
+	
+	return cand_on_gpu ;
+	
+}
+
+int  search_ffdotpows_gpu(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, accel_cand_gpu * cand_array_sort_gpu, int numzs, int numrs, accel_cand_gpu *cand_gpu_cpu)
+{
+	
+	int *d_addr;
+	int h_addr;
+	
+	checkCudaErrors(cudaMalloc((void **)&d_addr, sizeof(int) * 1));	
+	checkCudaErrors(cudaMemset(d_addr, 0, sizeof(int))); // set d_addr to 0
+
+	//bind d_fundamental to texture
+	cudaBindTexture(NULL, tex_d_fundamental, d_fundamental, sizeof(float) * numzs * numrs );
+
+	//search ffdot_pow
+	search_ffdotpows_kernel<<<512, 512>>>(powcut, d_fundamental, cand_array_search_gpu, numzs, numrs, d_addr);
+
+	//get nof_cand
+	checkCudaErrors(cudaMemcpy(&h_addr, d_addr, sizeof(int) * 1, cudaMemcpyDeviceToHost));	
+	int nof_cand ;		
+	nof_cand = h_addr;		
+
+	//get the candicates
+	checkCudaErrors(cudaMemcpy(cand_gpu_cpu, cand_array_search_gpu, sizeof(accel_cand_gpu) * nof_cand, cudaMemcpyDeviceToHost));
+	
+	cudaFree(d_addr);
+	
+	cudaUnbindTexture(tex_d_fundamental);  
+	
+	return nof_cand ;
+}
+
+static __global__ void  search_ffdotpows_kernel(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, int numzs, int numrs, int *d_addr_jluo)
+{
+
+    const int numThreads = blockDim.x * gridDim.x;
+    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;		
+
+		int i ;		
+		int nof_cand = 0;
+
+		float pow ;
+				
+		int addr_search=0;	
+
+		accel_cand_gpu cand_tmp ;
+		
+		int z_ind, r_ind;
+
+		for (i = threadID; i < numzs*numrs; i += numThreads)
+    {    	
+    
+    	nof_cand = 0;   	    	
+
+   		pow = tex1Dfetch(tex_d_fundamental, i);
+
+    		if(pow > powcut)
+    		{    			
+    			cand_tmp.pow = pow ;
+    			nof_cand += 1 ;
+    			
+    			cand_tmp.nof_cand = nof_cand ;
+    			
+    			z_ind = (int)(i/numrs);
+    			cand_tmp.z_ind = z_ind;
+    			
+    			r_ind = i - z_ind * numrs ;
+    			cand_tmp.r_ind = r_ind;  			
+
+    			addr_search = atomicAdd(&d_addr_jluo[0], 1);    			
+    			
+    			cand_array_search_gpu[ addr_search ] = cand_tmp ;    			   			    			
+    		}
+    }
+}
+
+//-------------------------Prepare vectors for zinds and rinds
+unsigned short *prep_rz_inds_on_gpu(int size_inds)
+{
+
+	unsigned short *inds;
+	checkCudaErrors(cudaMalloc((void **)&inds, size_inds * sizeof(unsigned short)));
+
+	return inds;
+
+}
+
+//-------------------------Prepare a float-point vectors 
+float * prep_float_vect_on_gpu( int size)
+{
+
+	float * f_vect;
+	checkCudaErrors(cudaMalloc((void **)&f_vect, size * sizeof(float)));
+	
+	return f_vect;
+
+}
+
+//----------Copy input data to GPU---------------------------------
+fcomplex * cp_input_to_gpu(fcomplex *input_vect_on_cpu, long long numbins, long long N)
+{
+	fcomplex *input_vect_on_gpu;		
+	
+  checkCudaErrors(cudaMalloc((void **)&input_vect_on_gpu, sizeof(fcomplex) * numbins));       		
+	
+	//check some parameters
+	printf("\nNum of data points: %d\n", N);
+	printf("\nNum of bins : %d\n", numbins);	
+	
+	checkCudaErrors(cudaMemcpy( input_vect_on_gpu, input_vect_on_cpu, sizeof(fcomplex) * numbins, cudaMemcpyHostToDevice) );
+	
+	return input_vect_on_gpu;
+}
+
+//----------Prepare d_data on GPU---------------------
+fcomplex * prep_data_on_gpu(subharminfo **subharminfs, int numharmstages)
+//cudaMalloc d_data for the whole search process
+{
+
+	int harm, harmtosum, stage;		
+	int fftlen;
+	int size_d_data;
+	
+	fcomplex *d_data;
+
+	size_d_data = 0;
+		
+	fftlen = subharminfs[0][0].kern[0].fftlen;
+	
+	size_d_data = fftlen ;
+
+if (numharmstages > 1) {
+
+	for(stage=1; stage<numharmstages; stage++){	
+		harmtosum = 1 << stage;		
+		for (harm = 1; harm < harmtosum; harm += 2) {               					
+			
+			fftlen = subharminfs[stage][harm-1].kern[0].fftlen;
+			
+			if( size_d_data < fftlen )
+			{
+				size_d_data = fftlen;
+			}
+			
+		}	
+	}
+
+}
+	
+	//alloc memory for device data
+	 checkCudaErrors(cudaMalloc((void **)&d_data, sizeof(fcomplex) * size_d_data));
+	
+	return d_data ;	
+}                            
+
+//----------Prepare d_result on GPU---------------------
+fcomplex * prep_result_on_gpu(subharminfo **subharminfs, int numharmstages)
+//cudaMalloc d_result for the whole search process
+{
+
+	int harm, harmtosum, stage;		
+	int numkern, fftlen;
+	int size_d_result;
+	
+	fcomplex *d_result;	
+	size_d_result = 0;
+
+	numkern = subharminfs[0][0].numkern;
+	fftlen = subharminfs[0][0].kern[0].fftlen;
+
+	size_d_result = numkern * fftlen;
+
+if (numharmstages > 1) {
+
+	for(stage=1; stage<numharmstages; stage++){	
+		harmtosum = 1 << stage;		
+		for (harm = 1; harm < harmtosum; harm += 2) {               		
+						
+			numkern = subharminfs[stage][harm-1].numkern;
+			fftlen = subharminfs[stage][harm-1].kern[0].fftlen;
+								
+			if(size_d_result < numkern * fftlen )
+			{
+				size_d_result = numkern * fftlen;
+			}			
+		}	
+	}	
+}
+	//Alloc mem for result on GPU  
+  checkCudaErrors(cudaMalloc((void **)&d_result, size_d_result * sizeof(fcomplex)));  
+	
+	//also prepare its 2D texture memory
+	channelDesc = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
+	tex_d_result_2D.addressMode[0] = cudaAddressModeClamp;
+  tex_d_result_2D.addressMode[1] = cudaAddressModeClamp;
+  //texRef2D.filterMode = cudaFilterModePoint;
+  tex_d_result_2D.filterMode = cudaFilterModePoint;
+  tex_d_result_2D.normalized = false;
+	
+	return d_result ;
+	
+}                            
+
+//----------Copy kernek array to GPU-------------------------------
+fcomplex * cp_kernel_array_to_gpu(subharminfo **subharminfs, int numharmstages, int **offset_array)
+//cp kernel arrays contained in **subharminfs to GPU memory *kernel_array_on_gpu
+//and store the offset within *kernel_array_on_gpu for every array, offsets stored in **offset_array
+//subharminfs: input
+//numharmstages: input
+//offset_array: output
+//kernel_array_on_gpu: output
+//
+{
+
+	int harm, harmtosum, stage;		
+	int ii, jj;
+	int kernel_total_size =0 ;
+	int numkern, fftlen;
+	unsigned int offset_base = 0, offset_tmp;
+	fcomplex *kernel_vect_host;
+	fcomplex *kernel_vect_on_gpu;
+	
+	//printf("cp_kernel_array_to_gpu: Check CUDA devices\n");
+	
+	numkern = subharminfs[0][0].numkern;
+	fftlen = subharminfs[0][0].kern[0].fftlen;
+	offset_array[0][0] = kernel_total_size ; 
+	kernel_total_size = kernel_total_size + numkern * fftlen;
+	
+	for(stage=1; stage<numharmstages; stage++){	
+		harmtosum = 1 << stage;		
+		for (harm = 1; harm < harmtosum; harm += 2) {               					
+			numkern = subharminfs[stage][harm-1].numkern;
+			fftlen = subharminfs[stage][harm-1].kern[0].fftlen;
+			offset_array[stage][harm-1] = kernel_total_size ; 
+			kernel_total_size = kernel_total_size + numkern * fftlen;
+		}	
+	}	
+	//printf("    ----Total size of kernel arrays is : %d\n", kernel_total_size);
+	checkCudaErrors(cudaMalloc((void **)&kernel_vect_on_gpu, sizeof(fcomplex) * kernel_total_size));       
+	
+	kernel_vect_host = (fcomplex *)malloc( sizeof(fcomplex) * kernel_total_size);
+	//take kernel_vect_host as buffer
+	 //stage : 0
+	 numkern = subharminfs[0][0].numkern;
+   fftlen = subharminfs[0][0].kern[0].fftlen;
+   offset_base = offset_array[0][0];
+   for(ii=0; ii<numkern; ii++)
+   	{   		
+   		for(jj=0; jj<fftlen; jj++){   		
+   			offset_tmp = offset_base + fftlen*ii + jj;
+   			kernel_vect_host[offset_tmp].r = subharminfs[0][0].kern[ii].data[jj].r;
+   			kernel_vect_host[offset_tmp].i = subharminfs[0][0].kern[ii].data[jj].i;
+   		}
+   	}
+
+   //other stages
+if (numharmstages > 1) {   
+   for(stage=1; stage<numharmstages; stage++){	
+			harmtosum = 1 << stage;		
+			for(harm = 1; harm < harmtosum; harm += 2) {         
+
+				offset_base = offset_array[stage][harm-1];
+
+				fftlen = subharminfs[stage][harm-1].kern[0].fftlen;
+				numkern = subharminfs[stage][harm-1].numkern;
+
+				for(ii=0; ii<numkern; ii++){
+					for(jj=0; jj<fftlen; jj++){
+						offset_tmp = offset_base + fftlen*ii + jj ;
+						kernel_vect_host[offset_tmp].r = subharminfs[stage][harm-1].kern[ii].data[jj].r;
+		   			kernel_vect_host[offset_tmp].i = subharminfs[stage][harm-1].kern[ii].data[jj].i;
+						}
+					}
+					
+				}
+			}
+}   
+   checkCudaErrors(cudaMemcpy( kernel_vect_on_gpu, kernel_vect_host, sizeof(fcomplex) * kernel_total_size, cudaMemcpyHostToDevice) );
+
+   free(kernel_vect_host);
+   
+   return kernel_vect_on_gpu;
+	
+}
+
+//----------------------initialize cuFFT plans ----------------------
+void init_cuFFT_plans(subharminfo **subharminfs, int numharmstages)
+{
+	int harm, harmtosum, stage;		
+	int numkern, fftlen;
+	
+	printf("\ninit_cuFFT_plans\n");
+	
+	numkern = subharminfs[0][0].numkern;
+	fftlen = subharminfs[0][0].kern[0].fftlen;
+
+	checkCudaErrors(cufftPlan1d(&plan_data_array[0][0], fftlen, CUFFT_C2C, 1));		
+	checkCudaErrors(cufftPlan1d(&plan_result_array[0][0], fftlen, CUFFT_C2C, numkern));		
+
+	printf("stage: 0, fftlen : %d, numkern: %d\n", fftlen, numkern);
+
+if (numharmstages > 1) {
+
+	for(stage=1; stage<numharmstages; stage++){	
+		harmtosum = 1 << stage;		
+		for (harm = 1; harm < harmtosum; harm += 2) {               					
+			numkern = subharminfs[stage][harm-1].numkern;
+			fftlen = subharminfs[stage][harm-1].kern[0].fftlen;
+			
+			printf("stage: %d, fftlen : %d, numkern: %d\n", stage, fftlen, numkern);
+				
+			checkCudaErrors(cufftPlan1d(&plan_data_array[stage][harm-1], fftlen, CUFFT_C2C, 1));		
+			checkCudaErrors(cufftPlan1d(&plan_result_array[stage][harm-1], fftlen, CUFFT_C2C, numkern));		
+						
+		}	
+	}	
+
+}
+	
+}
+
+//----------------------destroy cuFFT plans ----------------------
+void destroy_cuFFT_plans(subharminfo **subharminfs, int numharmstages)
+{
+	int harm, harmtosum, stage;		
+	int numkern, fftlen;
+	
+	printf("\ndestroy_cuFFT_plans\n");
+	
+	numkern = subharminfs[0][0].numkern;
+	fftlen = subharminfs[0][0].kern[0].fftlen;
+
+	checkCudaErrors(cufftDestroy(plan_data_array[0][0]));
+	checkCudaErrors(cufftDestroy(plan_result_array[0][0]));
+
+	printf("stage: 0, fftlen : %d, numkern: %d\n", fftlen, numkern);
+
+if (numharmstages > 1) {
+
+	for(stage=1; stage<numharmstages; stage++){	
+		harmtosum = 1 << stage;		
+		for (harm = 1; harm < harmtosum; harm += 2) {               					
+			numkern = subharminfs[stage][harm-1].numkern;
+			fftlen = subharminfs[stage][harm-1].kern[0].fftlen;
+			
+			printf("stage: %d, fftlen : %d, numkern: %d\n", stage, fftlen, numkern);
+	
+			checkCudaErrors(cufftDestroy(plan_data_array[stage][harm-1]));
+			checkCudaErrors(cufftDestroy(plan_result_array[stage][harm-1]));
+					
+		}	
+	}	
+}
+	
+}
+
+
+//----------------------select a cpu to play with --------------------
+void select_cuda_dev(int cuda_inds)
+{
+	
+	cudaSetDevice(cuda_inds);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, cuda_inds);
+
+  printf("\nGPU Device %d: \"%s\" with Capability: %d.%d\n", cuda_inds, deviceProp.name, deviceProp.major, deviceProp.minor);
+	
+	cudaDeviceReset();
+}
+
+//-------------------------clean up on GPU ----------------------------
+void cudaFree_kernel_vect(fcomplex *in)
+{
+	cudaFree( in );
+}
diff --git a/src/accelsearch.c b/src/accelsearch.c
index b195fbb9c..4693bccfb 100644
--- a/src/accelsearch.c
+++ b/src/accelsearch.c
@@ -14,9 +14,60 @@
 #include "dmalloc.h"
 #endif
 
+typedef struct accel_cand_gpu{
+	float pow;					/*pow of selected candidate*/
+	int		nof_cand;			/*number of candidates in sub_array/plane */
+	int		z_ind;				/*z_index of the selected candidate*/
+	int		r_ind;				/*r_index of the selected candidate*/
+}	accel_cand_gpu;
+
 extern float calc_median_powers(fcomplex * amplitudes, int numamps);
 extern void zapbirds(double lobin, double hibin, FILE * fftfile, fcomplex * fft);
 
+/**************************** Functions on GPU ****************************/
+#ifdef USECUDA
+extern fcomplex* prep_data_on_gpu(subharminfo **subharminfs, int numharmstages);
+extern fcomplex* prep_result_on_gpu(subharminfo **subharminfs, int numharmstages);
+
+extern fcomplex* cp_kernel_array_to_gpu(subharminfo **subharminfs, int numharmstages, int **offset_array);
+extern fcomplex* cp_input_to_gpu(fcomplex * input_vect_on_cpu, long long numbins, long long N);
+
+extern unsigned short* prep_rz_inds_on_gpu(int size_inds);
+
+extern float* prep_float_vect_on_gpu(int size);
+
+extern accel_cand_gpu* prep_cand_array(int size);
+
+extern void cudaFree_kernel_vect(fcomplex *in);
+
+extern void select_cuda_dev(int cuda_inds);
+
+extern int  search_ffdotpows_gpu(float powcut, float *d_fundamental, accel_cand_gpu * cand_array_search_gpu, accel_cand_gpu * cand_array_sort_gpu, int numzs, int numrs, accel_cand_gpu *cand_gpu_cpu);
+
+extern GSList* search_ffdotpows_sort_gpu_result(ffdotpows * ffdot, int numharm,
+                         accelobs * obs, GSList * cands, accel_cand_gpu *cand_gpu_cpu, int nof_cand, int numzs, int numrs);
+
+extern void subharm_ffdot_plane_gpu(int numharm, int harmnum,
+                               double fullrlo, double fullrhi,
+                               subharminfo * shi, accelobs * obs,
+                               int stage, int **offset_array,
+                               fcomplex *kernel_vect_on_gpu, fcomplex *d_data, fcomplex *d_result,
+                               ffdotpows *fundamental,
+                               float *d_fundamental,
+                               unsigned short *d_zinds, unsigned short *d_rinds
+                               );
+
+extern ffdotpows *init_fundamental_ffdot(int numharm, int harmnum,
+                               double fullrlo, double fullrhi,
+                               subharminfo * shi, accelobs * obs,
+                               int stage, int **offset_array,
+                               fcomplex *kernel_vect_on_gpu, fcomplex *d_data, fcomplex *d_result
+                               );
+                               
+extern void init_cuFFT_plans(subharminfo **subharminfs, int numharmstages);                               
+#endif
+/**************************** Functions on GPU, End****************************/
+
 static void print_percent_complete(int current, int number, char *what, int reset)
 {
    static int newper = 0, oldper = -1;
@@ -38,9 +89,21 @@ static void print_percent_complete(int current, int number, char *what, int rese
    }
 }
 
+/* Return x such that 2**x = n */
+static inline int twon_to_index(int n)
+{
+   int x = 0;
+
+   while (n > 1) {
+      n >>= 1;
+      x++;
+   }
+   return x;
+}
+
 int main(int argc, char *argv[])
 {
-   int ii;
+   int ii, jj;
    double ttim, utim, stim, tott;
    struct tms runtimes;
    subharminfo **subharminfs;
@@ -73,6 +136,7 @@ int main(int argc, char *argv[])
    printf("\n\n");
    printf("    Fourier-Domain Acceleration Search Routine\n");
    printf("               by Scott M. Ransom\n\n");
+   printf("            GPU version by Jintao Luo\n\n");
 
    /* Create the accelobs structure */
    create_accelobs(&obs, &idata, cmd, 1);
@@ -119,8 +183,249 @@ int main(int argc, char *argv[])
              obs.workfilenm);
    }
 
+	int use_cuda_flag = 0;
+	#ifdef USECUDA
+	if(cmd->cudaP == 1)	
+	{
+		use_cuda_flag = 1 ;
+	}
+	#endif
+
+#ifdef USECUDA
+if(cmd->cudaP == 1){/****************************************call GPU to run the search***************************************************/
+//if( use_cuda_flag == 1 ){
+	
+	printf("\nTry to use a GPU to run the search\n");
+	
+	#ifdef USECUDA
+	printf( "\nUSECUDA\n" );
+	#endif
+	
+	select_cuda_dev(cmd->cuda);
+	
+	//prepare d_data and d_result on GPU	
+	fcomplex *d_data;
+	fcomplex *d_result;	
+	d_data = prep_data_on_gpu(subharminfs, obs.numharmstages);
+	d_result = prep_result_on_gpu(subharminfs, obs.numharmstages);
+	
+	//prepare the fundamental on GPU
+	float *d_fundamental;	
+	d_fundamental = prep_float_vect_on_gpu(subharminfs[0][0].numkern * subharminfs[0][0].kern[0].fftlen );
+	
+	//inds lookup tables for add_ffdot_pow
+	unsigned short *d_zinds, *d_rinds;
+	d_zinds = prep_rz_inds_on_gpu(subharminfs[0][0].numkern);
+	d_rinds = prep_rz_inds_on_gpu(subharminfs[0][0].kern[0].fftlen);//In fact fftlen >= size_d_rinds, but that's ok to have a bit waste
+	
+	//Generate the kernel offset_array	for stages
+	int **offset_array;
+	fcomplex *kernel_vect_on_gpu;
+	offset_array = (int **)malloc(obs.numharmstages * sizeof(int *));
+	offset_array[0] = (int *)malloc( 1 * sizeof(int) );	 
+	for(ii=1; ii<obs.numharmstages; ii++){	
+			jj = 1 << ii;		
+			offset_array[ii] = (int *)malloc( jj * sizeof(int) );		
+	}	
+
+	//copy all kernels into the GPU memory
+	kernel_vect_on_gpu = cp_kernel_array_to_gpu(subharminfs,                 obs.numharmstages, offset_array);
+
+	//prepare arrays for ffdotpow_search on GPU
+	accel_cand_gpu * cand_array_search_gpu;
+	accel_cand_gpu * cand_array_sort_gpu;	
+	cand_array_search_gpu = prep_cand_array(subharminfs[0][0].numkern * subharminfs[0][0].kern[0].fftlen); 
+	cand_array_sort_gpu		= prep_cand_array(subharminfs[0][0].numkern * subharminfs[0][0].kern[0].fftlen); 
+	
+	//vector on CPU to store the GPU ffdot_search result
+	accel_cand_gpu *cand_gpu_cpu ;	
+	cand_gpu_cpu = (accel_cand_gpu *) malloc(sizeof(accel_cand_gpu) * subharminfs[0][0].numkern * subharminfs[0][0].kern[0].fftlen );
+	
+	//initialize cuFFT plans on GPU
+	init_cuFFT_plans( subharminfs, obs.numharmstages );
+
    /* Start the main search loop */
+   printf("\n\n/* Start the main search loop */\n\n");
+
+   {
+      double startr = obs.rlo, lastr = 0, nextr = 0;	
+			ffdotpows *my_fundamental;			
+			int nof_cand = 0 ;
+			
+			my_fundamental= init_fundamental_ffdot(1, 1, startr, lastr,
+                               &subharminfs[0][0], &obs,
+                               0, offset_array,
+                               kernel_vect_on_gpu, d_data, d_result
+                               );
+			
+      while (startr + ACCEL_USELEN * ACCEL_DR < obs.highestbin) {
+         /* Search the fundamental */
+         print_percent_complete(startr - obs.rlo,
+                                obs.highestbin - obs.rlo, "search", 0);
+         nextr = startr + ACCEL_USELEN * ACCEL_DR;
+         lastr = nextr - ACCEL_DR;
+
+						subharm_ffdot_plane_gpu(1, 1, startr, lastr,
+                               &subharminfs[0][0], &obs,
+                               0, offset_array,
+                               kernel_vect_on_gpu, d_data, d_result, 
+                               my_fundamental,
+                               d_fundamental,
+                               d_zinds, d_rinds);                                               
+         nof_cand = search_ffdotpows_gpu(obs.powcut[twon_to_index(1)], d_fundamental, cand_array_search_gpu, cand_array_sort_gpu, my_fundamental->numzs, my_fundamental->numrs, cand_gpu_cpu);
+         cands = search_ffdotpows_sort_gpu_result(my_fundamental, 1, &obs, cands, cand_gpu_cpu, nof_cand, my_fundamental->numzs, my_fundamental->numrs);
+
+
+         if (obs.numharmstages > 1) {   /* Search the subharmonics */
+            int stage, harmtosum, harm;
+
+            for (stage = 1; stage < obs.numharmstages; stage++) { 
+               harmtosum = 1 << stage;
+               for (harm = 1; harm < harmtosum; harm += 2) {                           
+                  subharm_ffdot_plane_gpu(harmtosum, harm, startr, lastr,
+                               &subharminfs[stage][harm - 1], &obs,
+                               stage, offset_array,
+                               kernel_vect_on_gpu, d_data, d_result,
+                               my_fundamental,
+                               d_fundamental,
+                               d_zinds, d_rinds);                               
+               }
+               nof_cand = search_ffdotpows_gpu(obs.powcut[twon_to_index(harmtosum)], d_fundamental, cand_array_search_gpu, cand_array_sort_gpu, my_fundamental->numzs, my_fundamental->numrs, cand_gpu_cpu);
+               cands = search_ffdotpows_sort_gpu_result(my_fundamental, harmtosum, &obs, cands, cand_gpu_cpu, nof_cand, my_fundamental->numzs, my_fundamental->numrs);
+            }
+         }
+         startr = nextr;
+      }
+      free_ffdotpows(my_fundamental);            
+      print_percent_complete(obs.highestbin - obs.rlo,
+                             obs.highestbin - obs.rlo, "search", 0);
+   }
+
+   printf("\n\nDone searching.  Now optimizing each candidate.\n\n");
+
+	 destroy_cuFFT_plans( subharminfs, obs.numharmstages ); //destroy cuFFT plans
+	 
+	 printf("\ndebug:.......................\n");
+	 
+   //free_subharminfos(obs.numharmstages, subharminfs); //from presto1
+   free_subharminfos(&obs, subharminfs); //from presto2
+
+   {                            /* Candidate list trimming and optimization */
+      int numcands;
+      GSList *listptr;
+      accelcand *cand;
+      fourierprops *props;
+
+
+      numcands = g_slist_length(cands);
+
+      if (numcands) {
+
+         /* Sort the candidates according to the optimized sigmas */
+
+         cands = sort_accelcands(cands);
+
+         /* Eliminate (most of) the harmonically related candidates */
+         if ((cmd->numharm > 1) && !(cmd->noharmremoveP))
+             eliminate_harmonics(cands, &numcands);
+
+         /* Now optimize each candidate and its harmonics */
+
+         print_percent_complete(0, 0, NULL, 1);
+         listptr = cands;
+         for (ii = 0; ii < numcands; ii++) {
+            print_percent_complete(ii, numcands, "optimization", 0);
+            cand = (accelcand *) (listptr->data);
+            optimize_accelcand(cand, &obs);
+            listptr = listptr->next;
+         }
+         print_percent_complete(ii, numcands, "optimization", 0);
+
+         /* Calculate the properties of the fundamentals */
+
+         props = (fourierprops *) malloc(sizeof(fourierprops) * numcands);
+         listptr = cands;
+         for (ii = 0; ii < numcands; ii++) {
+            cand = (accelcand *) (listptr->data);
+            /* In case the fundamental harmonic is not significant,  */
+            /* send the originally determined r and z from the       */
+            /* harmonic sum in the search.  Note that the derivs are */
+            /* not used for the computations with the fundamental.   */
+            calc_props(cand->derivs[0], cand->r, cand->z, 0.0, props + ii);
+            /* Override the error estimates based on power */
+            props[ii].rerr = (float) (ACCEL_DR) / cand->numharm;
+            props[ii].zerr = (float) (ACCEL_DZ) / cand->numharm;
+            listptr = listptr->next;
+         }
+
+         /* Write the fundamentals to the output text file */
+
+         output_fundamentals(props, cands, &obs, &idata);
+
+         /* Write the harmonics to the output text file */
+
+         output_harmonics(cands, &obs, &idata);
+
+         /* Write the fundamental fourierprops to the cand file */
 
+         obs.workfile = chkfopen(obs.candnm, "wb");
+         chkfwrite(props, sizeof(fourierprops), numcands, obs.workfile);
+         fclose(obs.workfile);
+         free(props);
+         printf("\n\n");
+      } else {
+         printf("No candidates above sigma = %.2f were found.\n\n", obs.sigma);
+      }
+   }
+
+   /* Finish up */
+
+   printf("Searched the following approx numbers of independent points:\n");
+   printf("  %d harmonic:   %9lld\n", 1, obs.numindep[0]);
+   for (ii = 1; ii < obs.numharmstages; ii++)
+      printf("  %d harmonics:  %9lld\n", 1 << ii, obs.numindep[ii]);
+
+   printf("\nTiming summary:\n");
+   tott = times(&runtimes) / (double) CLK_TCK - tott;
+   utim = runtimes.tms_utime / (double) CLK_TCK;
+   stim = runtimes.tms_stime / (double) CLK_TCK;
+   ttim = utim + stim;
+   printf("    CPU time: %.3f sec (User: %.3f sec, System: %.3f sec)\n",
+          ttim, utim, stim);
+   printf("  Total time: %.3f sec\n\n", tott);
+
+   printf("Final candidates in binary format are in '%s'.\n", obs.candnm);
+   printf("Final Candidates in a text format are in '%s'.\n\n", obs.accelnm);
+
+   free_accelobs(&obs);
+   g_slist_foreach(cands, free_accelcand, NULL);
+   g_slist_free(cands);
+   
+  free(offset_array);    
+  cudaFree_kernel_vect(d_data);
+  cudaFree_kernel_vect(d_result);
+  cudaFree_kernel_vect(d_fundamental);
+	cudaFree_kernel_vect(kernel_vect_on_gpu);	
+	cudaFree_kernel_vect(d_zinds);
+	cudaFree_kernel_vect(d_rinds);	
+	cudaFree_kernel_vect(cand_array_search_gpu);
+	cudaFree_kernel_vect(cand_array_sort_gpu);
+	//cudaFree_kernel_vect(input_vect_on_gpu);
+  
+  
+  free(cand_gpu_cpu);
+   
+   return (0);
+   
+	
+}
+#endif
+
+/****************************************call CPU to run the search***************************************************/
+//else{
+if( use_cuda_flag == 0 ){
+	printf("\nUse CPU to run the search\n");
+   /* Start the main search loop */
    {
       double startr = obs.rlo, lastr = 0, nextr = 0;
       ffdotpows *fundamental;
@@ -270,3 +575,5 @@ int main(int argc, char *argv[])
    g_slist_free(cands);
    return (0);
 }
+
+}
diff --git a/src/accelsearch_cmd.c b/src/accelsearch_cmd.c
index 1aa0ac91e..e222bb149 100644
--- a/src/accelsearch_cmd.c
+++ b/src/accelsearch_cmd.c
@@ -1019,6 +1019,9 @@ usage(void)
   fprintf(stderr,"%s","    -noharmremove: Do not remove harmonically related candidates (never removed for numharm = 1)\n");
   fprintf(stderr,"%s","           infile: Input file name of the floating point .fft or .[s]dat file.  A '.inf' file of the same name must also exist\n");
   fprintf(stderr,"%s","                   1 value\n");
+	#ifdef USECUDA
+	fprintf(stderr,"%s","            -cuda: to run accel_search on GPU, indicate the index of cuda device to be used, 0 means the 1st cuda device \n");
+	#endif
   fprintf(stderr,"%s","  version: 10Nov13\n");
   fprintf(stderr,"%s","  ");
   exit(EXIT_FAILURE);
@@ -1174,6 +1177,19 @@ parseCmdline(int argc, char **argv)
       continue;
     }
 
+		#ifdef USECUDA
+    if( 0==strcmp("-cuda", argv[i]) ) {
+      int keep = i;
+      //cmd.numharmP = 1;
+      cmd.cudaP = 1;
+      i = getIntOpt(argc, argv, i, &cmd.cuda, 1);
+      //cmd.numharmC = i-keep;
+      //checkIntLower("-numharm", &cmd.numharm, cmd.numharmC, 16);
+      //checkIntHigher("-numharm", &cmd.numharm, cmd.numharmC, 1);
+      continue;
+    }/* -cuda  */
+    #endif
+
     if( argv[i][0]=='-' ) {
       fprintf(stderr, "\n%s: unknown option `%s'\n\n",
               Program, argv[i]);
diff --git a/src/corr_routines.c b/src/corr_routines.c
index ff17bcd74..5d5aa5bf0 100644
--- a/src/corr_routines.c
+++ b/src/corr_routines.c
@@ -1,7 +1,308 @@
 #include "presto.h"
 
+#ifdef USECUDA
+extern void * complex_corr_conv_gpu(fcomplex * data, fcomplex * kernel_vect_on_gpu,
+                       int numdata, 
+                       int numkern_in_array, 
+                       int stage, int harmtosum, int harmnum, 
+                       int ** offset_array, fcomplex *d_data, fcomplex *d_result,
+                       int chopbins, int numtocopy,
+                       unsigned short *zinds, unsigned short *rinds,
+                       int numzs_full, int numrs_full,
+                       fcomplex *d_fundamental,
+                       unsigned short *d_zinds, unsigned short *d_rinds,
+                       int datainf_flag,
+                       presto_ffts ffts, presto_optype type,
+                       double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo);
+#endif
+
 /* define DEBUGPRINT */
 
+
+//--------------------------------------------------------------------------
+#ifdef USECUDA
+int corr_complex_gpu(fcomplex * data, int numdata, presto_datainf datainf,
+                 fcomplex * kernel_vect_on_gpu, int numkern, presto_datainf kerninf,
+                 fcomplex ** result, int numresult, int lobin,
+                 int numbetween, int kern_half_width, presto_optype optype,
+                 int numkern_in_array, int stage, int harmtosum, int harmnum,
+                 int ** offset_array, fcomplex *d_data, fcomplex *d_result, 
+                 unsigned short *zinds, unsigned short *rinds,
+                 int numzs_full, int numrs_full,
+                 //fcomplex *d_fundamental,
+                 float *d_fundamental,
+                 unsigned short *d_zinds, unsigned short *d_rinds,
+                 double obs_zlo, double fullrlo, double harm_fract, int zlo, int rlo)
+  /* This routine is a general correlation or convolution routine    */
+  /* for complex data.  It can perform convolutions or correlations  */
+  /* on raw complex data, data that is prepared for a convolution/   */
+  /* correlation but not FFTd, or already FFTd data.  The kernel     */
+  /* that it uses can also be raw, prepped, or FFTd.  If you call    */
+  /* the routine multiple times with either the same kernel or data  */
+  /* array, it uses a saved version of the array from the previous   */
+  /* call to cut down on many processing steps. The return value     */
+  /* tells how many usable (i.e.  non-contaminated) points were      */
+  /* returned in the result array (the first value will be that of   */
+  /* 'lobin').  This routine will _not_ perform in-place             */
+  /* correlations or convolutions (i.e. it ignores those choices     */
+  /* for 'optype').                                                  */
+  /* Arguments:                                                      */
+  /*   'data' is a complex array of the data to be interpolated.     */
+  /*   'numdata' is the number of complex points in 'data'.          */
+  /*   'datainf' is one of the following that describes the data:    */
+  /*              RAW = Normal un-altered complex data.              */
+  /*              PREPPED = Data has been padded and spread based    */
+  /*                        on 'kern_half_width' and 'numbetween'    */
+  /*                        and is ready to be FFTd.                 */
+  /*              FFT = Data has already been prepared and FFTd.     */
+  /*              SAME = Data is the same as the previous call.      */
+  /*                        The routine uses its saved data.         */
+  /*   'kern' is the correlation kernel.                             */
+  /*   'numkern' is the number of complex points in 'kern'.          */
+  /*   'kerninf' is one of the same choices as 'datainf' above.      */
+  /*   'result' is the resulting complex array (must already exist). */
+  /*   'numresult' is the number of complex points in 'result'.      */
+  /*   'lobin' is the lowest fourier bin to convolve/correlate.      */
+  /*   'numbetween' is the number of bins to spread the data points. */
+  /*   'kern_half_width' is half the width (bins) of the raw kernel. */
+  /*   'optype' is either CORR or CONV (correlation or convolution). */
+  /* Notes:                                                          */
+  /*   If either 'datainf' or 'kerninf' are of type PREPPED or FFT,  */
+  /*   then the length of the FFTs used in the correlation/          */
+  /*   convolution calculations will be of length 'numdata' or       */
+  /*   'numkern'.  If both 'datainf' and 'kerninf' are of type       */
+  /*   PREPPED or FFT then 'numdata' and 'numkern' must have the     */
+  /*   same value.  In order for SAME values of 'datainf' and        */
+  /*   'kerninf' to help out, the routine must be called with the    */
+  /*   same values for 'kern_half_width' and 'numbetween' as well.   */
+{
+   static fcomplex *kernarray, *dataarray;
+   static int firsttime = 1, oldnumbetween, oldkern_half_width;
+   static int oldfftlen, oldnumdata, oldlobin;
+   fcomplex *tmpdata = NULL, zeros = { 0.0, 0.0 };
+   int ii, jj, fftlen = 1, numbins, beginbin, endbin, padlo, padhi;
+   fcomplex *h_result;
+
+   /* Check entry arguments for validity */
+
+   if (numdata <= 0) {
+      printf("\n  'numdata' = %d (out of bounds) in corr_complex().\n\n", numdata);
+      exit(1);
+   }
+   if (numkern <= 0 || numkern > MAXREALFFT / 4) {
+      printf("\n  'numkern' = %d (out of bounds) in corr_complex().\n\n", numkern);
+      exit(1);
+   }
+   if (numresult <= 0 || numresult > MAXREALFFT / 4) {
+      printf("\n  'numresult' = %d (out of bounds) in corr_complex().\n\n",
+             numresult);
+      exit(1);
+   }
+   if (numbetween < 0 || numbetween > 20000) {
+      printf("\n  'numbetween' = %d (out of bounds) in corr_complex().\n\n",
+             numbetween);
+      exit(1);
+   }
+   if (numresult % numbetween != 0) {
+      printf("\n  'numresult' = %d must be a multiple of ", numresult);
+      printf("'numbetween' in corr_complex().\n\n");
+      exit(1);
+   }
+   if (lobin < 0 || lobin >= numdata) {
+      printf("\n  'lobin' = %d (out of bounds) in corr_complex().\n\n", lobin);
+      exit(1);
+   }
+   if ((datainf == SAME || kerninf == SAME) && firsttime) {
+      printf("\n  Can't call corr_complex() with 'datainf' or 'kerninf'\n");
+      printf("  being SAME if this is the 1st time calling the routine.\n\n");
+   }
+   if (datainf == SAME && kerninf == SAME && lobin == oldlobin) {
+      printf("\n  Doesn't make sense to call corr_complex() with SAME for\n");
+      printf("  both 'datainf' and 'kerninf' if 'lobin' hasn't changed.\n\n");
+      exit(1);
+   }
+   if (datainf == PREPPED || datainf == FFT || kerninf == PREPPED || kerninf == FFT) {
+      if (datainf == PREPPED || datainf == FFT)
+         fftlen = numdata;
+      if (kerninf == PREPPED || kerninf == FFT)
+         fftlen = numkern;
+      if ((datainf == PREPPED || datainf == FFT) &&
+          (kerninf == PREPPED || kerninf == FFT)) {
+         if (numdata != numkern) {
+            printf("\n  'numdata' must equal 'numkern' if the data array and\n");
+            printf("  the kernel are either PREPPED or FFT in corr_complex().\n\n");
+            exit(1);
+         }
+      }
+   } else if (datainf == SAME || kerninf == SAME) {
+      fftlen = oldfftlen;
+   } else {
+#ifdef DEBUGPRINT
+      printf("Yikes!!\n");
+#endif
+      fftlen = next2_to_n(numresult + 2 * kern_half_width * numbetween);
+   }
+   if (fftlen > MAXREALFFT / 4) {
+      printf("\n  'fftlen' = %d (out of bounds) in corr_complex().\n\n", fftlen);
+      exit(1);
+   }
+   if (kerninf == RAW || kerninf == PREPPED) {
+      if (2 * kern_half_width * numbetween > numkern) {
+         printf("\n  'numkern = %d (out of bounds in corr_complex().\n", numkern);
+         printf("  If 'kerninf' == RAW or PREPPED, 'numkern' must be >=\n");
+         printf("  to 2 * 'kern_half_width' * 'numbetween'.\n\n");
+         exit(1);
+      }
+   } else if (kerninf == SAME) {
+      if (lobin != oldlobin || kern_half_width != oldkern_half_width) {
+         printf("\n  When 'kerninf' = SAME, 'lobin' and 'kern_half_width'\n");
+         printf("  must match their previous values in corr_complex().\n\n");
+         exit(1);
+      }
+   }
+#ifdef DEBUGPRINT
+   printf("numdata = %d  fftlen = %d\n", numdata, fftlen);
+#endif
+
+   /* Prep the data array */
+
+   if (firsttime || !(datainf == SAME &&
+                      lobin == oldlobin &&
+                      fftlen == oldfftlen &&
+                      numdata == oldnumdata && numbetween == oldnumbetween)) {
+
+      numbins = fftlen / numbetween;
+      beginbin = lobin - kern_half_width;
+      endbin = beginbin + numbins;
+      tmpdata = data + beginbin;
+
+      if (datainf == RAW) {
+
+#ifdef DEBUGPRINT
+         printf("Prepping, spreading, and FFTing the data...\n");
+#endif
+         /* Zero-pad if necessary (when looking at the beginning or */
+         /* the end of the data array)                              */
+
+         if ((beginbin < 0) || (endbin > numdata)) {
+            tmpdata = gen_cvect(numbins);
+            for (ii = 0; ii < numbins; ii++)
+               tmpdata[ii] = zeros;
+            padlo = (beginbin < 0) ? abs(beginbin) : 0;
+            padhi = ((endbin - numdata) > 0) ? endbin - numdata : 0;
+            memcpy(tmpdata + padlo, data + beginbin + padlo,
+                   sizeof(fcomplex) * (numbins - padhi - padlo));
+         }
+
+         /* Spread the data */
+
+         if (!firsttime)
+            vect_free(dataarray);
+         dataarray = gen_cvect(fftlen);
+         spread_no_pad(tmpdata, numbins, dataarray, fftlen, numbetween);
+
+         if ((beginbin < 0) || (endbin > numdata))
+            vect_free(tmpdata);
+
+         /* FFT the Data */
+
+         //COMPLEXFFT(dataarray, fftlen, -1);//debug, default it is comment out
+
+      } else if (datainf == PREPPED) {
+
+#ifdef DEBUGPRINT
+         printf("FFTing and copying the data...\n");
+#endif
+
+         if (!firsttime)
+            vect_free(dataarray);
+         dataarray = gen_cvect(fftlen);
+         memcpy(dataarray, data, sizeof(fcomplex) * fftlen);
+
+         /* FFT the Data */
+
+         //COMPLEXFFT(dataarray, fftlen, -1);//debug, default it is comment out
+
+      } else if (datainf == FFT) {
+
+#ifdef DEBUGPRINT
+         printf("Just copying the data...\n");
+#endif
+
+         if (!firsttime)
+            vect_free(dataarray);
+         dataarray = gen_cvect(fftlen);
+         memcpy(dataarray, data, sizeof(fcomplex) * fftlen);
+
+      }
+   }
+
+   /* Prep the kernel array */
+
+	/*calculate chop parameters*/
+   int chopbins = kern_half_width * numbetween ;   
+   if (fftlen < 2 * chopbins) {
+      printf("\n  'chopbins' is too large in chop_complex_ends()\n\n");
+      exit(1);
+   }
+      
+   int numtocopy = fftlen - 2 * chopbins;   
+   if (numresult < numtocopy)
+      numtocopy = numresult;// use chopbins and numtocopy to do chop on GPU
+
+	if( datainf == FFT )//No fft in the conv
+	{	
+		complex_corr_conv_gpu(dataarray, kernel_vect_on_gpu,
+                       fftlen, 
+                       numkern_in_array, 
+                       stage, harmtosum, harmnum, 
+                       offset_array, d_data, d_result,
+                       chopbins, numtocopy,
+                       zinds, rinds,
+                       numzs_full, numrs_full,
+                       d_fundamental,
+                       d_zinds, d_rinds,
+                       0,
+                       NOFFTS, optype,
+                       obs_zlo, fullrlo, harm_fract, zlo, rlo);	
+	}      
+	
+	if( datainf == RAW )//implement fft in the conv
+	{	
+		complex_corr_conv_gpu(dataarray, kernel_vect_on_gpu,
+                       fftlen, 
+                       numkern_in_array, 
+                       stage, harmtosum, harmnum, 
+                       offset_array, d_data, d_result,
+                       chopbins, numtocopy,
+                       zinds, rinds,
+                       numzs_full, numrs_full,
+                       d_fundamental,
+                       d_zinds, d_rinds,
+                       1,
+                       NOFFTS, optype,
+                       obs_zlo, fullrlo, harm_fract, zlo, rlo);	
+	}      	
+
+
+   /* Set variables for next time... */
+
+   if (firsttime)
+      firsttime = 0;
+   oldlobin = lobin;
+   oldfftlen = fftlen;
+   oldnumdata = numdata;
+   oldnumbetween = numbetween;
+   oldkern_half_width = kern_half_width;
+   if (numresult < fftlen - 2 * numbetween * kern_half_width)
+      return numresult;
+   else
+      return fftlen - 2 * numbetween * kern_half_width;
+}
+#endif
+//--------------------------------------------------------------------------
+
+
 int corr_complex(fcomplex * data, int numdata, presto_datainf datainf,
                  fcomplex * kern, int numkern, presto_datainf kerninf,
                  fcomplex * result, int numresult, int lobin,

From 74d6de8ecc34e72bde53a2388ad26ba7eabeff48 Mon Sep 17 00:00:00 2001
From: jintaoluo <luojintao@cv-nat-a-56.cv.nrao.edu>
Date: Fri, 16 May 2014 18:10:22 -0400
Subject: [PATCH 2/5] Makefile & Howto on the GPUed PRESTO2

---
 Howto_GPU.md |  23 ++++++++++++
 src/Makefile | 103 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 Howto_GPU.md

diff --git a/Howto_GPU.md b/Howto_GPU.md
new file mode 100644
index 000000000..6ea4fb79c
--- /dev/null
+++ b/Howto_GPU.md
@@ -0,0 +1,23 @@
+#GPUed PRESTO
+
+The program of accelsearch in PRESTO was accelerated with CUDA by Jintao Luo. Any questions on how to compile and use the GPU routine may go to jluo@nrao.edu
+
+##How to compile
+
+###compile for CPU-only
+By default PRESTO will be compiled for CPU-only. This is controlled by line 22 in $PRESTO/src/Makefile. For CPU-only, this line should be as:
+>use_cuda = no# yes/no
+
+###compile with GPU routines
+Open the $PRESTO/src/Makefile
+
+1.	Make sure line 22 is:
+>use_cuda = yes# yes/no
+
+2.	Check the CUDA-related variables and flags, and modify them if necessary.
+
+##How to use
+To run accelsearch on GPU, use the -cuda option. Or the program will run on CPU. For example: 
+>accelsearch -numharm 16 -zmax 256 ur_data.dat -cuda 0
+
+0 means using the 1st GPU in your machine.
\ No newline at end of file
diff --git a/src/Makefile b/src/Makefile
index aad332b51..790fb6d2a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,6 @@
 #  Makefile for PRESTO
 #   by Scott M. Ransom
+#		GPU-related sections by Jintao Luo
 
 # OS type
 OS = Linux
@@ -8,13 +9,60 @@ OS = Linux
 # Linux is the first choice
 ifeq ($(OS),Linux)
 	LIBSUFFIX = .so
-	LIBCMD = -shared
+	LIBCMD = -shared	
 # else assume Darwin (i.e. OSX)
 else
 	LIBSUFFIX = .dylib
 	LIBCMD = -dynamiclib
 endif
 
+
+#------------------------------------------------------------------------------------------
+#determine if compile with CUDA
+use_cuda = no# yes/no
+
+ifeq ($(use_cuda),yes)
+$(info Compile with CUDA)
+#******************************************************************
+#                     for GUP version, CUDA 
+# CUDA compile option
+# CUDA code generation flags
+	GENCODE_SM10    := -gencode arch=compute_10,code=sm_10
+	GENCODE_SM10    := -gencode arch=compute_11,code=sm_11
+	GENCODE_SM20    := -gencode arch=compute_20,code=sm_20
+	GENCODE_SM30    := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
+	GENCODE_FLAGS   := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)
+# Location of the CUDA Toolkit binaries and libraries
+#modify it if necessary
+#CUDA_PATH       ?= /usr/local/cuda-5.0
+	CUDA_PATH       ?= /usr/local/cuda
+
+#modify it if necessary
+	CUDA_INC_PATH   ?= -I $(CUDA_PATH)/include -I. -I.. -I $(CUDA_PATH)/samples/common/inc/ -I$(PRESTO)/include
+
+	CUDA_BIN_PATH   ?= $(CUDA_PATH)/bin
+	NVCC            ?= $(CUDA_BIN_PATH)/nvcc
+
+#modify it if necessary, for example, if 32-bit machine, -m32
+	NVCCFLAGS 			?= -m64 -ccbin g++
+
+	EXTRA_NVCCFLAGS ?= -Xcompiler -fPIC -O2
+
+#modify it if necessary, -m64 to -m32 for 32-bit machine
+	LINKCOMMAND_CUDA = g++ -m64 -O2 -o	
+
+#modify it if necessary, maybe lib64 should be lib32?
+	CUDA_LINK_EXTRA_FLAGS = -L $(CUDA_PATH)/lib64 -lcufft  -lcudart
+#******************************************************************
+endif
+
+ifeq ($(use_cuda),no)
+$(info Compile without CUDA)
+endif
+#------------------------------------------------------------------------------------------
+
+
+
 # How to link with some needed libraries of PGPLOT
 X11LINK := $(shell pkg-config --libs x11)
 PNGLINK := $(shell pkg-config --libs libpng)
@@ -37,7 +85,14 @@ CFITSIOINC := $(shell pkg-config --cflags cfitsio)
 CFITSIOLINK := $(shell pkg-config --libs cfitsio)
 
 # The standard PRESTO libraries to link into executables
+#PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK)
+ifeq ($(use_cuda),no)
 PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK)
+endif
+
+ifeq ($(use_cuda),yes)
+PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK) $(CUDA_LINK_EXTRA_FLAGS) -L $(SYSDIR)/lib/x86_64-linux-gnu -lstdc++
+endif
 
 CC = gcc
 FC = gfortran
@@ -46,6 +101,11 @@ FC = gfortran
 CFLAGS = -I$(PRESTO)/include $(GLIBINC) $(CFITSIOINC) $(PGPLOTINC) $(FFTINC) \
 	-DUSEFFTW -DUSEMMAP -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 \
 	-g -O3 -ffast-math -Wall -W -fPIC
+
+ifeq ($(use_cuda),yes)
+	CFLAGS += -DUSECUDA
+endif
+	
 #	-g -Wall -W -fPIC
 CLINKFLAGS = $(CFLAGS)
 # NOTE:  Be careful of upping the optimization on the
@@ -69,6 +129,20 @@ CLIG = clig
 	mv ../clig/$*_cmd.c .
 	cp ../clig/$*.1 ../docs/
 
+ifeq ($(use_cuda),yes)
+PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\
+	characteristics.o cldj.o chkio.o corr_prep.o corr_routines.o\
+	correlations.o database.o dcdflib.o dispersion.o\
+	fastffts.o fftcalls.o fminbr.o fold.o fresnl.o ioinf.o\
+	get_candidates.o iomak.o ipmpar.o maximize_r.o maximize_rz.o\
+	median.o minifft.o misc_utils.o clipping.o\
+	orbint.o output.o read_fft.o responses.o\
+	rzinterp.o rzwinterp.o select.o sorter.o swapendian.o\
+	transpose.o twopass.o twopass_real_fwd.o\
+	twopass_real_inv.o vectors.o multifiles.o mask.o\
+	fitsfile.o hget.o hput.o imio.o djcl.o accel_utils_gpu.o
+endif
+ifeq ($(use_cuda),no)
 PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\
 	characteristics.o cldj.o chkio.o corr_prep.o corr_routines.o\
 	correlations.o database.o dcdflib.o dispersion.o\
@@ -79,7 +153,8 @@ PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\
 	rzinterp.o rzwinterp.o select.o sorter.o swapendian.o\
 	transpose.o twopass.o twopass_real_fwd.o\
 	twopass_real_inv.o vectors.o multifiles.o mask.o\
-	fitsfile.o hget.o hput.o imio.o djcl.o
+	fitsfile.o hget.o hput.o imio.o djcl.o 
+endif
 
 INSTRUMENTOBJS = backend_common.o zerodm.o sigproc_fb.o psrfits.o
 
@@ -112,6 +187,14 @@ indent:
 prep:
 	touch *_cmd.c
 
+ifeq ($(use_cuda),yes)
+accel_utils_gpu.o:	accel_utils_gpu.cu	  
+	$(NVCC) $(EXTRA_NVCCFLAGS) $(NVCCFLAGS) $(CUDA_INC_PATH)  $(GENCODE_FLAGS) -o $@ -c $<
+endif
+ifeq ($(use_cuda),no)
+$(info accel_utils_gpu.o will not be compiled)
+endif
+
 makewisdom:
 	$(CC) $(CLINKFLAGS) -o $@ makewisdom.c $(FFTLINK)
 	./makewisdom
@@ -138,6 +221,17 @@ libsla$(LIBSUFFIX):
 
 binaries: $(BINARIES)
 
+ifeq ($(use_cuda),yes)
+$(info link accelsearch with CUDA)
+accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o accel_utils_gpu.o 
+	$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o accel_utils_gpu.o $(PRESTOLINK) -lglib-2.0 -lm $(CUDA_LINK_EXTRA_FLAGS)
+endif
+ifeq ($(use_cuda),no)
+$(info link accelsearch without CUDA)
+accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o
+	$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm
+endif
+
 mpi: mpiprepsubband
 
 mpiprepsubband_utils.o: mpiprepsubband_utils.c
@@ -149,8 +243,8 @@ mpiprepsubband.o: mpiprepsubband.c
 mpiprepsubband: mpiprepsubband_cmd.c mpiprepsubband_cmd.o mpiprepsubband_utils.o mpiprepsubband.o $(INSTRUMENTOBJS)
 	mpicc $(CLINKFLAGS) -o $(PRESTO)/bin/$@ mpiprepsubband_cmd.o mpiprepsubband_utils.o mpiprepsubband.o $(INSTRUMENTOBJS) $(PRESTOLINK) -lcfitsio -lm
 
-accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o
-	$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm
+#accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o
+#	$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm
 
 bary: bary.o
 	$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ bary.o $(PRESTOLINK) -lm
@@ -305,3 +399,4 @@ squeaky:  cleaner
 	cd $(PRESTO)/docs ; rm -f *# *~
 	cd $(PRESTO)/python ; rm -f *# *~ *.o *.pyc *.pyo
 	cd $(PRESTO)/include ; rm -f *# *~
+

From 4ed25456ae83c13e55ff71b159cca9a60fab50d9 Mon Sep 17 00:00:00 2001
From: jintaoluo <luojintao@jluo-MacBook-Pro.local>
Date: Fri, 23 May 2014 14:40:52 -0400
Subject: [PATCH 3/5] head file for accelsearch_on_gpu

---
 include/accel_utils_gpu.h | 70 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 include/accel_utils_gpu.h

diff --git a/include/accel_utils_gpu.h b/include/accel_utils_gpu.h
new file mode 100644
index 000000000..2553ef421
--- /dev/null
+++ b/include/accel_utils_gpu.h
@@ -0,0 +1,70 @@
+/* ACCEL_USELEN must be less than 65536 since we  */
+/* use unsigned short ints to index our arrays... */
+/* #define ACCEL_USELEN 32160 */
+/* #define ACCEL_USELEN 16000 */
+#define ACCEL_USELEN 7560 
+#define ACCEL_NUMBETWEEN 2
+/* Stepsize in Fourier Freq */
+#define ACCEL_DR  0.5
+/* Reciprocal of ACCEL_DR */
+#define ACCEL_RDR 2
+/* Stepsize in Fourier F-dot */
+#define ACCEL_DZ  2
+/* Reciprocal of ACCEL_DZ */
+#define ACCEL_RDZ 0.5
+/* Closest candidates we will accept as independent */
+#define ACCEL_CLOSEST_R 15.0
+/* Padding for .dat file reading so that we don't SEGFAULT */
+#define ACCEL_PADDING 2000
+
+
+#ifndef DBLCORRECT
+#define DBLCORRECT    1e-14
+#endif
+
+#ifndef _ACCEL_UTILS_GPU_
+#define _ACCEL_UTILS_GPU_
+
+typedef struct accel_cand_gpu{
+	float pow;					/*pow of selected candidate*/
+	int		nof_cand;			/*number of candidates in sub_array/plane */
+	int		z_ind;				/*z_index of the selected candidate*/
+	int		r_ind;				/*r_index of the selected candidate*/
+}	accel_cand_gpu;
+
+#ifndef _FCOMPLEX_DECLARED_
+typedef struct fcomplex {
+    float r, i;
+} fcomplex;
+#define _FCOMPLEX_DECLARED_
+#endif				/* _FCOMPLEX_DECLARED_ */
+
+typedef struct kernel{
+  int z;               /* The fourier f-dot of the kernel */
+  int fftlen;          /* Number of complex points in the kernel */
+  int numgoodbins;     /* The number of good points you can get back */
+  int numbetween;      /* Fourier freq resolution (2=interbin) */
+  int kern_half_width; /* Half width (bins) of the raw kernel. */
+  fcomplex *data;      /* The FFTd kernel itself */
+} kernel;
+
+typedef struct subharminfo{
+  int numharm;       /* The number of sub-harmonics */
+  int harmnum;       /* The sub-harmonic number (fundamental = numharm) */
+  int zmax;          /* The maximum Fourier f-dot for this harmonic */
+  int numkern;       /* Number of kernels in the vector */
+  kernel *kern;      /* The kernels themselves */
+  unsigned short *rinds; /* Table of indices for Fourier Freqs */
+} subharminfo;
+
+
+/*  Constants used in the correlation/convolution routines */
+typedef enum {
+  CONV, CORR, INPLACE_CONV, INPLACE_CORR
+} presto_optype;
+
+typedef enum {
+  FFTDK, FFTD, FFTK, NOFFTS
+} presto_ffts;
+
+#endif

From 6a743bd8b420759c2044a1223f50a9182d63406d Mon Sep 17 00:00:00 2001
From: jintaoluo <luojintao@jluo-MacBook-Pro.local>
Date: Fri, 5 Sep 2014 15:42:37 -0400
Subject: [PATCH 4/5] fix python module presto

add stdc++ and cuda options to the libpresto linking
---
 src/Makefile      | 14 ++++++++++++--
 src/accelsearch.c |  8 +-------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 790fb6d2a..a75e5cde6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -19,7 +19,7 @@ endif
 
 #------------------------------------------------------------------------------------------
 #determine if compile with CUDA
-use_cuda = no# yes/no
+use_cuda = yes# yes/no
 
 ifeq ($(use_cuda),yes)
 $(info Compile with CUDA)
@@ -207,8 +207,18 @@ timetest:
 
 libpresto: libpresto$(LIBSUFFIX)
 
+ifeq ($(use_cuda),yes)
 libpresto$(LIBSUFFIX): $(PRESTOOBJS)
-	$(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK)
+	$(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) $(CUDA_LINK_EXTRA_FLAGS) -L $(SYSDIR)/lib/x86_64-linux-gnu -lstdc++
+endif
+
+ifeq ($(use_cuda),no)
+libpresto$(LIBSUFFIX): $(PRESTOOBJS)
+	$(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) 
+endif
+
+#libpresto$(LIBSUFFIX): $(PRESTOOBJS)	
+	#$(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK)
 
 slalib: libsla$(LIBSUFFIX)
 	cd slalib ; $(FC) -o sla_test sla_test.f -fno-second-underscore -L$(PRESTO)/lib -lsla
diff --git a/src/accelsearch.c b/src/accelsearch.c
index 4693bccfb..79cfd5897 100644
--- a/src/accelsearch.c
+++ b/src/accelsearch.c
@@ -196,11 +196,7 @@ if(cmd->cudaP == 1){/****************************************call GPU to run the
 //if( use_cuda_flag == 1 ){
 	
 	printf("\nTry to use a GPU to run the search\n");
-	
-	#ifdef USECUDA
-	printf( "\nUSECUDA\n" );
-	#endif
-	
+		
 	select_cuda_dev(cmd->cuda);
 	
 	//prepare d_data and d_result on GPU	
@@ -305,8 +301,6 @@ if(cmd->cudaP == 1){/****************************************call GPU to run the
 
 	 destroy_cuFFT_plans( subharminfs, obs.numharmstages ); //destroy cuFFT plans
 	 
-	 printf("\ndebug:.......................\n");
-	 
    //free_subharminfos(obs.numharmstages, subharminfs); //from presto1
    free_subharminfos(&obs, subharminfs); //from presto2
 

From d1bb087479e7e51c05ed067d650acc7c107fc831 Mon Sep 17 00:00:00 2001
From: jintaoluo <luojintao@jluo-MacBook-Pro.local>
Date: Thu, 25 Jun 2015 15:44:00 -0400
Subject: [PATCH 5/5] Apply a patch from Scott's code

Fixed a bug where we could potentially read beyond the end of the FFT
---
 src/accel_utils.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/accel_utils.c b/src/accel_utils.c
index eabf32e4e..4e0515e25 100644
--- a/src/accel_utils.c
+++ b/src/accel_utils.c
@@ -870,25 +870,35 @@ fcomplex *get_fourier_amplitudes(int lobin, int numbins, accelobs * obs)
 {
     if (obs->mmap_file || obs->dat_input) {
         fcomplex *tmpdata = gen_cvect(numbins);
-        int offset = 0;
+        size_t ii, offset = 0, firstbin, newnumbins;
+        fcomplex zeros = {0.0, 0.0};
+
         // zero-pad if we try to read before the beginning of the FFT
         if (lobin - obs->lobin < 0) {
-            fcomplex zeros = {0.0, 0.0};
-            int ii;
             offset = abs(lobin - obs->lobin);
             for (ii = 0 ; ii < offset ; ii++)
                 tmpdata[ii] = zeros;
-        }            
-        memcpy(tmpdata + offset, 
-               (fcomplex *) (obs->fft + (lobin - obs->lobin) + offset),
-               sizeof(fcomplex) * (numbins - offset));
+        }
+        firstbin = (lobin - obs->lobin) + offset;
+        newnumbins = numbins - offset;
+
+        // zero-pad if we try to read beyond the end of the FFT
+        if (firstbin + newnumbins > obs->numbins) {
+            size_t numpad = firstbin + newnumbins - obs->numbins;
+            newnumbins = newnumbins - numpad;
+            for (ii = numbins - numpad ; ii < numbins ; ii++)
+                tmpdata[ii] = zeros;
+        }
+
+        // Now grab the data we need
+        memcpy(tmpdata + offset, obs->fft + firstbin,
+               sizeof(fcomplex) * newnumbins);
         return tmpdata;
     } else {
         return read_fcomplex_file(obs->fftfile, lobin - obs->lobin, numbins);
     }
 }
 
-
 ffdotpows *subharm_ffdot_plane(int numharm, int harmnum,
                                double fullrlo, double fullrhi,
                                subharminfo * shi, accelobs * obs)