// allGenes.cu

/*
* getSSSim for the whole dataset 
* Author: Hirak Kashyap
* Date: 01/24/2016
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include <cuda_runtime.h>

#define nrows 5000 // Number of genes (rows of the input matrix); change to match the dataset
#define ncols 32// Number of samples (columns per gene); change to match the dataset

#define sBuffer 8192//1024 // Size in bytes of the buffer that holds one CSV line while it is parsed

// Host copy of the input matrix, filled by populateArrays(): one row per gene.
float h_data[nrows][ncols];
// Pairwise scores copied back from the device in main().
float h_score[nrows][nrows];
// Pairwise scores computed on the CPU by print_cal(); this is the matrix saveResults() writes.
float c_score[nrows][nrows];

/**
 * CUDA kernel: getSSSim score for every ordered pair of genes.
 *
 * Launch layout: the global x index selects the "x" gene and the global y
 * index selects the "y" gene, so the x and y extents of the launch must cover
 * at least nrows threads each. Threads outside [0, nrows) exit immediately,
 * so the grid may be rounded up. Only the z == 0 layer does any work; extra
 * z threads (the historical launch uses blockDim.z == ncols) return at once
 * instead of recomputing and rewriting the same score.
 *
 * No shared memory is used and the kernel contains no barriers.
 *
 * data  : device pointer to nrows rows of ncols floats, row-major.
 * score : device pointer to nrows*nrows floats, row-major; element
 *         [y*nrows + x] receives the score of the pair (gene x, gene y).
 *         The value is NaN (0/0) when no column passes the 0.00001
 *         threshold below.
 *
 * Requires ncols >= 3 (columns ncols-3 .. ncols-2 of the slope profile are
 * read unconditionally).
 */
__global__ void getSSSim(const float *data, float *score)
{
	// Every z layer would produce the identical result; let one layer do it.
	if (threadIdx.z != 0 || blockIdx.z != 0)
		return;

	const int tid_x = blockIdx.x*blockDim.x + threadIdx.x;
	const int tid_y = blockIdx.y*blockDim.y + threadIdx.y;

	// Guard the grid tail so a rounded-up launch never touches memory
	// outside the data/score matrices.
	if (tid_x >= nrows || tid_y >= nrows)
		return;

	// Rows of the two genes being compared. The row stride is ncols, taken
	// from the data layout rather than from the launch configuration.
	const float *gx_d = data + (size_t)tid_x * ncols;
	const float *gy_d = data + (size_t)tid_y * ncols;

	float score_s = 0;
	float score_l[ncols-1];
	float gx[ncols-1];
	float gy[ncols-1];

	// Each profile is normalised by its own first step. A zero first step
	// yields inf/NaN slopes, exactly as in the CPU reference.
	const float gx_2min1 = gx_d[1] - gx_d[0];
	const float gy_2min1 = gy_d[1] - gy_d[0];

	//0-(ncols-2): normalised consecutive differences
	for(int i=0; i<ncols-1; i++)
	{
		gx[i] = (gx_d[i+1] - gx_d[i])/gx_2min1;
		gy[i] = (gy_d[i+1] - gy_d[i])/gy_2min1;
	}

	//0 and (ncols-2) columns - two-point neighbourhood at the borders
	score_l[0]=(gx[0]+gx[1]+gy[0]+gy[1])/4;
	score_l[ncols-2]=(gx[ncols-3]+gx[ncols-2]+gy[ncols-3]+gy[ncols-2])/4;

	//1 to (ncols-3) columns - three-point neighbourhood in the interior
	for(int i=1; i<ncols-2; i++)
	{
		score_l[i]=(gx[i-1]+gx[i]+gx[i+1]+gy[i-1]+gy[i]+gy[i+1])/6;
	}

	int n_diff=0;
	for(int i=0; i<ncols-1; i++)
	{
		// Local scale: twice the larger deviation from the neighbourhood mean.
		score_l[i] = 2 * fmaxf(fabsf(score_l[i]-gx[i]),fabsf(score_l[i]-gy[i]));
		// The double-precision literal is kept on purpose so the threshold
		// matches the host-side reference computation bit for bit.
		if (score_l[i]>=0.00001)
		{
			n_diff++;
			score_s = score_s + fabsf(gx[i]-gy[i])/score_l[i];
		}
	}

	// Row stride of the output is nrows, independent of how the grid was sized.
	score[(size_t)tid_y * nrows + tid_x] = 1 - (score_s/n_diff);
}

/*
 * Loads the input matrix into h_data.
 *
 * Reads the first nrows lines of "miRNA-target-interaction.csv" from the
 * current directory, splits each line on ',' and converts the first ncols
 * fields with atof(). Every parsed value is echoed to stdout, one row per
 * line, tab separated.
 *
 * The process is terminated with EXIT_FAILURE when the file cannot be
 * opened, ends before nrows lines were read, or a line has fewer than ncols
 * fields. Lines longer than sBuffer-1 characters are not supported.
 */
void populateArrays(){
    char buf[sBuffer];
    FILE *fp = fopen("miRNA-target-interaction.csv", "r");
    if (fp == NULL)
    {
        fprintf(stderr, "Failed to open input file miRNA-target-interaction.csv!\n");
        exit(EXIT_FAILURE);
    }

    for(int i=0;i<nrows;i++)
    {
	printf("\n");
	if (fgets(buf, sizeof(buf), fp) == NULL)
	{
		// Without this check the previous row's buffer would be parsed again.
		fprintf(stderr, "Input file ended or could not be read at row %d (expected %d rows)!\n", i, nrows);
		fclose(fp);
		exit(EXIT_FAILURE);
	}

	for(int j=0;j<ncols;j++)
	{
		// strtok keeps its position in buf between calls; only the first
		// call of a row passes the buffer itself.
		char *tok = (j == 0) ? strtok(buf, ",") : strtok(NULL, ",");
		if (tok == NULL)
		{
			fprintf(stderr, "Row %d has only %d fields (expected %d)!\n", i, j, ncols);
			fclose(fp);
			exit(EXIT_FAILURE);
		}
		h_data[i][j] = atof(tok);
		printf("%f\t",h_data[i][j]);
	}
	printf("\n");
    }

    fclose(fp);
}

// Fills slope[0 .. ncols-2] with the consecutive differences of one data row,
// each divided by the row's first difference (row[1] - row[0]).
static void cpuNormalizedSlopes(const float *row, float *slope)
{
	const float first_step = row[1] - row[0];
	for(int k=0; k<ncols-1; k++)
	{
		slope[k] = (row[k+1] - row[k])/first_step;
	}
}

// Combines two normalised slope profiles (ncols-1 entries each) into a single
// getSSSim score. Returns NaN (0/0) when no column reaches the 0.00001 threshold.
static float cpuPairScore(const float *gx, const float *gy)
{
	float score_l[ncols-1];
	float score_s = 0;
	int n_diff = 0;

	//0 and (ncols-2): two-point neighbourhood at the borders
	score_l[0]=(gx[0]+gx[1]+gy[0]+gy[1])/4;
	score_l[ncols-2]=(gx[ncols-3]+gx[ncols-2]+gy[ncols-3]+gy[ncols-2])/4;

	//1-(ncols-3): three-point neighbourhood in the interior
	for(int k=1; k<ncols-2; k++)
	{
		score_l[k]=(gx[k-1]+gx[k]+gx[k+1]+gy[k-1]+gy[k]+gy[k+1])/6;
	}

	for(int k=0; k<ncols-1; k++)
	{
		score_l[k] = 2 * fmaxf(fabsf(score_l[k]-gx[k]),fabsf(score_l[k]-gy[k]));
		if (score_l[k]>=0.00001)
		{
			n_diff++;
			score_s = score_s + fabsf(gx[k]-gy[k])/score_l[k];
		}
	}
	return 1 - (score_s/n_diff);
}

// Computes the getSSSim score of every gene pair on the CPU into c_score and
// prints the largest absolute difference to the GPU result held in h_score.
// c_score[i][j] pairs row j (as "x") with row i (as "y"), which is the same
// orientation the kernel uses for score[y][x]. Pairs for which either score
// is NaN do not contribute to the reported maximum (NaN comparisons are false).
void print_cal()
{
	float maxdiff = 0;
	float gx[ncols-1], gy[ncols-1];

	for(int i=0; i< nrows; i++)
	{
		// The "y" profile depends only on i, so compute it once per row.
		cpuNormalizedSlopes(h_data[i], gy);

		for(int j=0; j<nrows; j++)
		{
			cpuNormalizedSlopes(h_data[j], gx);
			c_score[i][j] = cpuPairScore(gx, gy);

			// fabsf, not abs: abs may resolve to the integer overload and
			// truncate every difference below 1.0 to zero.
			const float diff = fabsf(c_score[i][j] - h_score[i][j]);
			if (diff > maxdiff)
				maxdiff = diff;
		}
	}
	printf("\nMax diff %f", maxdiff);
}

// Writes the CPU score matrix c_score to "results_miRNA-target-interaction.csv"
// in the current directory: nrows lines of nrows comma-separated "%f" values.
// NaN scores are written as 0. The process is terminated with EXIT_FAILURE
// when the file cannot be opened or the final flush/close fails.
void saveResults()
{
	FILE *fp = fopen("results_miRNA-target-interaction.csv","w");
	if (fp == NULL)
	{
		fprintf(stderr, "Failed to open results_miRNA-target-interaction.csv for writing!\n");
		exit(EXIT_FAILURE);
	}

	for(int i=0; i<nrows; i++)
	{
		for(int j=0; j<nrows; j++)
		{
			float score = c_score[i][j];
			if(isnan(score)){
				score = 0.0f;
			}
			// A comma follows every value except the last one of a row,
			// which ends the line instead.
			fprintf(fp, "%f%c", score, (j == nrows-1) ? '\n' : ',');
		}
	}

	// Buffered write errors only surface here.
	if (fclose(fp) != 0)
	{
		fprintf(stderr, "Failed to finish writing results_miRNA-target-interaction.csv!\n");
		exit(EXIT_FAILURE);
	}
}

/**
 * Terminates the process with EXIT_FAILURE when a CUDA runtime call failed,
 * after printing "<what> (error code <cuda error string>)!" to stderr.
 * Does nothing when err is cudaSuccess.
 */
static void exitOnCudaError(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine.
 *
 * 1. Loads the input matrix (populateArrays).
 * 2. Allocates device buffers, uploads the data, runs getSSSim and downloads
 *    the score matrix into h_score; the wall-clock time of this whole step
 *    (allocation and copies included) is reported as the GPU time.
 * 3. Recomputes the scores on the CPU and prints the maximum difference
 *    (print_cal), timed separately.
 * 4. Writes the results file (saveResults) and releases the device.
 *
 * Returns 0 on success; any CUDA failure terminates with EXIT_FAILURE.
 */
int
main(void)
{
    printf("[ getSSSim of all genes ]");

    timeval t0, t1, t0_cpu, t1_cpu;

    populateArrays();

    //starts actual execution
    gettimeofday(&t0, 0);

    // Sizes are computed in size_t so large nrows cannot overflow int.
    const size_t data_bytes = (size_t)nrows * ncols * sizeof(float);
    const size_t score_bytes = (size_t)nrows * nrows * sizeof(float);

    // Allocate the device input data matrix
    float *d_data = NULL;
    exitOnCudaError(cudaMalloc((void **)&d_data, data_bytes),
                    "Failed to allocate device vector d_data");

    // Allocate the device output score matrix
    float *d_score = NULL;
    exitOnCudaError(cudaMalloc((void **)&d_score, score_bytes),
                    "Failed to allocate device vector d_score");

    // Copy input data from the host memory to the CUDA device
    exitOnCudaError(cudaMemcpy(d_data, h_data, data_bytes, cudaMemcpyHostToDevice),
                    "Failed to copy data matrix from host to device");

    // Launch configuration: 4x4 gene pairs per block. The z extent is kept
    // at ncols to preserve the launch layout getSSSim has always been
    // started with in this program.
    dim3 blockSize;
    blockSize.x = 4;
    blockSize.y = 4;
    blockSize.z = ncols;

    // The grid is sized by exact division, so a gene count that is not a
    // multiple of the tile would silently leave pairs uncomputed. Refuse it.
    if (nrows % blockSize.x != 0 || nrows % blockSize.y != 0)
    {
        fprintf(stderr, "nrows (%d) must be a multiple of the %ux%u thread tile!\n",
                nrows, blockSize.x, blockSize.y);
        exit(EXIT_FAILURE);
    }

    dim3 gridSize;
    gridSize.x = nrows/blockSize.x;
    gridSize.y = nrows/blockSize.y;

    getSSSim<<<gridSize, blockSize>>>(d_data, d_score);
    // Launch-configuration errors are reported immediately ...
    exitOnCudaError(cudaGetLastError(), "Failed to launch getSSSim kernel");
    // ... faults during execution only after the kernel has finished.
    exitOnCudaError(cudaDeviceSynchronize(), "Failed to execute getSSSim kernel");

    // Copy the device result matrix back into host memory.
    exitOnCudaError(cudaMemcpy(h_score, d_score, score_bytes, cudaMemcpyDeviceToHost),
                    "Failed to copy score matrix from device to host");

    gettimeofday(&t1, 0);
    long long elapsed = (t1.tv_sec-t0.tv_sec)*1000000LL + t1.tv_usec-t0.tv_usec;
    printf("\nTime elapsed GPU (microsecond):%lld\n", elapsed);

    printf("\nCPU\n");
    gettimeofday(&t0_cpu, 0);
    print_cal();
    gettimeofday(&t1_cpu, 0);

    long long elapsed_cpu = (t1_cpu.tv_sec-t0_cpu.tv_sec)*1000000LL + t1_cpu.tv_usec-t0_cpu.tv_usec;
    printf("\nTime elapsed CPU (microsecond):%lld\n", elapsed_cpu);

    //Save results into a csv file
    saveResults();

    // Free device global memory
    exitOnCudaError(cudaFree(d_data), "Failed to free device vector data");
    exitOnCudaError(cudaFree(d_score), "Failed to free device vector score");

    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaError_t err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}