-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresize.cu
207 lines (146 loc) · 5.94 KB
/
resize.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "EBMP/EasyBMP.h"
//Russian characters aren't displayed.Comments in English, sorry...
// 2D float texture
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;
using namespace std;
#define checkCudaErrors(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void saveImage(float* image, int height, int width, bool method) {
BMP Output;
Output.SetSize(width, height);
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
RGBApixel pixel;
pixel.Red = image[i * width + j];
pixel.Green = image[i * width + j];
pixel.Blue = image[i * width + j];
Output.SetPixel(j, i, pixel);
}
}
if (method)
Output.WriteToFile("GPUoutAngelina.bmp");
else
Output.WriteToFile("CPUoutAngelina.bmp");
}
//Unit square
//we choose a coordinate system in which the four points where f is known are(0, 0), (1, 0), (0, 1), and (1, 1)
// wiki
void medianFilterCPU(float* image, float* resault, int height, int width)
{
for (int j = 0; j < height-1; j++) {
for (int i = 0; i < width-1; i++) {
float f01 = image[j * width + i];
float f11 = image[j * width + i + 1];
float f00 = image[j * width + width + i];
float f10 = image[j * width + width + i + 1];
float n11 = f01 * 0.5 + f11 * 0.5;
float n00 = f00 * 0.5 + f01 * 0.5;
float n10 = f00 * 0.5 * 0.5 + f10 * 0.5 * 0.5 + f01 * 0.5 * 0.5 + f11 * 0.5 * 0.5;
resault[j* width * 4 + i * 2] = f01;
resault[j * width * 4 + i * 2 + 1] = n11;
resault[j * width * 4 + i * 2 + width * 2] = n00;
resault[j * width * 4 + i * 2 + width * 2 + 1] = n10;
}
}
}
// A good example to demonstrate the difference between a CPU and a GPU is because the algorithms are almost ideal.
//Unit square
//we choose a coordinate system in which the four points where f is known are(0, 0), (1, 0), (0, 1), and (1, 1)
// wiki
__global__ void myFilter(float* output, int imageWidth, int imageHeight) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
float f01 = tex2D(texRef, col, row);
float f11 = tex2D(texRef, col+1, row);
float f00 = tex2D(texRef, col, row+1);
float f10 = tex2D(texRef, col + 1, row+1);
float n11 = f01*0.5+f11*0.5;
float n00 = f00*0.5+f01*0.5;
float n10 = f00 * 0.5 * 0.5 + f10 * 0.5*0.5 + f01 * 0.5 * 0.5 + f11 * 0.5 * 0.5;
output[row * imageWidth * 4 + col*2] = f01;
output[row * imageWidth * 4 + col * 2 + 1] = n11;
output[row * imageWidth * 4 + col * 2 + imageWidth * 2] = n00;
output[row * imageWidth * 4 + col * 2 + imageWidth * 2 + 1] = n10;
}
int main(void)
{
int nIter = 100;
BMP Image;
Image.ReadFromFile("angelina4.bmp");
int height = Image.TellHeight();
int width = Image.TellWidth();
float* imageArray = (float*)calloc(height * width, sizeof(float));
float* outputCPU = (float*)calloc(height * width*4, sizeof(float));
float* outputGPU = (float*)calloc(height * width*4, sizeof(float));
float* outputDevice;
for (int j = 0; j < Image.TellHeight(); j++) {
for (int i = 0; i < Image.TellWidth(); i++) {
imageArray[j * width + i] = Image(i, j)->Red;
}
}
unsigned int start_time = clock();
for (int j = 0; j < nIter; j++) {
medianFilterCPU(imageArray, outputCPU, height, width);
}
unsigned int elapsedTime = clock() - start_time;
float msecPerMatrixMulCpu = elapsedTime / nIter;
cout << "CPU time: " << msecPerMatrixMulCpu << endl;
int device_count = 0;
cudaGetDeviceCount(&device_count);
if (device_count == 0)
cout << "Sorry! You dont have CudaDevice" << endl;
else {
cout << "CudaDevice found! Device count: " << device_count << endl;
// Allocate CUDA array in device memory
//Returns a channel descriptor with format f and number of bits of each component x, y, z, and w
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_arr;
checkCudaErrors(cudaMallocArray(&cu_arr, &channelDesc, width, height));
checkCudaErrors(cudaMemcpyToArray(cu_arr, 0, 0, imageArray, height * width * sizeof(float), cudaMemcpyHostToDevice)); // set texture parameters
texRef.addressMode[0] = cudaAddressModeClamp;
texRef.addressMode[1] = cudaAddressModeClamp;
texRef.filterMode = cudaFilterModePoint;
// Bind the array to the texture
cudaBindTextureToArray(texRef, cu_arr, channelDesc);
checkCudaErrors(cudaMalloc(&outputDevice, height * width * 4* sizeof(float)));
dim3 threadsPerBlock(16, 16);
dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
(height + threadsPerBlock.y - 1) / threadsPerBlock.y);
cudaEvent_t start;
cudaEvent_t stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// start record
checkCudaErrors(cudaEventRecord(start, 0));
for (int j = 0; j < nIter; j++) {
myFilter << <blocksPerGrid, threadsPerBlock >> > (outputDevice, width, height);
}
// stop record
checkCudaErrors(cudaEventRecord(stop, 0));
// wait end of event
checkCudaErrors(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
float msecPerMatrixMul = msecTotal / nIter;
cout << "GPU time: " << msecPerMatrixMul << endl;
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpy(outputGPU, outputDevice, height * width * 4 * sizeof(float), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
saveImage(outputGPU, height*2, width*2, true);
saveImage(outputCPU, height*2, width*2, false);
checkCudaErrors(cudaFreeArray(cu_arr));
checkCudaErrors(cudaFree(outputDevice));
}
return 0;
}