bodysystemcuda.cu
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <cutil_inline.h> // includes cuda.h and cuda_runtime_api.h
#include <math.h>
#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif
#include <cuda_gl_interop.h>
__constant__ float softeningSquared;
__constant__ double softeningSquared_fp64;
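// SharedMemory below is the usual SDK idiom for typed access to dynamically
// sized (extern __shared__) memory: the cast lives in one place, so the
// templated kernels do not each need their own differently typed extern
// __shared__ declaration, which the compiler would reject.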
template<class T>
struct SharedMemory
{
__device__ inline operator T*()
{
extern __shared__ int __smem[];
return (T*)__smem;
}
__device__ inline operator const T*() const
{
extern __shared__ int __smem[];
return (T*)__smem;
}
};
template <typename T> struct vec3 { typedef float Type; }; // dummy
template <> struct vec3<float> { typedef float3 Type; };
template <> struct vec3<double> { typedef double3 Type; };
template <typename T> struct vec4 { typedef float Type; }; // dummy
template <> struct vec4<float> { typedef float4 Type; };
template <> struct vec4<double> { typedef double4 Type; };
template<typename T>
__device__ T rsqrt_T(T x) { return rsqrt(x); }
template<>
__device__ float rsqrt_T<float>(float x) { return rsqrtf(x); }
// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]
// This macro is only used when multithreadBodies is true (below)
#define SX_SUM(i,j) sharedPos[i+blockDim.x*j]
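// The shared tile is laid out row-major as q rows of p positions: SX(i)
// indexes this thread's own row (threadIdx.y), while SX_SUM(i,j) indexes
// row j during the final per-body reduction when multithreadBodies is true.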
template <typename T>
__device__ T getSofteningSquared() { return softeningSquared; }
template <>
__device__ double getSofteningSquared<double>() { return softeningSquared_fp64; }
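// Host-side sketch (illustrative only; the host code that uploads these
// __constant__ symbols lives outside this listing, so the surrounding setup
// is an assumption):
//   float eps2 = 0.001f;
//   cudaMemcpyToSymbol(softeningSquared, &eps2, sizeof(float));
//   double eps2d = (double)eps2;
//   cudaMemcpyToSymbol(softeningSquared_fp64, &eps2d, sizeof(double));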
template <typename T>
struct DeviceData {
T* dPos[2]; // mapped host pointers
T* dVel;
cudaEvent_t event;
unsigned int offset;
unsigned int numBodies;
};
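// bodyBodyInteraction below accumulates the softened gravitational
// acceleration exerted on body i by body j:
//   a_i += m_j * r_ij / (|r_ij|^2 + eps^2)^(3/2)
// where r_ij = x_j - x_i, m_j is stored in bj.w, and eps^2 is the softening
// factor that keeps the force finite when two bodies nearly coincide.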
template <typename T>
__device__ typename vec3<T>::Type
bodyBodyInteraction(typename vec3<T>::Type ai,
typename vec4<T>::Type bi,
typename vec4<T>::Type bj)
{
typename vec3<T>::Type r;
// r_ij [3 FLOPS]
r.x = bj.x - bi.x;
r.y = bj.y - bi.y;
r.z = bj.z - bi.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
T distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
distSqr += getSofteningSquared<T>();
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
T invDist = rsqrt_T(distSqr);
T invDistCube = invDist * invDist * invDist;
// s = m_j * invDistCube [1 FLOP]
T s = bj.w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;
return ai;
}
// This is the "tile_calculation" function from the GPU Gems 3 article.
template <typename T>
__device__ typename vec3<T>::Type
gravitation(typename vec4<T>::Type iPos,
typename vec3<T>::Type accel)
{
typename vec4<T>::Type* sharedPos = SharedMemory<typename vec4<T>::Type>();
// The CUDA 1.1 compiler cannot determine that the index j below will not
// overflow in the loop. If int is used on 64-bit Linux or Windows (or long
// instead of long long on Win64), the compiler therefore generates suboptimal
// code, so we use unsigned long long on Win64 and unsigned long everywhere
// else. (Workaround for Bug ID 347697)
#ifdef _WIN64
unsigned long long j = 0;
#else
unsigned long j = 0;
#endif
// Here we unroll the loop to reduce bookkeeping instruction overhead
// 32x unrolling seems to provide best performance
// Note that having an unsigned int loop counter and an unsigned
// long index helps the compiler generate efficient code on 64-bit
// OSes. The compiler can't assume the 64-bit index won't overflow
// so it incurs extra integer operations. This is a standard issue
// in porting 32-bit code to 64-bit OSes.
#pragma unroll 32
for (unsigned int counter = 0; counter < blockDim.x; counter++ )
{
accel = bodyBodyInteraction<T>(accel, iPos, SX(j++));
}
return accel;
}
// WRAP is used to force each block to start working on a different
// chunk (and wrap around back to the beginning of the array) so that
// not all multiprocessors try to read the same memory locations at
// once.
#define WRAP(x,m) (((x)<m)?(x):(x-m)) // Modulo without a divide; valid for 0 <= x < 2m
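// Example: with m = gridDim.x = 4, WRAP(3,4) == 3 and WRAP(5,4) == 1,
// matching x % m for any x in [0, 2m) without an integer divide.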
template <typename T, bool multithreadBodies>
__device__ typename vec3<T>::Type
computeBodyAccel(typename vec4<T>::Type bodyPos,
typename vec4<T>::Type* positions,
int numBodies)
{
typename vec4<T>::Type* sharedPos = SharedMemory<typename vec4<T>::Type>();
typename vec3<T>::Type acc = {0.0f, 0.0f, 0.0f};
int p = blockDim.x;
int q = blockDim.y;
int n = numBodies;
int numTiles = n / (p * q);
for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++)
{
sharedPos[threadIdx.x+blockDim.x*threadIdx.y] =
multithreadBodies ?
positions[WRAP(blockIdx.x + q * tile + threadIdx.y, gridDim.x) * p + threadIdx.x] :
positions[WRAP(blockIdx.x + tile, gridDim.x) * p + threadIdx.x];
__syncthreads();
// Compute all interactions within this tile (the "tile_calculation" step from the GPU Gems 3 article).
acc = gravitation<T>(bodyPos, acc);
__syncthreads();
}
// When the numBodies / thread block size is < # multiprocessors (16 on G80), the GPU is
// underutilized. For example, with 256 threads per block and 1024 bodies, there will only
// be 4 thread blocks, so the GPU will only be 25% utilized. To improve this, we use multiple
// threads per body. We still can use blocks of 256 threads, but they are arranged in q rows
// of p threads each. Each thread processes 1/q of the forces that affect each body, and then
// 1/q of the threads (those with threadIdx.y==0) add up the partial sums from the other
// threads for that body. To enable this, use the "--p=" and "--q=" command line options to
// this example. e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4 threads per body and 256
// threads per block. There will be n/p = 16 blocks, so a G80 GPU will be 100% utilized.
// We use a bool template parameter to specify when the number of threads per body is greater
// than one, so that the simpler code path can be used when it is not.
if (multithreadBodies)
{
SX_SUM(threadIdx.x, threadIdx.y).x = acc.x;
SX_SUM(threadIdx.x, threadIdx.y).y = acc.y;
SX_SUM(threadIdx.x, threadIdx.y).z = acc.z;
__syncthreads();
// Thread row 0 accumulates the partial sums computed by the other rows for this body
if (threadIdx.y == 0)
{
for (int i = 1; i < blockDim.y; i++)
{
acc.x += SX_SUM(threadIdx.x,i).x;
acc.y += SX_SUM(threadIdx.x,i).y;
acc.z += SX_SUM(threadIdx.x,i).z;
}
}
}
return acc;
}
template<typename T, bool multithreadBodies>
__global__ void
integrateBodies(typename vec4<T>::Type* newPos,
typename vec4<T>::Type* oldPos,
typename vec4<T>::Type* vel,
unsigned int deviceOffset, unsigned int deviceNumBodies,
float deltaTime, float damping, int totalNumBodies)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= deviceNumBodies)
return;
typename vec4<T>::Type position = oldPos[deviceOffset + index];
typename vec3<T>::Type accel = computeBodyAccel<T, multithreadBodies>(position, oldPos, totalNumBodies);
if (!multithreadBodies || (threadIdx.y == 0))
{
// acceleration = force / mass;
// new velocity = old velocity + acceleration * deltaTime
// note we factor out the body's mass from the equation, here and in bodyBodyInteraction
// (because they cancel out). Thus here force == acceleration
typename vec4<T>::Type velocity = vel[deviceOffset + index];
velocity.x += accel.x * deltaTime;
velocity.y += accel.y * deltaTime;
velocity.z += accel.z * deltaTime;
velocity.x *= damping;
velocity.y *= damping;
velocity.z *= damping;
// new position = old position + velocity * deltaTime
position.x += velocity.x * deltaTime;
position.y += velocity.y * deltaTime;
position.z += velocity.z * deltaTime;
// store new position and velocity
newPos[deviceOffset + index] = position;
vel[deviceOffset + index] = velocity;
}
}
template <typename T>
void integrateNbodySystem(DeviceData<T> *deviceData,
cudaGraphicsResource **pgres,
unsigned int currentRead,
float deltaTime,
float damping,
unsigned int numBodies,
unsigned int numDevices,
int p,
int q,
bool bUsePBO)
{
if (bUsePBO)
{
cutilSafeCall(cudaGraphicsResourceSetMapFlags(pgres[currentRead], cudaGraphicsMapFlagsReadOnly));
cutilSafeCall(cudaGraphicsResourceSetMapFlags(pgres[1-currentRead], cudaGraphicsMapFlagsWriteDiscard));
cutilSafeCall(cudaGraphicsMapResources(2, pgres, 0));
size_t bytes;
cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&(deviceData[0].dPos[currentRead]), &bytes, pgres[currentRead]));
cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&(deviceData[0].dPos[1-currentRead]), &bytes, pgres[1-currentRead]));
}
cudaDeviceProp props;
for (unsigned int dev = 0; dev != numDevices; dev++)
{
if (numDevices > 1)
cudaSetDevice(dev);
cutilSafeCall(cudaGetDeviceProperties(&props, dev));
while ((deviceData[dev].numBodies > 0) && p > 1 &&
(deviceData[dev].numBodies / p < (unsigned)props.multiProcessorCount))
{
p /= 2;
q *= 2;
}
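// Illustrative arithmetic: with 1024 bodies, p = 256, q = 1, and 16 SMs,
// 1024/256 = 4 < 16, so the loop halves p twice, ending at p = 64, q = 4;
// the grid below then gets 1024/64 = 16 blocks, one per SM.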
dim3 threads(p,q,1);
dim3 grid((deviceData[dev].numBodies + (p-1))/p, 1, 1);
// execute the kernel:
// When the numBodies / thread block size is < # multiprocessors
// (16 on G80), the GPU is underutilized. For example, with 256 threads per
// block and 1024 bodies, there will only be 4 thread blocks, so the
// GPU will only be 25% utilized. To improve this, we use multiple threads
// per body. We still can use blocks of 256 threads, but they are arranged
// in q rows of p threads each. Each thread processes 1/q of the forces
// that affect each body, and then 1/q of the threads (those with
// threadIdx.y==0) add up the partial sums from the other threads for that
// body. To enable this, use the "--p=" and "--q=" command line options to
// this example. e.g.: "nbody.exe --n=1024 --p=64 --q=4" will use 4
// threads per body and 256 threads per block. There will be n/p = 16
// blocks, so a G80 GPU will be 100% utilized.
// We use a bool template parameter to specify when the number of threads
// per body is greater than one, so that the simpler code path can be used
// when it is not.
int sharedMemSize = p * q * 4 * sizeof(T); // one vec4<T> position (4 components of T) per thread
if (grid.x > 0 && threads.y == 1)
{
integrateBodies<T, false><<< grid, threads, sharedMemSize >>>
((typename vec4<T>::Type*)deviceData[dev].dPos[1-currentRead],
(typename vec4<T>::Type*)deviceData[dev].dPos[currentRead],
(typename vec4<T>::Type*)deviceData[dev].dVel,
deviceData[dev].offset, deviceData[dev].numBodies,
deltaTime, damping, numBodies);
}
else if (grid.x > 0)
{
integrateBodies<T, true><<< grid, threads, sharedMemSize >>>
((typename vec4<T>::Type*)deviceData[dev].dPos[1-currentRead],
(typename vec4<T>::Type*)deviceData[dev].dPos[currentRead],
(typename vec4<T>::Type*)deviceData[dev].dVel,
deviceData[dev].offset, deviceData[dev].numBodies,
deltaTime, damping, numBodies);
}
if (numDevices > 1)
{
cutilSafeCall(cudaEventRecord(deviceData[dev].event));
// MJH: hack needed on older driver versions to force queued kernel launches to flush
cudaStreamQuery(0);
}
// check if kernel invocation generated an error
cutilCheckMsg("Kernel execution failed");
}
if (numDevices > 1)
{
for (unsigned int dev = 0; dev < numDevices; dev++)
cutilSafeCall(cudaEventSynchronize(deviceData[dev].event));
}
if (bUsePBO)
{
cutilSafeCall(cudaGraphicsUnmapResources(2, pgres, 0));
}
}
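// Illustrative host-side call (the surrounding setup, i.e. device buffers,
// registered graphics resources, and the p/q block shape, comes from the
// caller and is not shown in this file; the values below are assumptions):
//   DeviceData<float> devData[1];
//   // ... allocate devData[0].dPos[0..1] and devData[0].dVel,
//   //     set devData[0].offset and devData[0].numBodies ...
//   integrateNbodySystem<float>(devData, /*pgres=*/0, /*currentRead=*/0,
//                               /*deltaTime=*/0.016f, /*damping=*/1.0f,
//                               /*numBodies=*/devData[0].numBodies,
//                               /*numDevices=*/1, /*p=*/256, /*q=*/1,
//                               /*bUsePBO=*/false);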
// Explicit instantiations needed to generate code
template void integrateNbodySystem<float>(DeviceData<float>* deviceData,
cudaGraphicsResource** pgres,
unsigned int currentRead,
float deltaTime,
float damping,
unsigned int numBodies,
unsigned int numDevices,
int p, int q,
bool bUsePBO);
template void integrateNbodySystem<double>(DeviceData<double>* deviceData,
cudaGraphicsResource** pgres,
unsigned int currentRead,
float deltaTime,
float damping,
unsigned int numBodies,
unsigned int numDevices,
int p, int q,
bool bUsePBO);