-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexclusiveScan.cu_inl
110 lines (80 loc) · 4.06 KB
/
exclusiveScan.cu_inl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
// exclusiveScan.cu_inl
// This is a shared-memory implementation of exclusive scan. Note that the
// exclusive scan you implemented in Part 1 uses slower *global* memory, and has
// overhead from performing multiple kernel launches.
// Because it uses shared memory, it must be run within a single thread block.
// REQUIREMENTS:
// - Input array must have power-of-two length.
// - Number of threads in the thread block must be the size of the array!
// - SCAN_BLOCK_DIM is both the number of threads in the block (must be power of 2)
// and the number of elements that will be scanned.
// You should define this in your cudaRenderer.cu file,
// based on your implementation.
// - The parameter sScratch should be a pointer to an array with 2*SCAN_BLOCK_DIM elements
// - The 3 arrays should be in shared memory.
// ================= USAGE (in cudaRenderer.cu) =====================
// at the top of the file:
// #define SCAN_BLOCK_DIM BLOCKSIZE // needed by sharedMemExclusiveScan implementation
// #include "exclusiveScan.cu_inl"
// ...
// in a kernel:
// If you're using 2D indices, compute a linear thread index as folows.
// NOTE: scan assumes that every 32 adjacent linear thread indices
// (0-31, 32-63, ...) form a warp, which means they execute in lockstep.
// If you do linearThreadIndex = threadIdx.x * blockDim.x + threadIdx.y;
// you will get a linear thread index, but it won't be sorted into warps,
// which will break scan!
// int linearThreadIndex = threadIdx.y * blockDim.y + threadIdx.x;
// __shared__ uint prefixSumInput[BLOCKSIZE];
// __shared__ uint prefixSumOutput[BLOCKSIZE];
// __shared__ uint prefixSumScratch[2 * BLOCKSIZE];
// sharedMemExclusiveScan(linearThreadIndex, prefixSumInput, prefixSumOutput, prefixSumScratch, BLOCKSIZE);
#define LOG2_WARP_SIZE 5U
#define WARP_SIZE (1U << LOG2_WARP_SIZE)
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE
inline __device__ uint
warpScanInclusive(int threadIndex, uint idata, volatile uint *s_Data, uint size){
// Note some of the calculations are obscure because they are optimized.
// For example, (threadIndex & (size - 1)) computes threadIndex % size,
// which works, assuming size is a power of 2.
uint pos = 2 * threadIndex - (threadIndex & (size - 1));
s_Data[pos] = 0;
pos += size;
s_Data[pos] = idata;
for(uint offset = 1; offset < size; offset <<= 1)
s_Data[pos] += s_Data[pos - offset];
return s_Data[pos];
}
inline __device__ uint warpScanExclusive(int threadIndex, uint idata, volatile uint *sScratch, uint size){
return warpScanInclusive(threadIndex, idata, sScratch, size) - idata;
}
__inline__ __device__ void
sharedMemExclusiveScan(int threadIndex, uint* sInput, uint* sOutput, volatile uint* sScratch, uint size)
{
if (size > WARP_SIZE) {
uint idata = sInput[threadIndex];
//Bottom-level inclusive warp scan
uint warpResult = warpScanInclusive(threadIndex, idata, sScratch, WARP_SIZE);
// Save top elements of each warp for exclusive warp scan sync
// to wait for warp scans to complete (because s_Data is being
// overwritten)
__syncthreads();
if ( (threadIndex & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
sScratch[threadIndex >> LOG2_WARP_SIZE] = warpResult;
// wait for warp scans to complete
__syncthreads();
if ( threadIndex < (SCAN_BLOCK_DIM / WARP_SIZE)) {
// grab top warp elements
uint val = sScratch[threadIndex];
// calculate exclusive scan and write back to shared memory
sScratch[threadIndex] = warpScanExclusive(threadIndex, val, sScratch, size >> LOG2_WARP_SIZE);
}
//return updated warp scans with exclusive scan results
__syncthreads();
sOutput[threadIndex] = warpResult + sScratch[threadIndex >> LOG2_WARP_SIZE] - idata;
} else if (threadIndex < WARP_SIZE) {
uint idata = sInput[threadIndex];
sOutput[threadIndex] = warpScanExclusive(threadIndex, idata, sScratch, size);
}
}