exclusiveScan.cu_inl


// exclusiveScan.cu_inl

// This is a shared-memory implementation of exclusive scan. Note that the
// exclusive scan you implemented in Part 1 uses slower *global* memory, and has
// overhead from performing multiple kernel launches.
// Because it uses shared memory, it must be run within a single thread block.


// REQUIREMENTS:
//  - Input array must have power-of-two length.
//  - Number of threads in the thread block must be the size of the array!
//  - SCAN_BLOCK_DIM is both the number of threads in the block (must be power of 2) 
//         and the number of elements that will be scanned. 
//          You should define this in your cudaRenderer.cu file, 
//          based on your implementation.
//  - The parameter sScratch should be a pointer to an array with 2*SCAN_BLOCK_DIM elements
//  - The 3 arrays should be in shared memory. 

// ================= USAGE (in cudaRenderer.cu) =====================

// at the top of the file:

// #define SCAN_BLOCK_DIM   BLOCKSIZE  // needed by sharedMemExclusiveScan implementation
// #include "exclusiveScan.cu_inl"

// ...

// in a kernel:

// If you're using 2D indices, compute a linear thread index as folows.
// NOTE: scan assumes that every 32 adjacent linear thread indices 
// (0-31, 32-63, ...) form a warp, which means they execute in lockstep.

// If you do linearThreadIndex = threadIdx.x * blockDim.x + threadIdx.y;
// you will get a linear thread index, but it won't be sorted into warps,
// which will break scan!

// int linearThreadIndex =  threadIdx.y * blockDim.y + threadIdx.x;

// __shared__ uint prefixSumInput[BLOCKSIZE];
// __shared__ uint prefixSumOutput[BLOCKSIZE];
// __shared__ uint prefixSumScratch[2 * BLOCKSIZE];
// sharedMemExclusiveScan(linearThreadIndex, prefixSumInput, prefixSumOutput, prefixSumScratch, BLOCKSIZE);


#define LOG2_WARP_SIZE 5U
#define WARP_SIZE (1U << LOG2_WARP_SIZE)

//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE
inline __device__ uint
warpScanInclusive(int threadIndex, uint idata, volatile uint *s_Data, uint size){
    // Note some of the calculations are obscure because they are optimized.
    // For example, (threadIndex & (size - 1)) computes threadIndex % size,
    // which works, assuming size is a power of 2.

    uint pos = 2 * threadIndex - (threadIndex & (size - 1));
    s_Data[pos] = 0;
    pos += size;
    s_Data[pos] = idata;

    for(uint offset = 1; offset < size; offset <<= 1)
        s_Data[pos] += s_Data[pos - offset];

    return s_Data[pos];
}

inline __device__ uint warpScanExclusive(int threadIndex, uint idata, volatile uint *sScratch, uint size){
    return warpScanInclusive(threadIndex, idata, sScratch, size) - idata;
}

__inline__ __device__ void
sharedMemExclusiveScan(int threadIndex, uint* sInput, uint* sOutput, volatile uint* sScratch, uint size)
{
    if (size > WARP_SIZE) {

        uint idata = sInput[threadIndex];

        //Bottom-level inclusive warp scan
        uint warpResult = warpScanInclusive(threadIndex, idata, sScratch, WARP_SIZE);

        // Save top elements of each warp for exclusive warp scan sync
        // to wait for warp scans to complete (because s_Data is being
        // overwritten)
        __syncthreads();

        if ( (threadIndex & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
            sScratch[threadIndex >> LOG2_WARP_SIZE] = warpResult;

        // wait for warp scans to complete
        __syncthreads();

        if ( threadIndex < (SCAN_BLOCK_DIM / WARP_SIZE)) {
            // grab top warp elements
            uint val = sScratch[threadIndex];
            // calculate exclusive scan and write back to shared memory
            sScratch[threadIndex] = warpScanExclusive(threadIndex, val, sScratch, size >> LOG2_WARP_SIZE);
        }

        //return updated warp scans with exclusive scan results
        __syncthreads();

        sOutput[threadIndex] = warpResult + sScratch[threadIndex >> LOG2_WARP_SIZE] - idata;

    } else if (threadIndex < WARP_SIZE) {
        uint idata = sInput[threadIndex];
        sOutput[threadIndex] = warpScanExclusive(threadIndex, idata, sScratch, size);
    }
}