diff --git a/HelloSP.cu b/HelloSP.cu index 7deb327..a09d2b5 100644 --- a/HelloSP.cu +++ b/HelloSP.cu @@ -155,11 +155,11 @@ void printErrorMessage(cudaError_t error, int memorySize){ int main(int argc, const char * argv[]) { - const UInt SP_SIZE = 1024; - const UInt IN_SIZE = 2048; + const UInt SP_SIZE = 1048576; + const UInt IN_SIZE = 2097152; const UInt BLOCK_SIZE = 64; // Two warps - const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE; // 16 - const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // 128 Size of chunk of input processed by a single cuda block + const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE; + const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block const UInt MAX_CONNECTED = 16; const Real IN_DENSITY = 0.5; // Density of input connections srand(time(NULL)); @@ -169,7 +169,7 @@ int main(int argc, const char * argv[]) // TODO: Compute the mem. requirements, allocate proper amount // input chunk, overlaps, connected, boost factors // size_t sm = (IN_BLOCK_SIZE)*sizeof(bool) + (2*BLOCK_SIZE)*sizeof(UInt) + (BLOCK_SIZE)*sizeof(Real); - size_t sm = BLOCK_SIZE*(sizeof(bool) + sizeof(UInt)); + size_t sm = BLOCK_SIZE*sizeof(UInt); // construct input args args ar; @@ -213,7 +213,7 @@ int main(int argc, const char * argv[]) BLOCK_SIZE, IN_BLOCK_SIZE); generate01(in_host, IN_SIZE, IN_DENSITY); - visualize_input(in_host, potentialPools, permanences, numPotential, IN_SIZE, SP_SIZE, IN_BLOCK_SIZE, MAX_CONNECTED); + // visualize_input(in_host, potentialPools, permanences, numPotential, IN_SIZE, SP_SIZE, IN_BLOCK_SIZE, MAX_CONNECTED); // Global memory pointers args* ar_dev; @@ -230,6 +230,7 @@ int main(int argc, const char * argv[]) result = cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); result = cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); result = cudaMalloc((void **) &ar.boosts_dev, 
MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); + result = cudaMalloc((void **) &ar.avg_act_dev, SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); // Memcpy to device result = cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0); @@ -240,7 +241,26 @@ int main(int argc, const char * argv[]) result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0); // Kernel call - compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev); + cudaStream_t stream1, stream2, stream3; //, stream4; + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&stream3, cudaStreamNonBlocking); // cudaStreamCreateWithFlags(&stream4, cudaStreamNonBlocking); + + // calculateOverlap<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); + // inhibitColumns<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(olaps_sh, ar.cols_dev, ar.localAreaDensity); + overlap_inhibit<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev); // NOTE(review): launch configs reconstructed after angle-bracket stripping; grid/block dims follow SP_SIZE = NUM_BLOCKS*BLOCK_SIZE — verify stream assignments below + + cudaDeviceSynchronize(); + + adaptSynapses<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, ar.cols_dev, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); + + averageActivity<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.cols_dev, ar.avg_act_dev); + + updateDutyCycles<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream3>>>(ar.odc_dev, ar.adc_dev, ar.olaps_dev, ar.cols_dev, ar.iteration_num, ar.dutyCyclePeriod); + + cudaDeviceSynchronize(); + + updateBoosts<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.adc_dev, ar.boosts_dev, ar.avg_act_dev, ar.boostStrength); + + bumpUpColumnsWithWeakOdc<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED); // Memcpy from device result = cudaMemcpy(cols_host, ar.cols_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0); diff --git a/SpatialPooler.cu b/SpatialPooler.cu index 4c5ada4..561a717 100644 --- a/SpatialPooler.cu +++ b/SpatialPooler.cu @@
-35,6 +35,7 @@ struct args Real* odc_dev; // odc serve to maintain same act. freq. for each col. (per block) Real* adc_dev; // adc serve to compute boost factors UInt* numPot_dev; + Real* avg_act_dev; // Constants UInt SP_SIZE; @@ -73,11 +74,11 @@ void calculateOverlap(bool* input, UInt* pot_dev, Real* per_dev, Real* boosts_de // TODO: This could be done via parallel sorting. __device__ -void inhibitColumns(UInt* olaps_sh, bool* cols_dev, bool* active_sh, bool &active, Real sparsity) +void inhibitColumns(UInt* olaps_sh, bool* cols_dev, Real sparsity) { int tx = threadIdx.x; int numLarger = 0; - active = false; + bool active = false; for(int i=0; i < blockDim.x; i++) { @@ -88,16 +89,15 @@ void inhibitColumns(UInt* olaps_sh, bool* cols_dev, bool* active_sh, bool &activ __syncthreads(); cols_dev[blockIdx.x*blockDim.x + tx] = active; - active_sh[tx] = active; } // TODO: Can this be implemented via matrix (element-wise) multiplication? -__device__ -void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool active, const UInt inBlockSize, const UInt MAX_CONNECTED) +__global__ +void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* cols_dev, const UInt inBlockSize, const UInt MAX_CONNECTED) { int tx = threadIdx.x; int inX = blockDim.x*blockIdx.x + tx; - if(active) + if(cols_dev[inX]) { for(int i=0; i < MAX_CONNECTED; i++) { @@ -110,38 +110,43 @@ void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActive } } -__device__ -void updateDutyCycles(Real* odc_dev, Real* adc_dev, UInt* olaps_sh, bool active, UInt iteration_num, UInt dutyCyclePeriod) +__global__ +void updateDutyCycles(Real* odc_dev, Real* adc_dev, UInt* olaps_dev, bool* cols_dev, UInt iteration_num, UInt dutyCyclePeriod) { int tx = threadIdx.x; + int inX = blockDim.x*blockIdx.x + tx; + bool active = cols_dev[inX]; // Let grow divisor only to a dutyCyclePeriod to not make the 
update increasingly negligible Real period = dutyCyclePeriod > iteration_num ? iteration_num : dutyCyclePeriod; - odc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)(olaps_sh[tx] > 0)) / period; + odc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)(olaps_dev[inX] > 0)) / period; adc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)active) / period; } // TODO: This can be done via reduction. -__device__ -void averageActivity(bool* active_sh, Real &avg) +__global__ +void averageActivity(bool* cols_dev, Real* avg_act_dev) { - avg = 0; + Real avg = 0; + int start = blockDim.x*blockIdx.x; + int inX = blockDim.x*blockIdx.x + threadIdx.x; for(int i=0; i < blockDim.x; i++) { - avg += (Real)active_sh[i]; + avg += (Real)cols_dev[start+i]; } avg /= (Real)blockDim.x; + avg_act_dev[inX] = avg; } -__device__ -void updateBoosts(Real* adc_dev, Real* boosts_dev, Real targetDensity, Real boostStrength) +__global__ +void updateBoosts(Real* adc_dev, Real* boosts_dev, Real* targetDensity, Real boostStrength) { int inX = blockIdx.x*blockDim.x+threadIdx.x; - boosts_dev[inX] = exp((targetDensity - adc_dev[inX])*boostStrength); + boosts_dev[inX] = exp((targetDensity[inX] - adc_dev[inX])*boostStrength); } -__device__ +__global__ void bumpUpColumnsWithWeakOdc(Real* odc_dev, Real* per_dev, UInt* numPot, Real minOdc, Real synPermBelowStimulusInc, const UInt MAX_CONNECTED) { int tx = threadIdx.x; @@ -154,7 +159,7 @@ void bumpUpColumnsWithWeakOdc(Real* odc_dev, Real* per_dev, UInt* numPot, Real m } // TODO: This can be done via reduction. 
-__device__ +__global__ void updateMinOdc(Real* odc_dev, Real &minOdc, Real minPctOdc, const UInt SP_SIZE) { Real maxOdc = 0; @@ -164,59 +169,25 @@ void updateMinOdc(Real* odc_dev, Real &minOdc, Real minPctOdc, const UInt SP_SIZ } __global__ -void compute(args* ar_ptr) +void overlap_inhibit(args* ar_ptr) { if (blockIdx.x == 0 && threadIdx.x == 0) ar_ptr->iteration_num++; args ar = *ar_ptr; - bool active = false; - Real avg_act = 0; - extern __shared__ UInt shared[]; UInt* olaps_sh = &shared[0]; - bool* active_sh = (bool*)&shared[blockDim.x]; - // TODO: Decide on what to copy to local memory. - // TODO: When do we need to synchronize threads? calculateOverlap(ar.in_dev, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); __syncthreads(); - inhibitColumns(olaps_sh, ar.cols_dev, active_sh, active, ar.localAreaDensity); - - __syncthreads(); - - // printf(" %d ", olaps_sh[tx]); - - adaptSynapses(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); + inhibitColumns(olaps_sh, ar.cols_dev, ar.localAreaDensity); - /* Dependencies: - adaptSynapses, updateDutyCycles, averageActivity - independent - updateDutyCycles->updateBoosts - updateDutyCycles->updateMinOdc - updateDutyCycles->bumpUp - updateBoosts, updateMinOdc, bumpUp - independent - - calculateOverlap->inhibit->(adaptSynapses, averageActivity), updateDutyCycles->(updateBoosts, updateMinOdc, bumpUp) - - TODO: Separate subroutines into 5 semi-dependent cuda streams. 
- */ - - updateDutyCycles(ar.odc_dev, ar.adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod); - - averageActivity(active_sh, avg_act); - - __syncthreads(); - - updateBoosts(ar.adc_dev, ar.boosts_dev, avg_act, ar.boostStrength); - - bumpUpColumnsWithWeakOdc(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED); - - if(ar.iteration_num % ar.update_period == 0) - updateMinOdc(ar.odc_dev, ar.minOdc, ar.minPctOdc, ar.SP_SIZE); + // if(ar.iteration_num % ar.update_period == 0) + // updateMinOdc(ar.odc_dev, ar.minOdc, ar.minPctOdc, ar.SP_SIZE); } @@ -240,24 +211,20 @@ void inhibitColumns_wrapper(UInt* olaps_dev, bool* cols_dev, Real localAreaDensi { extern __shared__ UInt shared[]; UInt* olaps_sh = &shared[0]; - bool* active_sh = (bool*) &olaps_sh[BLOCK_SIZE]; olaps_sh[threadIdx.x] = olaps_dev[threadIdx.x]; - bool active = false; - __syncthreads(); - inhibitColumns(olaps_sh, cols_dev, active_sh, active, localAreaDensity); + inhibitColumns(olaps_sh, cols_dev, localAreaDensity); } -__global__ -void adaptSynapses_wrapper(bool* in_dev, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* active_arr, const UInt IN_BLOCK_SIZE, const UInt MAX_CONNECTED, const UInt SP_SIZE) -{ - int inX = blockIdx.x*blockDim.x + threadIdx.x; - if(inX < SP_SIZE) - { - bool active = active_arr[inX]; - adaptSynapses(in_dev, pot_dev, per_dev, synPermActiveInc, synPermInactiveDec, active, IN_BLOCK_SIZE, MAX_CONNECTED); - } -} +// __global__ +// void adaptSynapses_wrapper(bool* in_dev, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* active_arr, const UInt IN_BLOCK_SIZE, const UInt MAX_CONNECTED, const UInt SP_SIZE) +// { +// int inX = blockIdx.x*blockDim.x + threadIdx.x; +// if(inX < SP_SIZE) +// { +// adaptSynapses(in_dev, pot_dev, per_dev, synPermActiveInc, synPermInactiveDec, active_arr, IN_BLOCK_SIZE, MAX_CONNECTED); +// } +// }