Parallel Streams - WIP #6

Open · wants to merge 2 commits into master
34 changes: 27 additions & 7 deletions HelloSP.cu
@@ -155,11 +155,11 @@ void printErrorMessage(cudaError_t error, int memorySize){

int main(int argc, const char * argv[])
{
const UInt SP_SIZE = 1024;
const UInt IN_SIZE = 2048;
const UInt SP_SIZE = 1048576;
const UInt IN_SIZE = 2097152;
const UInt BLOCK_SIZE = 64; // Two warps
const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE; // 16
const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // 128 Size of chunk of input processed by a single cuda block
const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE;
const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of the chunk of input processed by a single CUDA block
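// With these sizes: NUM_BLOCKS = 1048576/64 = 16384 and IN_BLOCK_SIZE = 2097152/16384 = 128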
const UInt MAX_CONNECTED = 16;
const Real IN_DENSITY = 0.5; // Density of input connections
srand(time(NULL));
@@ -169,7 +169,7 @@ int main(int argc, const char * argv[])
// TODO: Compute the mem. requirements, allocate proper amount
// input chunk, overlaps, connected, boost factors
// size_t sm = (IN_BLOCK_SIZE)*sizeof(bool) + (2*BLOCK_SIZE)*sizeof(UInt) + (BLOCK_SIZE)*sizeof(Real);
size_t sm = BLOCK_SIZE*(sizeof(bool) + sizeof(UInt));
size_t sm = BLOCK_SIZE*sizeof(UInt);
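// Note (not part of this diff): overlap_inhibit now keeps only the per-thread overlap
// counts (olaps_sh) in shared memory, one UInt per thread; the bool active_sh buffer
// used by the old fused compute kernel is gone, hence the smaller allocation.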

// construct input args
args ar;
@@ -213,7 +213,7 @@ int main(int argc, const char * argv[])
BLOCK_SIZE, IN_BLOCK_SIZE);
generate01(in_host, IN_SIZE, IN_DENSITY);

visualize_input(in_host, potentialPools, permanences, numPotential, IN_SIZE, SP_SIZE, IN_BLOCK_SIZE, MAX_CONNECTED);
// visualize_input(in_host, potentialPools, permanences, numPotential, IN_SIZE, SP_SIZE, IN_BLOCK_SIZE, MAX_CONNECTED);

// Global memory pointers
args* ar_dev;
@@ -230,6 +230,7 @@ int main(int argc, const char * argv[])
result = cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
result = cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
result = cudaMalloc((void **) &ar.boosts_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
result = cudaMalloc((void **) &ar.avg_act_dev, SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);

// Memcpy to device
result = cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
@@ -240,7 +241,26 @@ int main(int argc, const char * argv[])
result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);

// Kernel call
compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev);
cudaStream_t stream1, stream2, stream3; //, stream4;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream3, cudaStreamNonBlocking);
// cudaStreamCreateWithFlags(&stream4, cudaStreamNonBlocking);

// calculateOverlap<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
// inhibitColumns<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(olaps_sh, ar.cols_dev, ar.localAreaDensity);
overlap_inhibit<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(ar_dev);

cudaDeviceSynchronize();

adaptSynapses<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, ar.cols_dev, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);

averageActivity<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.cols_dev, ar.avg_act_dev);

updateDutyCycles<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream3>>>(ar.odc_dev, ar.adc_dev, ar.olaps_dev, ar.cols_dev, ar.iteration_num, ar.dutyCyclePeriod);

cudaDeviceSynchronize();

updateBoosts<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.adc_dev, ar.boosts_dev, ar.avg_act_dev, ar.boostStrength);

bumpUpColumnsWithWeakOdc<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);
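// Note (not part of this diff): these streams are cudaStreamNonBlocking, so they do not
// synchronize with the legacy default stream. A cudaDeviceSynchronize() is likely needed
// here before the device-to-host memcpy below reads cols_dev, and the streams should
// eventually be released with cudaStreamDestroy().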

// Memcpy from device
result = cudaMemcpy(cols_host, ar.cols_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0);
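The two cudaDeviceSynchronize() calls above act as device-wide barriers. A minimal sketch, not part of this PR, of expressing the same dependency graph with CUDA events, so each kernel waits only on the work it actually depends on; the event names are hypothetical, and the kernel arguments are copied from the launches above:

cudaEvent_t evInhibit, evAdapt, evAvg, evDuty;
cudaEventCreateWithFlags(&evInhibit, cudaEventDisableTiming);
cudaEventCreateWithFlags(&evAdapt,   cudaEventDisableTiming);
cudaEventCreateWithFlags(&evAvg,     cudaEventDisableTiming);
cudaEventCreateWithFlags(&evDuty,    cudaEventDisableTiming);

overlap_inhibit<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(ar_dev);
cudaEventRecord(evInhibit, stream1);
cudaStreamWaitEvent(stream2, evInhibit, 0);   // averageActivity reads cols_dev
cudaStreamWaitEvent(stream3, evInhibit, 0);   // updateDutyCycles reads cols_dev

adaptSynapses<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, ar.cols_dev, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
cudaEventRecord(evAdapt, stream1);

averageActivity<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.cols_dev, ar.avg_act_dev);
cudaEventRecord(evAvg, stream2);

updateDutyCycles<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream3>>>(ar.odc_dev, ar.adc_dev, ar.olaps_dev, ar.cols_dev, ar.iteration_num, ar.dutyCyclePeriod);
cudaEventRecord(evDuty, stream3);

// updateBoosts reads adc_dev (stream3) and avg_act_dev (stream2)
cudaStreamWaitEvent(stream1, evAvg, 0);
cudaStreamWaitEvent(stream1, evDuty, 0);
updateBoosts<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.adc_dev, ar.boosts_dev, ar.avg_act_dev, ar.boostStrength);

// bumpUpColumnsWithWeakOdc reads odc_dev (stream3) and updates per_dev (stream1)
cudaStreamWaitEvent(stream2, evDuty, 0);
cudaStreamWaitEvent(stream2, evAdapt, 0);
bumpUpColumnsWithWeakOdc<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);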
107 changes: 37 additions & 70 deletions SpatialPooler.cu
@@ -35,6 +35,7 @@ struct args
Real* odc_dev; // overlap duty cycles: maintain a similar activation frequency for each column (per block)
Real* adc_dev; // active duty cycles: used to compute boost factors
UInt* numPot_dev;
Real* avg_act_dev;

// Constants
UInt SP_SIZE;
@@ -73,11 +74,11 @@ void calculateOverlap(bool* input, UInt* pot_dev, Real* per_dev, Real* boosts_de

// TODO: This could be done via parallel sorting.
__device__
void inhibitColumns(UInt* olaps_sh, bool* cols_dev, bool* active_sh, bool &active, Real sparsity)
void inhibitColumns(UInt* olaps_sh, bool* cols_dev, Real sparsity)
{
int tx = threadIdx.x;
int numLarger = 0;
active = false;
bool active = false;

for(int i=0; i < blockDim.x; i++)
{
@@ -88,16 +89,15 @@ void inhibitColumns(UInt* olaps_sh, bool* cols_dev, bool* active_sh, bool &activ
__syncthreads();

cols_dev[blockIdx.x*blockDim.x + tx] = active;
active_sh[tx] = active;
}

// TODO: Can this be implemented via matrix (element-wise) multiplication?
__device__
void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool active, const UInt inBlockSize, const UInt MAX_CONNECTED)
__global__
void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* cols_dev, const UInt inBlockSize, const UInt MAX_CONNECTED)
{
int tx = threadIdx.x;
int inX = blockDim.x*blockIdx.x + tx;
if(active)
if(cols_dev[inX])
{
for(int i=0; i < MAX_CONNECTED; i++)
{
@@ -110,38 +110,43 @@ void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActive
}
}

__device__
void updateDutyCycles(Real* odc_dev, Real* adc_dev, UInt* olaps_sh, bool active, UInt iteration_num, UInt dutyCyclePeriod)
__global__
void updateDutyCycles(Real* odc_dev, Real* adc_dev, UInt* olaps_dev, bool* cols_dev, UInt iteration_num, UInt dutyCyclePeriod)
{
int tx = threadIdx.x;
int inX = blockDim.x*blockIdx.x + tx;
bool active = cols_dev[inX];

// Cap the divisor at dutyCyclePeriod so that later updates do not become increasingly negligible
Real period = dutyCyclePeriod > iteration_num ? iteration_num : dutyCyclePeriod;

odc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)(olaps_sh[tx] > 0)) / period;
odc_dev[inX] = (odc_dev[inX]*(period-1) + (Real)(olaps_dev[inX] > 0)) / period;
adc_dev[inX] = (adc_dev[inX]*(period-1) + (Real)active) / period;
}
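// Note: both duty-cycle updates above are exponential moving averages,
// new = (old*(period-1) + sample) / period, where sample is 1 if the column
// overlapped (odc) or was active (adc) this iteration, and 0 otherwise.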

// TODO: This can be done via reduction.
__device__
void averageActivity(bool* active_sh, Real &avg)
__global__
void averageActivity(bool* cols_dev, Real* avg_act_dev)
{
avg = 0;
Real avg = 0;
int start = blockDim.x*blockIdx.x;
int inX = blockDim.x*blockIdx.x + threadIdx.x;
for(int i=0; i < blockDim.x; i++)
{
avg += (Real)active_sh[i];
avg += (Real)cols_dev[start+i];
}
avg /= (Real)blockDim.x;
avg_act_dev[inX] = avg;
}

__device__
void updateBoosts(Real* adc_dev, Real* boosts_dev, Real targetDensity, Real boostStrength)
__global__
void updateBoosts(Real* adc_dev, Real* boosts_dev, Real* targetDensity, Real boostStrength)
{
int inX = blockIdx.x*blockDim.x+threadIdx.x;
boosts_dev[inX] = exp((targetDensity - adc_dev[inX])*boostStrength);
boosts_dev[inX] = exp((targetDensity[inX] - adc_dev[inX])*boostStrength);
}
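// The boost factor above grows exponentially as a column's active duty cycle falls
// below the per-block target density (the block-average activity written by averageActivity).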

__device__
__global__
void bumpUpColumnsWithWeakOdc(Real* odc_dev, Real* per_dev, UInt* numPot, Real minOdc, Real synPermBelowStimulusInc, const UInt MAX_CONNECTED)
{
int tx = threadIdx.x;
@@ -154,7 +159,7 @@ void bumpUpColumnsWithWeakOdc(Real* odc_dev, Real* per_dev, UInt* numPot, Real m
}

// TODO: This can be done via reduction.
__device__
__global__
void updateMinOdc(Real* odc_dev, Real &minOdc, Real minPctOdc, const UInt SP_SIZE)
{
Real maxOdc = 0;
@@ -164,59 +169,25 @@ void updateMinOdc(Real* odc_dev, Real &minOdc, Real minPctOdc, const UInt SP_SIZ
}
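// Note (not part of this diff): CUDA does not allow reference parameters on __global__
// kernels, so `Real &minOdc` will not work once updateMinOdc is launched as a kernel;
// minOdc would need to live in device memory (e.g. a Real* allocated with cudaMalloc).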

__global__
void compute(args* ar_ptr)
void overlap_inhibit(args* ar_ptr)
{
if (blockIdx.x == 0 && threadIdx.x == 0)
ar_ptr->iteration_num++;

args ar = *ar_ptr;

bool active = false;
Real avg_act = 0;

extern __shared__ UInt shared[];
UInt* olaps_sh = &shared[0];
bool* active_sh = (bool*)&shared[blockDim.x];

// TODO: Decide on what to copy to local memory.
// TODO: When do we need to synchronize threads?

calculateOverlap(ar.in_dev, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);

__syncthreads();

inhibitColumns(olaps_sh, ar.cols_dev, active_sh, active, ar.localAreaDensity);

__syncthreads();

// printf(" %d ", olaps_sh[tx]);

adaptSynapses(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
inhibitColumns(olaps_sh, ar.cols_dev, ar.localAreaDensity);

/* Dependencies:
adaptSynapses, updateDutyCycles, averageActivity - independent
updateDutyCycles->updateBoosts
updateDutyCycles->updateMinOdc
updateDutyCycles->bumpUp
updateBoosts, updateMinOdc, bumpUp - independent

calculateOverlap -> inhibitColumns -> (adaptSynapses, averageActivity, updateDutyCycles); updateDutyCycles -> (updateBoosts, updateMinOdc, bumpUp)

TODO: Separate subroutines into 5 semi-dependent CUDA streams.
*/

updateDutyCycles(ar.odc_dev, ar.adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod);

averageActivity(active_sh, avg_act);

__syncthreads();

updateBoosts(ar.adc_dev, ar.boosts_dev, avg_act, ar.boostStrength);

bumpUpColumnsWithWeakOdc(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);

if(ar.iteration_num % ar.update_period == 0)
updateMinOdc(ar.odc_dev, ar.minOdc, ar.minPctOdc, ar.SP_SIZE);
// if(ar.iteration_num % ar.update_period == 0)
// updateMinOdc(ar.odc_dev, ar.minOdc, ar.minPctOdc, ar.SP_SIZE);
}


@@ -240,24 +211,20 @@ void inhibitColumns_wrapper(UInt* olaps_dev, bool* cols_dev, Real localAreaDensi
{
extern __shared__ UInt shared[];
UInt* olaps_sh = &shared[0];
bool* active_sh = (bool*) &olaps_sh[BLOCK_SIZE];

olaps_sh[threadIdx.x] = olaps_dev[threadIdx.x];

bool active = false;

__syncthreads();

inhibitColumns(olaps_sh, cols_dev, active_sh, active, localAreaDensity);
inhibitColumns(olaps_sh, cols_dev, localAreaDensity);
}

__global__
void adaptSynapses_wrapper(bool* in_dev, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* active_arr, const UInt IN_BLOCK_SIZE, const UInt MAX_CONNECTED, const UInt SP_SIZE)
{
int inX = blockIdx.x*blockDim.x + threadIdx.x;
if(inX < SP_SIZE)
{
bool active = active_arr[inX];
adaptSynapses(in_dev, pot_dev, per_dev, synPermActiveInc, synPermInactiveDec, active, IN_BLOCK_SIZE, MAX_CONNECTED);
}
}
// __global__
// void adaptSynapses_wrapper(bool* in_dev, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* active_arr, const UInt IN_BLOCK_SIZE, const UInt MAX_CONNECTED, const UInt SP_SIZE)
// {
// int inX = blockIdx.x*blockDim.x + threadIdx.x;
// if(inX < SP_SIZE)
// {
// adaptSynapses(in_dev, pot_dev, per_dev, synPermActiveInc, synPermInactiveDec, active_arr, IN_BLOCK_SIZE, MAX_CONNECTED);
// }
// }
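
The TODO above averageActivity suggests a reduction; below is a minimal sketch, not part of this PR, of a shared-memory tree reduction that could replace its per-thread loop. It assumes blockDim.x is a power of two (true for BLOCK_SIZE = 64); the kernel name averageActivity_reduce is hypothetical, and it would be launched with blockDim.x*sizeof(Real) bytes of dynamic shared memory.

__global__
void averageActivity_reduce(bool* cols_dev, Real* avg_act_dev)
{
	extern __shared__ Real sums[];              // blockDim.x Reals of dynamic shared memory
	int tx = threadIdx.x;
	int inX = blockDim.x*blockIdx.x + tx;

	sums[tx] = (Real)cols_dev[inX];
	__syncthreads();

	// Tree reduction: halve the number of active threads each step
	for (int stride = blockDim.x/2; stride > 0; stride >>= 1)
	{
		if (tx < stride)
			sums[tx] += sums[tx + stride];
		__syncthreads();
	}

	// Every column in the block gets the same block-average activity
	avg_act_dev[inX] = sums[0] / (Real)blockDim.x;
}

// Example launch: averageActivity_reduce<<<NUM_BLOCKS, BLOCK_SIZE, BLOCK_SIZE*sizeof(Real), stream2>>>(ar.cols_dev, ar.avg_act_dev);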