diff --git a/HelloSP.cu b/HelloSP.cu index 7deb327..a09d2b5 100644 --- a/HelloSP.cu +++ b/HelloSP.cu @@ -155,11 +155,11 @@ void printErrorMessage(cudaError_t error, int memorySize){ int main(int argc, const char * argv[]) { - const UInt SP_SIZE = 1024; - const UInt IN_SIZE = 2048; + const UInt SP_SIZE = 1048576; + const UInt IN_SIZE = 2097152; const UInt BLOCK_SIZE = 64; // Two warps - const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE; // 16 - const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // 128 Size of chunk of input processed by a single cuda block + const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE; + const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block const UInt MAX_CONNECTED = 16; const Real IN_DENSITY = 0.5; // Density of input connections srand(time(NULL)); @@ -169,7 +169,7 @@ int main(int argc, const char * argv[]) // TODO: Compute the mem. requirements, allocate proper amount // input chunk, overlaps, connected, boost factors // size_t sm = (IN_BLOCK_SIZE)*sizeof(bool) + (2*BLOCK_SIZE)*sizeof(UInt) + (BLOCK_SIZE)*sizeof(Real); - size_t sm = BLOCK_SIZE*(sizeof(bool) + sizeof(UInt)); + size_t sm = BLOCK_SIZE*sizeof(UInt); // construct input args args ar; @@ -213,7 +213,7 @@ int main(int argc, const char * argv[]) BLOCK_SIZE, IN_BLOCK_SIZE); generate01(in_host, IN_SIZE, IN_DENSITY); - visualize_input(in_host, potentialPools, permanences, numPotential, IN_SIZE, SP_SIZE, IN_BLOCK_SIZE, MAX_CONNECTED); + // visualize_input(in_host, potentialPools, permanences, numPotential, IN_SIZE, SP_SIZE, IN_BLOCK_SIZE, MAX_CONNECTED); // Global memory pointers args* ar_dev; @@ -230,6 +230,7 @@ int main(int argc, const char * argv[]) result = cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); result = cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); result = cudaMalloc((void **) &ar.boosts_dev, 
MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); + result = cudaMalloc((void **) &ar.avg_act_dev, SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0); // Memcpy to device result = cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0); @@ -240,7 +241,26 @@ int main(int argc, const char * argv[]) result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0); // Kernel call - compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev); + cudaStream_t stream1, stream2, stream3; //, stream4; + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); cudaStreamCreateWithFlags(&stream3, cudaStreamNonBlocking); // cudaStreamCreateWithFlags(&stream4, cudaStreamNonBlocking); + + // calculateOverlap<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); + // inhibitColumns<<<NUM_BLOCKS, BLOCK_SIZE, sm, stream1>>>(olaps_sh, ar.cols_dev, ar.localAreaDensity); + overlap_inhibit<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev); // NOTE(review): launch configs reconstructed after angle-bracket stripping; grid/block dims follow SP_SIZE = NUM_BLOCKS*BLOCK_SIZE — verify stream assignments below + + cudaDeviceSynchronize(); + + adaptSynapses<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, ar.cols_dev, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); + + averageActivity<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.cols_dev, ar.avg_act_dev); + + updateDutyCycles<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream3>>>(ar.odc_dev, ar.adc_dev, ar.olaps_dev, ar.cols_dev, ar.iteration_num, ar.dutyCyclePeriod); + + cudaDeviceSynchronize(); + + updateBoosts<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream1>>>(ar.adc_dev, ar.boosts_dev, ar.avg_act_dev, ar.boostStrength); + + bumpUpColumnsWithWeakOdc<<<NUM_BLOCKS, BLOCK_SIZE, 0, stream2>>>(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED); // Memcpy from device result = cudaMemcpy(cols_host, ar.cols_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0); diff --git a/SpatialPooler.cu b/SpatialPooler.cu index 4c5ada4..561a717 100644 --- a/SpatialPooler.cu +++ b/SpatialPooler.cu @@
-35,6 +35,7 @@ struct args Real* odc_dev; // odc serve to maintain same act. freq. for each col. (per block) Real* adc_dev; // adc serve to compute boost factors UInt* numPot_dev; + Real* avg_act_dev; // Constants UInt SP_SIZE; @@ -73,11 +74,11 @@ void calculateOverlap(bool* input, UInt* pot_dev, Real* per_dev, Real* boosts_de // TODO: This could be done via parallel sorting. __device__ -void inhibitColumns(UInt* olaps_sh, bool* cols_dev, bool* active_sh, bool &active, Real sparsity) +void inhibitColumns(UInt* olaps_sh, bool* cols_dev, Real sparsity) { int tx = threadIdx.x; int numLarger = 0; - active = false; + bool active = false; for(int i=0; i < blockDim.x; i++) { @@ -88,16 +89,15 @@ void inhibitColumns(UInt* olaps_sh, bool* cols_dev, bool* active_sh, bool &activ __syncthreads(); cols_dev[blockIdx.x*blockDim.x + tx] = active; - active_sh[tx] = active; } // TODO: Can this be implemented via matrix (element-wise) multiplication? -__device__ -void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool active, const UInt inBlockSize, const UInt MAX_CONNECTED) +__global__ +void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* cols_dev, const UInt inBlockSize, const UInt MAX_CONNECTED) { int tx = threadIdx.x; int inX = blockDim.x*blockIdx.x + tx; - if(active) + if(cols_dev[inX]) { for(int i=0; i < MAX_CONNECTED; i++) { @@ -110,38 +110,43 @@ void adaptSynapses(bool* input, UInt* pot_dev, Real* per_dev, Real synPermActive } } -__device__ -void updateDutyCycles(Real* odc_dev, Real* adc_dev, UInt* olaps_sh, bool active, UInt iteration_num, UInt dutyCyclePeriod) +__global__ +void updateDutyCycles(Real* odc_dev, Real* adc_dev, UInt* olaps_dev, bool* cols_dev, UInt iteration_num, UInt dutyCyclePeriod) { int tx = threadIdx.x; + int inX = blockDim.x*blockIdx.x + tx; + bool active = cols_dev[inX]; // Let grow divisor only to a dutyCyclePeriod to not make the 
update increasingly negligible Real period = dutyCyclePeriod > iteration_num ? iteration_num : dutyCyclePeriod; - odc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)(olaps_sh[tx] > 0)) / period; + odc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)(olaps_dev[inX] > 0)) / period; adc_dev[blockDim.x*blockIdx.x+tx] = (odc_dev[blockDim.x*blockIdx.x+tx]*(period-1) + (Real)active) / period; } // TODO: This can be done via reduction. -__device__ -void averageActivity(bool* active_sh, Real &avg) +__global__ +void averageActivity(bool* cols_dev, Real* avg_act_dev) { - avg = 0; + Real avg = 0; + int start = blockDim.x*blockIdx.x; + int inX = blockDim.x*blockIdx.x + threadIdx.x; for(int i=0; i < blockDim.x; i++) { - avg += (Real)active_sh[i]; + avg += (Real)cols_dev[start+i]; } avg /= (Real)blockDim.x; + avg_act_dev[inX] = avg; } -__device__ -void updateBoosts(Real* adc_dev, Real* boosts_dev, Real targetDensity, Real boostStrength) +__global__ +void updateBoosts(Real* adc_dev, Real* boosts_dev, Real* targetDensity, Real boostStrength) { int inX = blockIdx.x*blockDim.x+threadIdx.x; - boosts_dev[inX] = exp((targetDensity - adc_dev[inX])*boostStrength); + boosts_dev[inX] = exp((targetDensity[inX] - adc_dev[inX])*boostStrength); } -__device__ +__global__ void bumpUpColumnsWithWeakOdc(Real* odc_dev, Real* per_dev, UInt* numPot, Real minOdc, Real synPermBelowStimulusInc, const UInt MAX_CONNECTED) { int tx = threadIdx.x; @@ -154,7 +159,7 @@ void bumpUpColumnsWithWeakOdc(Real* odc_dev, Real* per_dev, UInt* numPot, Real m } // TODO: This can be done via reduction. 
-__device__ +__global__ void updateMinOdc(Real* odc_dev, Real &minOdc, Real minPctOdc, const UInt SP_SIZE) { Real maxOdc = 0; @@ -164,59 +169,25 @@ void updateMinOdc(Real* odc_dev, Real &minOdc, Real minPctOdc, const UInt SP_SIZ } __global__ -void compute(args* ar_ptr) +void overlap_inhibit(args* ar_ptr) { if (blockIdx.x == 0 && threadIdx.x == 0) ar_ptr->iteration_num++; args ar = *ar_ptr; - bool active = false; - Real avg_act = 0; - extern __shared__ UInt shared[]; UInt* olaps_sh = &shared[0]; - bool* active_sh = (bool*)&shared[blockDim.x]; - // TODO: Decide on what to copy to local memory. - // TODO: When do we need to synchronize threads? calculateOverlap(ar.in_dev, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); __syncthreads(); - inhibitColumns(olaps_sh, ar.cols_dev, active_sh, active, ar.localAreaDensity); - - __syncthreads(); - - // printf(" %d ", olaps_sh[tx]); - - adaptSynapses(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED); + inhibitColumns(olaps_sh, ar.cols_dev, ar.localAreaDensity); - /* Dependencies: - adaptSynapses, updateDutyCycles, averageActivity - independent - updateDutyCycles->updateBoosts - updateDutyCycles->updateMinOdc - updateDutyCycles->bumpUp - updateBoosts, updateMinOdc, bumpUp - independent - - calculateOverlap->inhibit->(adaptSynapses, averageActivity), updateDutyCycles->(updateBoosts, updateMinOdc, bumpUp) - - TODO: Separate subroutines into 5 semi-dependent cuda streams. 
- */ - - updateDutyCycles(ar.odc_dev, ar.adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod); - - averageActivity(active_sh, avg_act); - - __syncthreads(); - - updateBoosts(ar.adc_dev, ar.boosts_dev, avg_act, ar.boostStrength); - - bumpUpColumnsWithWeakOdc(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED); - - if(ar.iteration_num % ar.update_period == 0) - updateMinOdc(ar.odc_dev, ar.minOdc, ar.minPctOdc, ar.SP_SIZE); + // if(ar.iteration_num % ar.update_period == 0) + // updateMinOdc(ar.odc_dev, ar.minOdc, ar.minPctOdc, ar.SP_SIZE); } @@ -240,24 +211,20 @@ void inhibitColumns_wrapper(UInt* olaps_dev, bool* cols_dev, Real localAreaDensi { extern __shared__ UInt shared[]; UInt* olaps_sh = &shared[0]; - bool* active_sh = (bool*) &olaps_sh[BLOCK_SIZE]; olaps_sh[threadIdx.x] = olaps_dev[threadIdx.x]; - bool active = false; - __syncthreads(); - inhibitColumns(olaps_sh, cols_dev, active_sh, active, localAreaDensity); + inhibitColumns(olaps_sh, cols_dev, localAreaDensity); } -__global__ -void adaptSynapses_wrapper(bool* in_dev, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* active_arr, const UInt IN_BLOCK_SIZE, const UInt MAX_CONNECTED, const UInt SP_SIZE) -{ - int inX = blockIdx.x*blockDim.x + threadIdx.x; - if(inX < SP_SIZE) - { - bool active = active_arr[inX]; - adaptSynapses(in_dev, pot_dev, per_dev, synPermActiveInc, synPermInactiveDec, active, IN_BLOCK_SIZE, MAX_CONNECTED); - } -} +// __global__ +// void adaptSynapses_wrapper(bool* in_dev, UInt* pot_dev, Real* per_dev, Real synPermActiveInc, Real synPermInactiveDec, bool* active_arr, const UInt IN_BLOCK_SIZE, const UInt MAX_CONNECTED, const UInt SP_SIZE) +// { +// int inX = blockIdx.x*blockDim.x + threadIdx.x; +// if(inX < SP_SIZE) +// { +// adaptSynapses(in_dev, pot_dev, per_dev, synPermActiveInc, synPermInactiveDec, active_arr, IN_BLOCK_SIZE, MAX_CONNECTED); +// } +// }