Initial release

tsiv · Jun 24, 2014 · 8701d5e · 8701d5e
commit 8701d5e
Show file tree

Hide file tree

Showing 891 changed files with 223,095 additions and 0 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1 @@
+See README.txt
diff --git a/COPYING b/COPYING
@@ -0,0 +1 @@
+See LICENSE.txt
diff --git a/ChangeLog b/ChangeLog
@@ -0,0 +1 @@
+See README.txt
diff --git a/INSTALL b/INSTALL
@@ -0,0 +1,9 @@
+
+It is advised to run ./autogen.sh before./configure (autoconf and automake
+need to be installed on your system for autogen.sh to work)
+
+./configure has an option named --with-cuda that allows you to specify
+where your CUDA 5.5 toolkit is installed (usually /usr/local/cuda-5.5,
+but some distros may have a different default location)
+
+See README.txt
diff --git a/JHA/.deps/.dirstamp b/JHA/.deps/.dirstamp
diff --git a/JHA/.dirstamp b/JHA/.dirstamp
diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu
@@ -0,0 +1,352 @@
+#include <cuda.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "sm_30_intrinsics.h"
+
+#include <stdio.h>
+#include <memory.h>
+#include <stdint.h>
+
+// aus cpu-miner.c
+extern int device_map[8];
+
+// diese Struktur wird in der Init Funktion angefordert
+static cudaDeviceProp props[8];
+
+static uint32_t *d_tempBranch1Nonces[8];
+static uint32_t *d_numValid[8];
+static uint32_t *h_numValid[8];
+
+static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
+
+// aus heavy.cu
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+// True/False tester
+typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
+
+__device__ uint32_t JackpotTrueTest(uint32_t *inpHash)
+{
+	uint32_t tmp = inpHash[0] & 0x01;
+	return (tmp == 1);
+}
+
+__device__ uint32_t JackpotFalseTest(uint32_t *inpHash)
+{
+	uint32_t tmp = inpHash[0] & 0x01;
+	return (tmp == 0);
+}
+
+__device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d_JackpotFalseFunction = JackpotFalseTest;
+cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
+
+// Setup-Funktionen
+__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
+{
+	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
+
+	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
+	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
+
+	// wir brauchen auch Speicherplatz auf dem Device
+	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);	
+	cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
+	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
+
+	uint32_t s1;
+	s1 = (threads / 256) * 2;
+
+	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
+	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
+}
+
+// Die Summenfunktion (vom NVIDIA SDK)
+__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+{
+	extern __shared__ uint32_t sums[];
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+	//int lane_id = id % warpSize;
+	int lane_id = id % width;
+	// determine a warp_id within a block
+	 //int warp_id = threadIdx.x / warpSize;
+	int warp_id = threadIdx.x / width;
+
+	sums[lane_id] = 0;
+
+	// Below is the basic structure of using a shfl instruction
+	// for a scan.
+	// Record "value" as a variable - we accumulate it along the way
+	uint32_t value;
+	if(testFunc != NULL)
+	{
+		if (id < threads)
+		{
+			uint32_t *inpHash;
+			if(d_validNonceTable == NULL)
+			{
+				// keine Nonce-Liste
+				inpHash = &inpHashes[id<<4];
+			}else
+			{
+				// Nonce-Liste verfügbar
+				int nonce = d_validNonceTable[id] - startNounce;
+				inpHash = &inpHashes[nonce<<4];
+			}			
+			value = (*testFunc)(inpHash);
+		}else
+		{
+			value = 0;
+		}
+	}else
+	{
+		value = data[id];
+	}
+
+	__syncthreads();
+
+	// Now accumulate in log steps up the chain
+	// compute sums, with another thread's value who is
+	// distance delta away (i).  Note
+	// those threads where the thread 'i' away would have
+	// been out of bounds of the warp are unaffected.  This
+	// creates the scan sum.
+#pragma unroll
+
+	for (int i=1; i<=width; i*=2)
+	{
+		uint32_t n = __shfl_up((int)value, i, width);
+
+		if (lane_id >= i) value += n;
+	}
+
+	// value now holds the scan value for the individual thread
+	// next sum the largest values for each warp
+
+	// write the sum of the warp to smem
+	//if (threadIdx.x % warpSize == warpSize-1)
+	if (threadIdx.x % width == width-1)
+	{
+		sums[warp_id] = value;
+	}
+
+	__syncthreads();
+
+	//
+	// scan sum the warp sums
+	// the same shfl scan operation, but performed on warp sums
+	//
+	if (warp_id == 0)
+	{
+		uint32_t warp_sum = sums[lane_id];
+
+		for (int i=1; i<=width; i*=2)
+		{
+			uint32_t n = __shfl_up((int)warp_sum, i, width);
+
+		if (lane_id >= i) warp_sum += n;
+		}
+
+		sums[lane_id] = warp_sum;
+	}
+
+	__syncthreads();
+
+	// perform a uniform add across warps in the block
+	// read neighbouring warp's sum and add it to threads value
+	uint32_t blockSum = 0;
+
+	if (warp_id > 0)
+	{
+		blockSum = sums[warp_id-1];
+	}
+
+	value += blockSum;
+
+	// Now write out our result
+	data[id] = value;
+
+	// last thread has sum, write write out the block's sum
+	if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
+	{
+		partial_sums[blockIdx.x] = value;
+	}
+}
+
+// Uniform add: add partial sums array
+__global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
+{
+	__shared__ uint32_t buf;
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+
+	if (id > len) return;
+
+	if (threadIdx.x == 0)
+	{
+		buf = partial_sums[blockIdx.x];
+	}
+
+	__syncthreads();
+	data[id] += buf;
+}
+
+// Der Scatter
+__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+{
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+	uint32_t actNounce = id;
+	uint32_t value;
+	if (id < threads)
+	{
+//		uint32_t nounce = startNounce + id;
+		uint32_t *inpHash;
+		if(d_validNonceTable == NULL)
+		{
+			// keine Nonce-Liste
+			inpHash = &inpHashes[id<<4];
+		}else
+		{
+			// Nonce-Liste verfügbar
+			int nonce = d_validNonceTable[id] - startNounce;
+			actNounce = nonce;
+			inpHash = &inpHashes[nonce<<4];
+		}
+
+		value = (*testFunc)(inpHash);
+	}else
+	{
+		value = 0;
+	}
+
+	if( value )
+	{
+		int idx = sum[id];
+		if(idx > 0)
+			outp[idx-1] = startNounce + actNounce;
+	}
+}
+
+__host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val)
+{
+	if(val == 0)
+		return 0;
+
+	uint32_t mask = 0x80000000;
+	while( (val & mask) == 0 ) mask = mask >> 1;
+
+	if( (val & (~mask)) != 0 )
+		return mask << 1;
+
+	return mask;
+}
+
+__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
+														uint32_t *d_nonces1, cuda_compactTestFunction_t function,
+														uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	int orgThreads = threads;
+	threads = (int)jackpot_compactTest_roundUpExp((uint32_t)threads);
+	// threadsPerBlock ausrechnen
+	int blockSize = 256;
+	int nSummen = threads / blockSize;
+
+	int thr1 = (threads+blockSize-1) / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
+	int thr3 = (nSummen + blockSize2-1) / blockSize2;
+
+	bool callThrid = (thr2 > 0) ? true : false;
+
+	// Erster Initialscan
+	jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
+		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);	
+
+	// weitere Scans
+	if(callThrid)
+	{		
+		jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
+		jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
+	}else
+	{
+		jackpot_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
+	}
+
+	// Sync + Anzahl merken
+	cudaStreamSynchronize(NULL);
+
+	if(callThrid)
+		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	else
+		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+
+	// Addieren
+	if(callThrid)
+	{
+		jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
+	}
+	jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
+
+	// Scatter
+	jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, 
+		function, orgThreads, startNounce, inpHashes, d_validNonceTable);
+
+	// Sync
+	cudaStreamSynchronize(NULL);
+}
+
+////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
+__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
+													 uint32_t *d_nonces1, uint32_t *d_nonces2,
+													 uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_JackpotTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+	jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_JackpotFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+
+	/*
+	// threadsPerBlock ausrechnen
+	int blockSize = 256;
+	int thr1 = threads / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+
+	// 1
+	jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes);
+	jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	cudaStreamSynchronize(NULL);
+	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
+	jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
+
+	// 2
+	jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes);
+	jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	cudaStreamSynchronize(NULL);
+	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);	
+	jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
+	jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
+	
+	// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
+	// Schritt 3: Scatter
+	jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes);
+	jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes);
+	cudaStreamSynchronize(NULL);
+	*/
+}
+
+__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											uint32_t *d_nonces2, size_t *nrm2,
+											int order)
+{
+	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
+	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
+
+	jackpot_compactTest_cpu_dualCompaction(thr_id, threads,
+		h_numValid[thr_id], d_nonces1, d_nonces2,
+		startNounce, inpHashes, d_validNonceTable);
+
+	cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
+	*nrm1 = (size_t)h_numValid[thr_id][0];
+	*nrm2 = (size_t)h_numValid[thr_id][1];
+}