Consolidate mix and size to powers of 2

jean-m-cyr · jean-m-cyr · commit e4b5d8113000 · 2018-01-28T17:18:41.000-05:00
Allows CUDA to use shifts instead of multiplies
and to access the mix sequentially.

Assume CUDA arch >= 3.0 and CUDA toolkit >= 9.0,
and remove deprecated code and definitions.
diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
@@ -322,7 +322,7 @@ bool CUDAMiner::cuda_init(
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
-		m_search_buf = new volatile uint32_t *[s_numStreams];
+		m_search_buf = new volatile search_results *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];
 
 		uint64_t dagSize = ethash_get_datasize(_light->block_number);
@@ -374,7 +374,7 @@ bool CUDAMiner::cuda_init(
 			cudalog << "Generating mining buffers"; //TODO whats up with this?
 			for (unsigned i = 0; i != s_numStreams; ++i)
 			{
-				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(search_results)));
 				CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
 			}
 			
@@ -383,8 +383,6 @@ bool CUDAMiner::cuda_init(
 			m_current_nonce = 0;
 			m_current_index = 0;
 
-			m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
-
 			if (!hostDAG)
 			{
 				if((m_device_num == dagCreateDevice) || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
@@ -453,7 +451,7 @@ void CUDAMiner::search(
 			m_current_index = 0;
 			CUDA_SAFE_CALL(cudaDeviceSynchronize());
 			for (unsigned int i = 0; i < s_numStreams; i++)
-				m_search_buf[i][0] = 0;
+				m_search_buf[i]->count = 0;
 		}
 		if (m_starting_nonce != _startN)
 		{
@@ -470,7 +468,7 @@ void CUDAMiner::search(
 			m_current_index = 0;
 			CUDA_SAFE_CALL(cudaDeviceSynchronize());
 			for (unsigned int i = 0; i < s_numStreams; i++)
-				m_search_buf[i][0] = 0;
+				m_search_buf[i]->count = 0;
 		}
 	}
 	uint64_t batch_size = s_gridSize * s_blockSize;
@@ -480,37 +478,37 @@ void CUDAMiner::search(
 		m_current_nonce += batch_size;
 		auto stream_index = m_current_index % s_numStreams;
 		cudaStream_t stream = m_streams[stream_index];
-		volatile uint32_t* buffer = m_search_buf[stream_index];
+		volatile search_results* buffer = m_search_buf[stream_index];
 		uint32_t found_count = 0;
-		uint64_t nonces[SEARCH_RESULT_ENTRIES];
-		uint32_t mixes[SEARCH_RESULT_ENTRIES][8];
+		uint64_t nonces[SEARCH_RESULTS];
+		uint32_t mixes[SEARCH_RESULTS][8];
 		uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
 		if (m_current_index >= s_numStreams)
 		{
 			CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
-			found_count = buffer[0];
+			found_count = buffer->count;
 			if (found_count) {
-				buffer[0] = 0;
-				if (found_count >= SEARCH_RESULT_ENTRIES)
-					found_count = SEARCH_RESULT_ENTRIES - 1;
-				for (unsigned int j = 1; j <= found_count; j++) {
-					nonces[j] = nonce_base + buffer[j];
-					mixes[j][0] = buffer[j + (SEARCH_RESULT_ENTRIES * 1)];
-					mixes[j][1] = buffer[j + (SEARCH_RESULT_ENTRIES * 2)];
-					mixes[j][2] = buffer[j + (SEARCH_RESULT_ENTRIES * 3)];
-					mixes[j][3] = buffer[j + (SEARCH_RESULT_ENTRIES * 4)];
-					mixes[j][4] = buffer[j + (SEARCH_RESULT_ENTRIES * 5)];
-					mixes[j][5] = buffer[j + (SEARCH_RESULT_ENTRIES * 6)];
-					mixes[j][6] = buffer[j + (SEARCH_RESULT_ENTRIES * 7)];
-					mixes[j][7] = buffer[j + (SEARCH_RESULT_ENTRIES * 8)];
+				buffer->count = 0;
+				if (found_count > SEARCH_RESULTS)
+					found_count = SEARCH_RESULTS;
+				for (unsigned int j = 0; j < found_count; j++) {
+					nonces[j] = nonce_base + buffer->result[j].gid;
+					mixes[j][0] = buffer->result[j].mix[0];
+					mixes[j][1] = buffer->result[j].mix[1];
+					mixes[j][2] = buffer->result[j].mix[2];
+					mixes[j][3] = buffer->result[j].mix[3];
+					mixes[j][4] = buffer->result[j].mix[4];
+					mixes[j][5] = buffer->result[j].mix[5];
+					mixes[j][6] = buffer->result[j].mix[6];
+					mixes[j][7] = buffer->result[j].mix[7];
 				}
 			}
 		}
-		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
+		run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce, m_parallelHash);
 		if (m_current_index >= s_numStreams)
 		{
 			if (found_count)
-				for (uint32_t i = 1; i <= found_count; i++)
+				for (uint32_t i = 0; i < found_count; i++)
 					farm.submitProof(
 						Solution{nonces[i],
 						*((const h256 *)mixes[i]),
diff --git a/libethash-cuda/CUDAMiner.h b/libethash-cuda/CUDAMiner.h
@@ -30,9 +30,6 @@ along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
 #include "ethash_cuda_miner_kernel.h"
 #include "libethash/internal.h"
 
-#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
-#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
-
 namespace dev
 {
 namespace eth
@@ -117,15 +114,14 @@ class CUDAMiner: public Miner
 	uint64_t m_current_nonce;
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
-	uint32_t m_sharedBytes;
 
 	///Constants on GPU
 	hash128_t* m_dag = nullptr;
 	std::vector<hash64_t*> m_light;
 	uint32_t m_dag_size = -1;
 	uint32_t m_device_num;
 
-	volatile uint32_t ** m_search_buf;
+	volatile search_results** m_search_buf;
 	cudaStream_t  * m_streams;
 
 	/// The local work size for the search
diff --git a/libethash-cuda/dagger_shared.cuh b/libethash-cuda/dagger_shared.cuh
diff --git a/libethash-cuda/dagger_shuffled.cuh b/libethash-cuda/dagger_shuffled.cuh
@@ -104,3 +104,4 @@ __device__ __forceinline__ bool compute_hash(
 
 	return false;
 }
+
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.cu b/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -12,57 +12,50 @@
 
 #define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
 
-
-#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
-#include "keccak_u64.cuh"
-#include "dagger_shared.cuh"
-#else
 #include "keccak.cuh"
 #include "dagger_shuffled.cuh"
-#endif
 
 template <uint32_t _PARALLEL_HASH>
 __global__ void 
 ethash_search(
-	volatile uint32_t* g_output,
+	volatile search_results* g_output,
 	uint64_t start_nonce
 	)
 {
 	uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
 	uint2 mix[4];
         if (compute_hash<_PARALLEL_HASH>(start_nonce + gid, d_target, mix))
 		return;
-	uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), 0xffffffff) + 1;
-	if (index >= SEARCH_RESULT_ENTRIES)
+	uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff);
+	if (index >= SEARCH_RESULTS)
 		return;
-	g_output[index] = gid;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 1)] = mix[0].x;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 2)] = mix[0].y;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 3)] = mix[1].x;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 4)] = mix[1].y;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 5)] = mix[2].x;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 6)] = mix[2].y;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 7)] = mix[3].x;
-	g_output[index + (SEARCH_RESULT_ENTRIES * 8)] = mix[3].y;
+	g_output->result[index].gid = gid;
+	g_output->result[index].mix[0] = mix[0].x;
+	g_output->result[index].mix[1] = mix[0].y;
+	g_output->result[index].mix[2] = mix[1].x;
+	g_output->result[index].mix[3] = mix[1].y;
+	g_output->result[index].mix[4] = mix[2].x;
+	g_output->result[index].mix[5] = mix[2].y;
+	g_output->result[index].mix[6] = mix[3].x;
+	g_output->result[index].mix[7] = mix[3].y;
 }
 
 void run_ethash_search(
 	uint32_t blocks,
 	uint32_t threads,
-	uint32_t sharedbytes,
 	cudaStream_t stream,
-	volatile uint32_t* g_output,
+	volatile search_results* g_output,
 	uint64_t start_nonce,
 	uint32_t parallelHash
 )
 {
 	switch (parallelHash)
 	{
-		case 1: ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
-		case 2: ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
-		case 4: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
-		case 8: ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
-		default: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
+		case 1: ethash_search <1> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
+		case 2: ethash_search <2> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
+		case 4: ethash_search <4> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
+		case 8: ethash_search <8> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
+		default: ethash_search <4> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
 	}
 	CUDA_SAFE_CALL(cudaGetLastError());
 }
@@ -86,61 +79,31 @@ ethash_calculate_dag_item(uint32_t start)
 
 	for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
 		uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
-#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
-		for (unsigned w = 0; w != 4; ++w) {
-			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
-		}
-#else
 		for (uint32_t t = 0; t < 4; t++) {
 
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-			uint32_t shuffle_index = __shfl(parent_index, t, 4);
-#else
 			uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,parent_index, t, 4);
-#endif
 
 			uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
 			for (int w = 0; w < 4; w++) {
 
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-				uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
-#else
 				uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
-#endif
 				if (t == thread_id) {
 					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
 				}
 			}
 		}
-
-
-#endif		
 	}
 	SHA3_512(dag_node.uint2s);
 	hash64_t * dag_nodes = (hash64_t *)d_dag;
 
-#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
-	for (uint32_t i = 0; i < 4; i++) {
-		dag_nodes[node_index].uint4s[i] =  dag_node.uint4s[i];
-	}
-#else
 	for (uint32_t t = 0; t < 4; t++) {
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-		uint32_t shuffle_index = __shfl(node_index, t, 4);
-#else
 		uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,node_index, t, 4);
-#endif
 		uint4 s[4];
 		for (uint32_t w = 0; w < 4; w++) {
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-			s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
-#else
 			s[w] = make_uint4(__shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].x, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].y, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].z, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].w, t, 4));
-#endif
 		}
 		dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
 	}
-#endif		 
 }
 
 void ethash_generate_dag(
diff --git a/libethash-cuda/ethash_cuda_miner_kernel.h b/libethash-cuda/ethash_cuda_miner_kernel.h
@@ -7,15 +7,22 @@
 
 // It is virtually impossible to get more than
 // one solution per stream hash calculation
-// Leave room for up to 3 results.
-#define SEARCH_RESULT_ENTRIES 4
-// One word for gid and 8 for mix hash
-#define SEARCH_RESULT_BUFFER_SIZE (SEARCH_RESULT_ENTRIES * 9)
+// Leave room for up to 4 results. A power
+// of 2 here will yield better CUDA optimization
+#define SEARCH_RESULTS 4
+
+typedef struct {
+	uint32_t count;
+	struct {
+		// One word for gid and 8 for mix hash
+		uint32_t gid;
+		uint32_t mix[8];
+		uint32_t pad[7]; // pad to size power of 2
+	} result[SEARCH_RESULTS];
+} search_results;
 
 #define ACCESSES 64
 #define THREADS_PER_HASH (128 / 16)
-#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
-#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
 
 typedef struct
 {
@@ -57,9 +64,8 @@ void set_target(
 void run_ethash_search(
 	uint32_t search_batch_size,
 	uint32_t workgroup_size,
-	uint32_t sharedbytes,
 	cudaStream_t stream,
-	volatile uint32_t* g_output,
+	volatile search_results* g_output,
 	uint64_t start_nonce,
 	uint32_t parallelHash
 	);
diff --git a/libethash-cuda/keccak_u64.cuh b/libethash-cuda/keccak_u64.cuh

Original file line number	Diff line number	Diff line change
`@@ -104,3 +104,4 @@ __device__ __forceinline__ bool compute_hash(`
`104`	`104`
`105`	`105`	`return false;`
`106`	`106`	`}`
	`107`	`+`