ethereum-mining · Jan 29, 2018
diff --git a/ethminer/MinerAux.h
+3-3 b/ethminer/MinerAux.h
+3-3
diff --git a/libethash-cl/CLMiner.cpp
+1-1 b/libethash-cl/CLMiner.cpp
+1-1
diff --git a/libethash-cuda/CUDAMiner.cpp
+28-28 b/libethash-cuda/CUDAMiner.cpp
+28-28
diff --git a/libethash-cuda/CUDAMiner.h
+1-5 b/libethash-cuda/CUDAMiner.h
+1-5
diff --git a/libethash-cuda/dagger_shared.cuh
-72 b/libethash-cuda/dagger_shared.cuh
-72
diff --git a/libethash-cuda/dagger_shuffled.cuh
+11-28 b/libethash-cuda/dagger_shuffled.cuh
+11-28
@@ -872,19 +872,19 @@ class MinerCLI
 					}
 					this_thread::sleep_for(chrono::milliseconds(_recheckPeriod));
 				}
-				bool ok = prpc->eth_submitWork("0x" + toHex(solution.nonce), "0x" + toString(solution.headerHash), "0x" + toString(solution.mixHash));
+				bool ok = prpc->eth_submitWork("0x" + toHex(solution.nonce), "0x" + toString(solution.work.header), "0x" + toString(solution.mixHash));
 				if (ok) {
 					cnote << "Solution found; Submitted to" << _remote;
 					cnote << "  Nonce:" << solution.nonce;
-					cnote << "  headerHash:" << solution.headerHash.hex();
+					cnote << "  headerHash:" << solution.work.header.hex();
 					cnote << "  mixHash:" << solution.mixHash.hex();
 					cnote << EthLime << " Accepted." << EthReset;
 					f.acceptedSolution(solution.stale);
 				}
 				else {
 					cwarn << "Solution found; Submitted to" << _remote;
 					cwarn << "  Nonce:" << solution.nonce;
-					cwarn << "  headerHash:" << solution.headerHash.hex();
+					cwarn << "  headerHash:" << solution.work.header.hex();
 					cwarn << "  mixHash:" << solution.mixHash.hex();
 					cwarn << EthYellow << " Rejected." << EthReset;
 					f.rejectedSolution(solution.stale);
 
@@ -271,7 +271,7 @@ void CLMiner::report(uint64_t _nonce, WorkPackage const& _w)
 	// TODO: Why re-evaluating?
 	Result r = EthashAux::eval(_w.seed, _w.header, _nonce);
 	if (r.value < _w.boundary)
-		farm.submitProof(Solution{_nonce, r.mixHash, _w.header, _w.seed, _w.boundary, _w.job, _w.job_len, false});
+		farm.submitProof(Solution{_nonce, r.mixHash, _w, false});
 	else {
 		farm.failedSolution();
 		cwarn << "FAILURE: GPU gave incorrect result!";
 
@@ -108,8 +108,6 @@ void CUDAMiner::workLoop()
 					std::this_thread::sleep_for(std::chrono::seconds(3));
 					continue;
 				}
-				
-				//cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
 				if (current.seed != w.seed)
 				{
 					if(!init(w.seed))
@@ -324,7 +322,7 @@ bool CUDAMiner::cuda_init(
 
 		cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
 
-		m_search_buf = new volatile uint32_t *[s_numStreams];
+		m_search_buf = new volatile search_results *[s_numStreams];
 		m_streams = new cudaStream_t[s_numStreams];
 
 		uint64_t dagSize = ethash_get_datasize(_light->block_number);
@@ -376,7 +374,7 @@ bool CUDAMiner::cuda_init(
 			cudalog << "Generating mining buffers"; //TODO whats up with this?
 			for (unsigned i = 0; i != s_numStreams; ++i)
 			{
-				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
+				CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(search_results)));
 				CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
 			}
 
@@ -385,8 +383,6 @@ bool CUDAMiner::cuda_init(
 			m_current_nonce = 0;
 			m_current_index = 0;
 
-			m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
-
 			if (!hostDAG)
 			{
 				if((m_device_num == dagCreateDevice) || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
@@ -455,7 +451,7 @@ void CUDAMiner::search(
 			m_current_index = 0;
 			CUDA_SAFE_CALL(cudaDeviceSynchronize());
 			for (unsigned int i = 0; i < s_numStreams; i++)
-				m_search_buf[i][0] = 0;
+				m_search_buf[i]->count = 0;
 		}
 		if (m_starting_nonce != _startN)
 		{
@@ -472,7 +468,7 @@ void CUDAMiner::search(
 			m_current_index = 0;
 			CUDA_SAFE_CALL(cudaDeviceSynchronize());
 			for (unsigned int i = 0; i < s_numStreams; i++)
-				m_search_buf[i][0] = 0;
+				m_search_buf[i]->count = 0;
 		}
 	}
 	uint64_t batch_size = s_gridSize * s_blockSize;
@@ -482,38 +478,42 @@ void CUDAMiner::search(
 		m_current_nonce += batch_size;
 		auto stream_index = m_current_index % s_numStreams;
 		cudaStream_t stream = m_streams[stream_index];
-		volatile uint32_t* buffer = m_search_buf[stream_index];
+		volatile search_results* buffer = m_search_buf[stream_index];
 		uint32_t found_count = 0;
-		uint64_t nonces[SEARCH_RESULT_ENTRIES];
-		uint32_t mixes[SEARCH_RESULT_ENTRIES][8];
+		uint64_t nonces[SEARCH_RESULTS];
+		uint32_t mixes[SEARCH_RESULTS][8];
 		uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
 		if (m_current_index >= s_numStreams)
 		{
 			CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
-			found_count = buffer[0];
+			found_count = buffer->count;
 			if (found_count) {
-				buffer[0] = 0;
-				if (found_count >= SEARCH_RESULT_ENTRIES)
-					found_count = SEARCH_RESULT_ENTRIES - 1;
-				for (unsigned int j = 1; j <= found_count; j++) {
-					nonces[j] = nonce_base + buffer[j];
-					mixes[j][0] = buffer[j + (SEARCH_RESULT_ENTRIES * 1)];
-					mixes[j][1] = buffer[j + (SEARCH_RESULT_ENTRIES * 2)];
-					mixes[j][2] = buffer[j + (SEARCH_RESULT_ENTRIES * 3)];
-					mixes[j][3] = buffer[j + (SEARCH_RESULT_ENTRIES * 4)];
-					mixes[j][4] = buffer[j + (SEARCH_RESULT_ENTRIES * 5)];
-					mixes[j][5] = buffer[j + (SEARCH_RESULT_ENTRIES * 6)];
-					mixes[j][6] = buffer[j + (SEARCH_RESULT_ENTRIES * 7)];
-					mixes[j][7] = buffer[j + (SEARCH_RESULT_ENTRIES * 8)];
+				buffer->count = 0;
+				if (found_count > SEARCH_RESULTS)
+					found_count = SEARCH_RESULTS;
+				for (unsigned int j = 0; j < found_count; j++) {
+					nonces[j] = nonce_base + buffer->result[j].gid;
+					mixes[j][0] = buffer->result[j].mix[0];
+					mixes[j][1] = buffer->result[j].mix[1];
+					mixes[j][2] = buffer->result[j].mix[2];
+					mixes[j][3] = buffer->result[j].mix[3];
+					mixes[j][4] = buffer->result[j].mix[4];
+					mixes[j][5] = buffer->result[j].mix[5];
+					mixes[j][6] = buffer->result[j].mix[6];
+					mixes[j][7] = buffer->result[j].mix[7];
 				}
 			}
 		}
-		run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
+		run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce, m_parallelHash);
 		if (m_current_index >= s_numStreams)
 		{
 			if (found_count)
-				for (uint32_t i = 1; i <= found_count; i++)
-					farm.submitProof(Solution{nonces[i], *((h256 *)mixes[i]), w.header, w.seed, w.boundary, w.job, w.job_len, m_abort});
+				for (uint32_t i = 0; i < found_count; i++)
+					farm.submitProof(
+						Solution{nonces[i],
+						*((const h256 *)mixes[i]),
+						w,
+						m_abort});
 			addHashCount(batch_size);
 			if (m_abort || shouldStop())
 			{
 
@@ -30,9 +30,6 @@ along with cpp-ethereum.  If not, see <http://www.gnu.org/licenses/>.
 #include "ethash_cuda_miner_kernel.h"
 #include "libethash/internal.h"
 
-#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
-#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
-
 namespace dev
 {
 namespace eth
@@ -117,15 +114,14 @@ class CUDAMiner: public Miner
 	uint64_t m_current_nonce;
 	uint64_t m_starting_nonce;
 	uint64_t m_current_index;
-	uint32_t m_sharedBytes;
 
 	///Constants on GPU
 	hash128_t* m_dag = nullptr;
 	std::vector<hash64_t*> m_light;
 	uint32_t m_dag_size = -1;
 	uint32_t m_device_num;
 
-	volatile uint32_t ** m_search_buf;
+	volatile search_results** m_search_buf;
 	cudaStream_t  * m_streams;
 
 	/// The local work size for the search
 
@@ -3,8 +3,9 @@
 #include "cuda_helper.h"
 
 template <uint32_t _PARALLEL_HASH>
-__device__ __forceinline__ uint64_t compute_hash(
+__device__ __forceinline__ bool compute_hash(
 	uint64_t nonce,
+	uint64_t target,
 	uint2 *mix_hash
 	)
 {
@@ -31,13 +32,8 @@ __device__ __forceinline__ uint64_t compute_hash(
 			uint2 shuffle[8];
 			for (int j = 0; j < 8; j++) 
 			{
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-				shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH);
-				shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH);
-#else
 				shuffle[j].x = __shfl_sync(0xFFFFFFFF,state[j].x, i+p, THREADS_PER_HASH);
 				shuffle[j].y = __shfl_sync(0xFFFFFFFF,state[j].y, i+p, THREADS_PER_HASH);
-#endif
 			}
 			switch (mix_idx)
 			{
@@ -46,11 +42,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 				case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break;
 				case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break;
 			}
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-			init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
-#else
 			init0[p] = __shfl_sync(0xFFFFFFFF,shuffle[0].x, 0, THREADS_PER_HASH);
-#endif
 		}
 
 		for (uint32_t a = 0; a < ACCESSES; a += 4)
@@ -62,11 +54,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 				for (int p = 0; p < _PARALLEL_HASH; p++)
 				{
 					offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-					offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
-#else
 					offset[p] = __shfl_sync(0xFFFFFFFF,offset[p], t, THREADS_PER_HASH);
-#endif
 				}
 				#pragma unroll
 				for (int p = 0; p < _PARALLEL_HASH; p++)
@@ -86,16 +74,6 @@ __device__ __forceinline__ uint64_t compute_hash(
 			uint32_t thread_mix = fnv_reduce(mix[p]);
 
 			// update mix across threads
-#if CUDA_VERSION < SHUFFLE_DEPRECATED
-			shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
-			shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
-			shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
-			shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
-			shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
-			shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
-			shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
-			shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
-#else
 			shuffle[0].x = __shfl_sync(0xFFFFFFFF,thread_mix, 0, THREADS_PER_HASH);
 			shuffle[0].y = __shfl_sync(0xFFFFFFFF,thread_mix, 1, THREADS_PER_HASH);
 			shuffle[1].x = __shfl_sync(0xFFFFFFFF,thread_mix, 2, THREADS_PER_HASH);
@@ -104,7 +82,7 @@ __device__ __forceinline__ uint64_t compute_hash(
 			shuffle[2].y = __shfl_sync(0xFFFFFFFF,thread_mix, 5, THREADS_PER_HASH);
 			shuffle[3].x = __shfl_sync(0xFFFFFFFF,thread_mix, 6, THREADS_PER_HASH);
 			shuffle[3].y = __shfl_sync(0xFFFFFFFF,thread_mix, 7, THREADS_PER_HASH);
-#endif
+
 			if ((i+p) == thread_id) {
 				//move mix into state:
 				state[8] = shuffle[0];
@@ -114,11 +92,16 @@ __device__ __forceinline__ uint64_t compute_hash(
 			}
 		}
 	}
+
+	// keccak_256(keccak_512(header..nonce) .. mix);
+	if (cuda_swab64(keccak_f1600_final(state)) > target)
+		return true;
+
 	mix_hash[0] = state[8];
 	mix_hash[1] = state[9];
 	mix_hash[2] = state[10];
 	mix_hash[3] = state[11];
-	
-	// keccak_256(keccak_512(header..nonce) .. mix);
-	return keccak_f1600_final(state);
+
+	return false;
 }
+