Skip to content
This repository was archived by the owner on Apr 24, 2022. It is now read-only.

Commit a2f014e

Browse files
authoredJan 29, 2018
Merge pull request #649 from ethereum-mining/cuda-opt
Optimize CUDA GPU
2 parents 00a0e54 + d79772d commit a2f014e

16 files changed

+102
-1008
lines changed
 

‎ethminer/MinerAux.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -872,19 +872,19 @@ class MinerCLI
872872
}
873873
this_thread::sleep_for(chrono::milliseconds(_recheckPeriod));
874874
}
875-
bool ok = prpc->eth_submitWork("0x" + toHex(solution.nonce), "0x" + toString(solution.headerHash), "0x" + toString(solution.mixHash));
875+
bool ok = prpc->eth_submitWork("0x" + toHex(solution.nonce), "0x" + toString(solution.work.header), "0x" + toString(solution.mixHash));
876876
if (ok) {
877877
cnote << "Solution found; Submitted to" << _remote;
878878
cnote << " Nonce:" << solution.nonce;
879-
cnote << " headerHash:" << solution.headerHash.hex();
879+
cnote << " headerHash:" << solution.work.header.hex();
880880
cnote << " mixHash:" << solution.mixHash.hex();
881881
cnote << EthLime << " Accepted." << EthReset;
882882
f.acceptedSolution(solution.stale);
883883
}
884884
else {
885885
cwarn << "Solution found; Submitted to" << _remote;
886886
cwarn << " Nonce:" << solution.nonce;
887-
cwarn << " headerHash:" << solution.headerHash.hex();
887+
cwarn << " headerHash:" << solution.work.header.hex();
888888
cwarn << " mixHash:" << solution.mixHash.hex();
889889
cwarn << EthYellow << " Rejected." << EthReset;
890890
f.rejectedSolution(solution.stale);

‎libethash-cl/CLMiner.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ void CLMiner::report(uint64_t _nonce, WorkPackage const& _w)
271271
// TODO: Why re-evaluating?
272272
Result r = EthashAux::eval(_w.seed, _w.header, _nonce);
273273
if (r.value < _w.boundary)
274-
farm.submitProof(Solution{_nonce, r.mixHash, _w.header, _w.seed, _w.boundary, _w.job, _w.job_len, false});
274+
farm.submitProof(Solution{_nonce, r.mixHash, _w, false});
275275
else {
276276
farm.failedSolution();
277277
cwarn << "FAILURE: GPU gave incorrect result!";

‎libethash-cuda/CUDAMiner.cpp

+28-28
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,6 @@ void CUDAMiner::workLoop()
108108
std::this_thread::sleep_for(std::chrono::seconds(3));
109109
continue;
110110
}
111-
112-
//cnote << "set work; seed: " << "#" + w.seed.hex().substr(0, 8) + ", target: " << "#" + w.boundary.hex().substr(0, 12);
113111
if (current.seed != w.seed)
114112
{
115113
if(!init(w.seed))
@@ -324,7 +322,7 @@ bool CUDAMiner::cuda_init(
324322

325323
cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
326324

327-
m_search_buf = new volatile uint32_t *[s_numStreams];
325+
m_search_buf = new volatile search_results *[s_numStreams];
328326
m_streams = new cudaStream_t[s_numStreams];
329327

330328
uint64_t dagSize = ethash_get_datasize(_light->block_number);
@@ -376,7 +374,7 @@ bool CUDAMiner::cuda_init(
376374
cudalog << "Generating mining buffers"; //TODO whats up with this?
377375
for (unsigned i = 0; i != s_numStreams; ++i)
378376
{
379-
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
377+
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(search_results)));
380378
CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
381379
}
382380

@@ -385,8 +383,6 @@ bool CUDAMiner::cuda_init(
385383
m_current_nonce = 0;
386384
m_current_index = 0;
387385

388-
m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
389-
390386
if (!hostDAG)
391387
{
392388
if((m_device_num == dagCreateDevice) || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
@@ -455,7 +451,7 @@ void CUDAMiner::search(
455451
m_current_index = 0;
456452
CUDA_SAFE_CALL(cudaDeviceSynchronize());
457453
for (unsigned int i = 0; i < s_numStreams; i++)
458-
m_search_buf[i][0] = 0;
454+
m_search_buf[i]->count = 0;
459455
}
460456
if (m_starting_nonce != _startN)
461457
{
@@ -472,7 +468,7 @@ void CUDAMiner::search(
472468
m_current_index = 0;
473469
CUDA_SAFE_CALL(cudaDeviceSynchronize());
474470
for (unsigned int i = 0; i < s_numStreams; i++)
475-
m_search_buf[i][0] = 0;
471+
m_search_buf[i]->count = 0;
476472
}
477473
}
478474
uint64_t batch_size = s_gridSize * s_blockSize;
@@ -482,38 +478,42 @@ void CUDAMiner::search(
482478
m_current_nonce += batch_size;
483479
auto stream_index = m_current_index % s_numStreams;
484480
cudaStream_t stream = m_streams[stream_index];
485-
volatile uint32_t* buffer = m_search_buf[stream_index];
481+
volatile search_results* buffer = m_search_buf[stream_index];
486482
uint32_t found_count = 0;
487-
uint64_t nonces[SEARCH_RESULT_ENTRIES];
488-
uint32_t mixes[SEARCH_RESULT_ENTRIES][8];
483+
uint64_t nonces[SEARCH_RESULTS];
484+
uint32_t mixes[SEARCH_RESULTS][8];
489485
uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
490486
if (m_current_index >= s_numStreams)
491487
{
492488
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
493-
found_count = buffer[0];
489+
found_count = buffer->count;
494490
if (found_count) {
495-
buffer[0] = 0;
496-
if (found_count >= SEARCH_RESULT_ENTRIES)
497-
found_count = SEARCH_RESULT_ENTRIES - 1;
498-
for (unsigned int j = 1; j <= found_count; j++) {
499-
nonces[j] = nonce_base + buffer[j];
500-
mixes[j][0] = buffer[j + (SEARCH_RESULT_ENTRIES * 1)];
501-
mixes[j][1] = buffer[j + (SEARCH_RESULT_ENTRIES * 2)];
502-
mixes[j][2] = buffer[j + (SEARCH_RESULT_ENTRIES * 3)];
503-
mixes[j][3] = buffer[j + (SEARCH_RESULT_ENTRIES * 4)];
504-
mixes[j][4] = buffer[j + (SEARCH_RESULT_ENTRIES * 5)];
505-
mixes[j][5] = buffer[j + (SEARCH_RESULT_ENTRIES * 6)];
506-
mixes[j][6] = buffer[j + (SEARCH_RESULT_ENTRIES * 7)];
507-
mixes[j][7] = buffer[j + (SEARCH_RESULT_ENTRIES * 8)];
491+
buffer->count = 0;
492+
if (found_count > SEARCH_RESULTS)
493+
found_count = SEARCH_RESULTS;
494+
for (unsigned int j = 0; j < found_count; j++) {
495+
nonces[j] = nonce_base + buffer->result[j].gid;
496+
mixes[j][0] = buffer->result[j].mix[0];
497+
mixes[j][1] = buffer->result[j].mix[1];
498+
mixes[j][2] = buffer->result[j].mix[2];
499+
mixes[j][3] = buffer->result[j].mix[3];
500+
mixes[j][4] = buffer->result[j].mix[4];
501+
mixes[j][5] = buffer->result[j].mix[5];
502+
mixes[j][6] = buffer->result[j].mix[6];
503+
mixes[j][7] = buffer->result[j].mix[7];
508504
}
509505
}
510506
}
511-
run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
507+
run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce, m_parallelHash);
512508
if (m_current_index >= s_numStreams)
513509
{
514510
if (found_count)
515-
for (uint32_t i = 1; i <= found_count; i++)
516-
farm.submitProof(Solution{nonces[i], *((h256 *)mixes[i]), w.header, w.seed, w.boundary, w.job, w.job_len, m_abort});
511+
for (uint32_t i = 0; i < found_count; i++)
512+
farm.submitProof(
513+
Solution{nonces[i],
514+
*((const h256 *)mixes[i]),
515+
w,
516+
m_abort});
517517
addHashCount(batch_size);
518518
if (m_abort || shouldStop())
519519
{

‎libethash-cuda/CUDAMiner.h

+1-5
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ along with cpp-ethereum. If not, see <http://www.gnu.org/licenses/>.
3030
#include "ethash_cuda_miner_kernel.h"
3131
#include "libethash/internal.h"
3232

33-
#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
34-
#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
35-
3633
namespace dev
3734
{
3835
namespace eth
@@ -117,15 +114,14 @@ class CUDAMiner: public Miner
117114
uint64_t m_current_nonce;
118115
uint64_t m_starting_nonce;
119116
uint64_t m_current_index;
120-
uint32_t m_sharedBytes;
121117

122118
///Constants on GPU
123119
hash128_t* m_dag = nullptr;
124120
std::vector<hash64_t*> m_light;
125121
uint32_t m_dag_size = -1;
126122
uint32_t m_device_num;
127123

128-
volatile uint32_t ** m_search_buf;
124+
volatile search_results** m_search_buf;
129125
cudaStream_t * m_streams;
130126

131127
/// The local work size for the search

‎libethash-cuda/dagger_shared.cuh

-72
This file was deleted.

‎libethash-cuda/dagger_shuffled.cuh

+11-28
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
#include "cuda_helper.h"
44

55
template <uint32_t _PARALLEL_HASH>
6-
__device__ __forceinline__ uint64_t compute_hash(
6+
__device__ __forceinline__ bool compute_hash(
77
uint64_t nonce,
8+
uint64_t target,
89
uint2 *mix_hash
910
)
1011
{
@@ -31,13 +32,8 @@ __device__ __forceinline__ uint64_t compute_hash(
3132
uint2 shuffle[8];
3233
for (int j = 0; j < 8; j++)
3334
{
34-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
35-
shuffle[j].x = __shfl(state[j].x, i+p, THREADS_PER_HASH);
36-
shuffle[j].y = __shfl(state[j].y, i+p, THREADS_PER_HASH);
37-
#else
3835
shuffle[j].x = __shfl_sync(0xFFFFFFFF,state[j].x, i+p, THREADS_PER_HASH);
3936
shuffle[j].y = __shfl_sync(0xFFFFFFFF,state[j].y, i+p, THREADS_PER_HASH);
40-
#endif
4137
}
4238
switch (mix_idx)
4339
{
@@ -46,11 +42,7 @@ __device__ __forceinline__ uint64_t compute_hash(
4642
case 2: mix[p] = vectorize2(shuffle[4], shuffle[5]); break;
4743
case 3: mix[p] = vectorize2(shuffle[6], shuffle[7]); break;
4844
}
49-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
50-
init0[p] = __shfl(shuffle[0].x, 0, THREADS_PER_HASH);
51-
#else
5245
init0[p] = __shfl_sync(0xFFFFFFFF,shuffle[0].x, 0, THREADS_PER_HASH);
53-
#endif
5446
}
5547

5648
for (uint32_t a = 0; a < ACCESSES; a += 4)
@@ -62,11 +54,7 @@ __device__ __forceinline__ uint64_t compute_hash(
6254
for (int p = 0; p < _PARALLEL_HASH; p++)
6355
{
6456
offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t *)&mix[p])[b]) % d_dag_size;
65-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
66-
offset[p] = __shfl(offset[p], t, THREADS_PER_HASH);
67-
#else
6857
offset[p] = __shfl_sync(0xFFFFFFFF,offset[p], t, THREADS_PER_HASH);
69-
#endif
7058
}
7159
#pragma unroll
7260
for (int p = 0; p < _PARALLEL_HASH; p++)
@@ -86,16 +74,6 @@ __device__ __forceinline__ uint64_t compute_hash(
8674
uint32_t thread_mix = fnv_reduce(mix[p]);
8775

8876
// update mix accross threads
89-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
90-
shuffle[0].x = __shfl(thread_mix, 0, THREADS_PER_HASH);
91-
shuffle[0].y = __shfl(thread_mix, 1, THREADS_PER_HASH);
92-
shuffle[1].x = __shfl(thread_mix, 2, THREADS_PER_HASH);
93-
shuffle[1].y = __shfl(thread_mix, 3, THREADS_PER_HASH);
94-
shuffle[2].x = __shfl(thread_mix, 4, THREADS_PER_HASH);
95-
shuffle[2].y = __shfl(thread_mix, 5, THREADS_PER_HASH);
96-
shuffle[3].x = __shfl(thread_mix, 6, THREADS_PER_HASH);
97-
shuffle[3].y = __shfl(thread_mix, 7, THREADS_PER_HASH);
98-
#else
9977
shuffle[0].x = __shfl_sync(0xFFFFFFFF,thread_mix, 0, THREADS_PER_HASH);
10078
shuffle[0].y = __shfl_sync(0xFFFFFFFF,thread_mix, 1, THREADS_PER_HASH);
10179
shuffle[1].x = __shfl_sync(0xFFFFFFFF,thread_mix, 2, THREADS_PER_HASH);
@@ -104,7 +82,7 @@ __device__ __forceinline__ uint64_t compute_hash(
10482
shuffle[2].y = __shfl_sync(0xFFFFFFFF,thread_mix, 5, THREADS_PER_HASH);
10583
shuffle[3].x = __shfl_sync(0xFFFFFFFF,thread_mix, 6, THREADS_PER_HASH);
10684
shuffle[3].y = __shfl_sync(0xFFFFFFFF,thread_mix, 7, THREADS_PER_HASH);
107-
#endif
85+
10886
if ((i+p) == thread_id) {
10987
//move mix into state:
11088
state[8] = shuffle[0];
@@ -114,11 +92,16 @@ __device__ __forceinline__ uint64_t compute_hash(
11492
}
11593
}
11694
}
95+
96+
// keccak_256(keccak_512(header..nonce) .. mix);
97+
if (cuda_swab64(keccak_f1600_final(state)) > target)
98+
return true;
99+
117100
mix_hash[0] = state[8];
118101
mix_hash[1] = state[9];
119102
mix_hash[2] = state[10];
120103
mix_hash[3] = state[11];
121-
122-
// keccak_256(keccak_512(header..nonce) .. mix);
123-
return keccak_f1600_final(state);
104+
105+
return false;
124106
}
107+

0 commit comments

Comments
 (0)
This repository has been archived.