Skip to content
This repository was archived by the owner on Apr 24, 2022. It is now read-only.

Commit e4b5d81

Browse files
committedJan 28, 2018
Consolidate mix and size to powers of 2
Allows CUDA to use shifts instead of multiplies and sequential access of the mix. Assume cuda arch >= 3 and cuda toolkit >= 9 and remove deprecated code and definitions.
1 parent 1a6a24b commit e4b5d81

7 files changed

+57
-942
lines changed
 

‎libethash-cuda/CUDAMiner.cpp

+23-25
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ bool CUDAMiner::cuda_init(
322322

323323
cudalog << "Using device: " << device_props.name << " (Compute " + to_string(device_props.major) + "." + to_string(device_props.minor) + ")";
324324

325-
m_search_buf = new volatile uint32_t *[s_numStreams];
325+
m_search_buf = new volatile search_results *[s_numStreams];
326326
m_streams = new cudaStream_t[s_numStreams];
327327

328328
uint64_t dagSize = ethash_get_datasize(_light->block_number);
@@ -374,7 +374,7 @@ bool CUDAMiner::cuda_init(
374374
cudalog << "Generating mining buffers"; //TODO whats up with this?
375375
for (unsigned i = 0; i != s_numStreams; ++i)
376376
{
377-
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], SEARCH_RESULT_BUFFER_SIZE * sizeof(uint32_t)));
377+
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(search_results)));
378378
CUDA_SAFE_CALL(cudaStreamCreate(&m_streams[i]));
379379
}
380380

@@ -383,8 +383,6 @@ bool CUDAMiner::cuda_init(
383383
m_current_nonce = 0;
384384
m_current_index = 0;
385385

386-
m_sharedBytes = device_props.major * 100 < SHUFFLE_MIN_VER ? (64 * s_blockSize) / 8 : 0 ;
387-
388386
if (!hostDAG)
389387
{
390388
if((m_device_num == dagCreateDevice) || !_cpyToHost){ //if !cpyToHost -> All devices shall generate their DAG
@@ -453,7 +451,7 @@ void CUDAMiner::search(
453451
m_current_index = 0;
454452
CUDA_SAFE_CALL(cudaDeviceSynchronize());
455453
for (unsigned int i = 0; i < s_numStreams; i++)
456-
m_search_buf[i][0] = 0;
454+
m_search_buf[i]->count = 0;
457455
}
458456
if (m_starting_nonce != _startN)
459457
{
@@ -470,7 +468,7 @@ void CUDAMiner::search(
470468
m_current_index = 0;
471469
CUDA_SAFE_CALL(cudaDeviceSynchronize());
472470
for (unsigned int i = 0; i < s_numStreams; i++)
473-
m_search_buf[i][0] = 0;
471+
m_search_buf[i]->count = 0;
474472
}
475473
}
476474
uint64_t batch_size = s_gridSize * s_blockSize;
@@ -480,37 +478,37 @@ void CUDAMiner::search(
480478
m_current_nonce += batch_size;
481479
auto stream_index = m_current_index % s_numStreams;
482480
cudaStream_t stream = m_streams[stream_index];
483-
volatile uint32_t* buffer = m_search_buf[stream_index];
481+
volatile search_results* buffer = m_search_buf[stream_index];
484482
uint32_t found_count = 0;
485-
uint64_t nonces[SEARCH_RESULT_ENTRIES];
486-
uint32_t mixes[SEARCH_RESULT_ENTRIES][8];
483+
uint64_t nonces[SEARCH_RESULTS];
484+
uint32_t mixes[SEARCH_RESULTS][8];
487485
uint64_t nonce_base = m_current_nonce - s_numStreams * batch_size;
488486
if (m_current_index >= s_numStreams)
489487
{
490488
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
491-
found_count = buffer[0];
489+
found_count = buffer->count;
492490
if (found_count) {
493-
buffer[0] = 0;
494-
if (found_count >= SEARCH_RESULT_ENTRIES)
495-
found_count = SEARCH_RESULT_ENTRIES - 1;
496-
for (unsigned int j = 1; j <= found_count; j++) {
497-
nonces[j] = nonce_base + buffer[j];
498-
mixes[j][0] = buffer[j + (SEARCH_RESULT_ENTRIES * 1)];
499-
mixes[j][1] = buffer[j + (SEARCH_RESULT_ENTRIES * 2)];
500-
mixes[j][2] = buffer[j + (SEARCH_RESULT_ENTRIES * 3)];
501-
mixes[j][3] = buffer[j + (SEARCH_RESULT_ENTRIES * 4)];
502-
mixes[j][4] = buffer[j + (SEARCH_RESULT_ENTRIES * 5)];
503-
mixes[j][5] = buffer[j + (SEARCH_RESULT_ENTRIES * 6)];
504-
mixes[j][6] = buffer[j + (SEARCH_RESULT_ENTRIES * 7)];
505-
mixes[j][7] = buffer[j + (SEARCH_RESULT_ENTRIES * 8)];
491+
buffer->count = 0;
492+
if (found_count > SEARCH_RESULTS)
493+
found_count = SEARCH_RESULTS;
494+
for (unsigned int j = 0; j < found_count; j++) {
495+
nonces[j] = nonce_base + buffer->result[j].gid;
496+
mixes[j][0] = buffer->result[j].mix[0];
497+
mixes[j][1] = buffer->result[j].mix[1];
498+
mixes[j][2] = buffer->result[j].mix[2];
499+
mixes[j][3] = buffer->result[j].mix[3];
500+
mixes[j][4] = buffer->result[j].mix[4];
501+
mixes[j][5] = buffer->result[j].mix[5];
502+
mixes[j][6] = buffer->result[j].mix[6];
503+
mixes[j][7] = buffer->result[j].mix[7];
506504
}
507505
}
508506
}
509-
run_ethash_search(s_gridSize, s_blockSize, m_sharedBytes, stream, buffer, m_current_nonce, m_parallelHash);
507+
run_ethash_search(s_gridSize, s_blockSize, stream, buffer, m_current_nonce, m_parallelHash);
510508
if (m_current_index >= s_numStreams)
511509
{
512510
if (found_count)
513-
for (uint32_t i = 1; i <= found_count; i++)
511+
for (uint32_t i = 0; i < found_count; i++)
514512
farm.submitProof(
515513
Solution{nonces[i],
516514
*((const h256 *)mixes[i]),

‎libethash-cuda/CUDAMiner.h

+1-5
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ along with cpp-ethereum. If not, see <http://www.gnu.org/licenses/>.
3030
#include "ethash_cuda_miner_kernel.h"
3131
#include "libethash/internal.h"
3232

33-
#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
34-
#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
35-
3633
namespace dev
3734
{
3835
namespace eth
@@ -117,15 +114,14 @@ class CUDAMiner: public Miner
117114
uint64_t m_current_nonce;
118115
uint64_t m_starting_nonce;
119116
uint64_t m_current_index;
120-
uint32_t m_sharedBytes;
121117

122118
///Constants on GPU
123119
hash128_t* m_dag = nullptr;
124120
std::vector<hash64_t*> m_light;
125121
uint32_t m_dag_size = -1;
126122
uint32_t m_device_num;
127123

128-
volatile uint32_t ** m_search_buf;
124+
volatile search_results** m_search_buf;
129125
cudaStream_t * m_streams;
130126

131127
/// The local work size for the search

‎libethash-cuda/dagger_shared.cuh

-72
This file was deleted.

‎libethash-cuda/dagger_shuffled.cuh

+1
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,4 @@ __device__ __forceinline__ bool compute_hash(
104104

105105
return false;
106106
}
107+

‎libethash-cuda/ethash_cuda_miner_kernel.cu

+18-55
Original file line numberDiff line numberDiff line change
@@ -12,57 +12,50 @@
1212

1313
#define copy(dst, src, count) for (int i = 0; i != count; ++i) { (dst)[i] = (src)[i]; }
1414

15-
16-
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
17-
#include "keccak_u64.cuh"
18-
#include "dagger_shared.cuh"
19-
#else
2015
#include "keccak.cuh"
2116
#include "dagger_shuffled.cuh"
22-
#endif
2317

2418
template <uint32_t _PARALLEL_HASH>
2519
__global__ void
2620
ethash_search(
27-
volatile uint32_t* g_output,
21+
volatile search_results* g_output,
2822
uint64_t start_nonce
2923
)
3024
{
3125
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
3226
uint2 mix[4];
3327
if (compute_hash<_PARALLEL_HASH>(start_nonce + gid, d_target, mix))
3428
return;
35-
uint32_t index = atomicInc(const_cast<uint32_t*>(g_output), 0xffffffff) + 1;
36-
if (index >= SEARCH_RESULT_ENTRIES)
29+
uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff);
30+
if (index >= SEARCH_RESULTS)
3731
return;
38-
g_output[index] = gid;
39-
g_output[index + (SEARCH_RESULT_ENTRIES * 1)] = mix[0].x;
40-
g_output[index + (SEARCH_RESULT_ENTRIES * 2)] = mix[0].y;
41-
g_output[index + (SEARCH_RESULT_ENTRIES * 3)] = mix[1].x;
42-
g_output[index + (SEARCH_RESULT_ENTRIES * 4)] = mix[1].y;
43-
g_output[index + (SEARCH_RESULT_ENTRIES * 5)] = mix[2].x;
44-
g_output[index + (SEARCH_RESULT_ENTRIES * 6)] = mix[2].y;
45-
g_output[index + (SEARCH_RESULT_ENTRIES * 7)] = mix[3].x;
46-
g_output[index + (SEARCH_RESULT_ENTRIES * 8)] = mix[3].y;
32+
g_output->result[index].gid = gid;
33+
g_output->result[index].mix[0] = mix[0].x;
34+
g_output->result[index].mix[1] = mix[0].y;
35+
g_output->result[index].mix[2] = mix[1].x;
36+
g_output->result[index].mix[3] = mix[1].y;
37+
g_output->result[index].mix[4] = mix[2].x;
38+
g_output->result[index].mix[5] = mix[2].y;
39+
g_output->result[index].mix[6] = mix[3].x;
40+
g_output->result[index].mix[7] = mix[3].y;
4741
}
4842

4943
void run_ethash_search(
5044
uint32_t blocks,
5145
uint32_t threads,
52-
uint32_t sharedbytes,
5346
cudaStream_t stream,
54-
volatile uint32_t* g_output,
47+
volatile search_results* g_output,
5548
uint64_t start_nonce,
5649
uint32_t parallelHash
5750
)
5851
{
5952
switch (parallelHash)
6053
{
61-
case 1: ethash_search <1> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
62-
case 2: ethash_search <2> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
63-
case 4: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
64-
case 8: ethash_search <8> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
65-
default: ethash_search <4> <<<blocks, threads, sharedbytes, stream >>>(g_output, start_nonce); break;
54+
case 1: ethash_search <1> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
55+
case 2: ethash_search <2> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
56+
case 4: ethash_search <4> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
57+
case 8: ethash_search <8> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
58+
default: ethash_search <4> <<<blocks, threads, 0, stream >>>(g_output, start_nonce); break;
6659
}
6760
CUDA_SAFE_CALL(cudaGetLastError());
6861
}
@@ -86,61 +79,31 @@ ethash_calculate_dag_item(uint32_t start)
8679

8780
for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i) {
8881
uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
89-
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
90-
for (unsigned w = 0; w != 4; ++w) {
91-
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], d_light[parent_index].uint4s[w]);
92-
}
93-
#else
9482
for (uint32_t t = 0; t < 4; t++) {
9583

96-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
97-
uint32_t shuffle_index = __shfl(parent_index, t, 4);
98-
#else
9984
uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,parent_index, t, 4);
100-
#endif
10185

10286
uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
10387
for (int w = 0; w < 4; w++) {
10488

105-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
106-
uint4 s4 = make_uint4(__shfl(p4.x, w, 4), __shfl(p4.y, w, 4), __shfl(p4.z, w, 4), __shfl(p4.w, w, 4));
107-
#else
10889
uint4 s4 = make_uint4(__shfl_sync(0xFFFFFFFF,p4.x, w, 4), __shfl_sync(0xFFFFFFFF,p4.y, w, 4), __shfl_sync(0xFFFFFFFF,p4.z, w, 4), __shfl_sync(0xFFFFFFFF,p4.w, w, 4));
109-
#endif
11090
if (t == thread_id) {
11191
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
11292
}
11393
}
11494
}
115-
116-
117-
#endif
11895
}
11996
SHA3_512(dag_node.uint2s);
12097
hash64_t * dag_nodes = (hash64_t *)d_dag;
12198

122-
#if __CUDA_ARCH__ < SHUFFLE_MIN_VER
123-
for (uint32_t i = 0; i < 4; i++) {
124-
dag_nodes[node_index].uint4s[i] = dag_node.uint4s[i];
125-
}
126-
#else
12799
for (uint32_t t = 0; t < 4; t++) {
128-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
129-
uint32_t shuffle_index = __shfl(node_index, t, 4);
130-
#else
131100
uint32_t shuffle_index = __shfl_sync(0xFFFFFFFF,node_index, t, 4);
132-
#endif
133101
uint4 s[4];
134102
for (uint32_t w = 0; w < 4; w++) {
135-
#if CUDA_VERSION < SHUFFLE_DEPRECATED
136-
s[w] = make_uint4(__shfl(dag_node.uint4s[w].x, t, 4), __shfl(dag_node.uint4s[w].y, t, 4), __shfl(dag_node.uint4s[w].z, t, 4), __shfl(dag_node.uint4s[w].w, t, 4));
137-
#else
138103
s[w] = make_uint4(__shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].x, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].y, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].z, t, 4), __shfl_sync(0xFFFFFFFF,dag_node.uint4s[w].w, t, 4));
139-
#endif
140104
}
141105
dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
142106
}
143-
#endif
144107
}
145108

146109
void ethash_generate_dag(

‎libethash-cuda/ethash_cuda_miner_kernel.h

+14-8
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,22 @@
77

88
// It is virtually impossible to get more than
99
// one solution per stream hash calculation
10-
// Leave room for up to 3 results.
11-
#define SEARCH_RESULT_ENTRIES 4
12-
// One word for gid and 8 for mix hash
13-
#define SEARCH_RESULT_BUFFER_SIZE (SEARCH_RESULT_ENTRIES * 9)
10+
// Leave room for up to 4 results. A power
11+
// of 2 here will yield better CUDA optimization
12+
#define SEARCH_RESULTS 4
13+
14+
typedef struct {
15+
uint32_t count;
16+
struct {
17+
// One word for gid and 8 for mix hash
18+
uint32_t gid;
19+
uint32_t mix[8];
20+
uint32_t pad[7]; // pad to size power of 2
21+
} result[SEARCH_RESULTS];
22+
} search_results;
1423

1524
#define ACCESSES 64
1625
#define THREADS_PER_HASH (128 / 16)
17-
#define SHUFFLE_MIN_VER 300 //__CUDA_ARCH_
18-
#define SHUFFLE_DEPRECATED 9000 //CUDA_VERSION
1926

2027
typedef struct
2128
{
@@ -57,9 +64,8 @@ void set_target(
5764
void run_ethash_search(
5865
uint32_t search_batch_size,
5966
uint32_t workgroup_size,
60-
uint32_t sharedbytes,
6167
cudaStream_t stream,
62-
volatile uint32_t* g_output,
68+
volatile search_results* g_output,
6369
uint64_t start_nonce,
6470
uint32_t parallelHash
6571
);

‎libethash-cuda/keccak_u64.cuh

-777
This file was deleted.

0 commit comments

Comments
 (0)
This repository has been archived.