Skip to content

Commit cef32f1

Browse files
authored
Merge pull request #201 from SChernykh/dev
Added rx/yada support
2 parents 67e8e09 + c6292cf commit cef32f1

9 files changed

+127
-24
lines changed

cmake/CUDA.cmake

+3-2
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ if (NOT CUDA_VERSION VERSION_LESS 11.5)
5757
endif()
5858

5959
if (NOT CUDA_VERSION VERSION_LESS 11.8)
60+
list(APPEND DEFAULT_CUDA_ARCH "89")
6061
list(APPEND DEFAULT_CUDA_ARCH "90")
6162
endif()
6263
list(SORT DEFAULT_CUDA_ARCH)
@@ -228,14 +229,14 @@ if (WITH_RANDOMX)
228229
src/RandomX/graft/configuration.h
229230
src/RandomX/graft/randomx_graft.cu
230231
src/RandomX/hash.hpp
231-
src/RandomX/keva/configuration.h
232-
src/RandomX/keva/randomx_keva.cu
233232
src/RandomX/monero/configuration.h
234233
src/RandomX/monero/randomx_monero.cu
235234
src/RandomX/randomx_cuda.hpp
236235
src/RandomX/randomx.cu
237236
src/RandomX/wownero/configuration.h
238237
src/RandomX/wownero/randomx_wownero.cu
238+
src/RandomX/yada/configuration.h
239+
src/RandomX/yada/randomx_yada.cu
239240
)
240241
else()
241242
set(CUDA_RANDOMX_SOURCES "")

src/RandomX/blake2b_cuda.hpp

+103
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,90 @@ __device__ void blake2b_512_process_double_block(uint64_t *out, uint64_t* m, con
205205
if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15];
206206
}
207207

208+
// Computes a BLAKE2b hash of an input of arbitrary length (in 128-byte blocks),
// patching a 32-bit nonce into the message on the fly.
//
// Template parameter:
//   out_len      - digest length in bytes; it is mixed into h[0] and decides how many
//                  64-bit words of `out` are written (out[i] is written when out_len > 8 * i).
// Parameters:
//   out          - receives the digest words.
//   in           - message, read as 64-bit words. The word holding the last message byte is
//                  read in full, so the buffer must be readable up to the next 8-byte boundary.
//   in_len       - message length in bytes.
//   nonce        - value inserted byte by byte, least significant byte first.
//   nonce_offset - byte offset of the first nonce byte within the message.
//
// NOTE(review): the nonce bytes are OR-ed into the message words, not masked-and-set, so this
// presumably relies on the template holding zero bytes at the nonce position - confirm with caller.
// NOTE(review): BLAKE2B_ROUNDS() is a macro defined outside this block; it presumably operates
// on the local arrays `m` and `v` by name - do not rename them without checking the macro.
template<uint32_t out_len>
209+
__device__ void blake2b_512_process_big_block(uint64_t* out, const uint64_t* in, uint32_t in_len, uint32_t nonce, uint32_t nonce_offset)
210+
{
211+
// Initial chaining value: IV with the parameter word XOR-ed into h[0]
// (0x01010000 | out_len = fanout 1, depth 1, key length 0, digest length out_len per the BLAKE2b parameter block).
uint64_t h[8] = { Blake2b_IV::iv0 ^ (0x01010000u | out_len), Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4, Blake2b_IV::iv5, Blake2b_IV::iv6, Blake2b_IV::iv7 };
212+
213+
// Compress every 128-byte block that is followed by more input. `t` is the byte count through
// the end of the current block; the strict `t < in_len` leaves the last block (even a full one)
// for the finalization code below.
for (uint32_t t = 128; t < in_len; t += 128, in += 16) {
214+
uint64_t m[16] = { in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in[8], in[9], in[10], in[11], in[12], in[13], in[14], in[15] };
215+
216+
// Position of each nonce byte relative to the start of this block. The arithmetic is unsigned,
// so a byte located before this block wraps to a huge value and fails the `< 128` test below.
const uint32_t k0 = (nonce_offset + 0) - (t - 128);
217+
const uint32_t k1 = (nonce_offset + 1) - (t - 128);
218+
const uint32_t k2 = (nonce_offset + 2) - (t - 128);
219+
const uint32_t k3 = (nonce_offset + 3) - (t - 128);
220+
221+
// Insert whichever nonce bytes fall inside this block (k / 8 = word index, k % 8 = byte lane).
if (k0 < 128) m[k0 / 8] |= (uint64_t)((nonce >> 0) & 255) << ((k0 % 8) * 8);
222+
if (k1 < 128) m[k1 / 8] |= (uint64_t)((nonce >> 8) & 255) << ((k1 % 8) * 8);
223+
if (k2 < 128) m[k2 / 8] |= (uint64_t)((nonce >> 16) & 255) << ((k2 % 8) * 8);
224+
if (k3 < 128) m[k3 / 8] |= (uint64_t)((nonce >> 24) & 255) << ((k3 % 8) * 8);
225+
226+
// Working vector: chaining value followed by the IV, with the byte counter `t` XOR-ed into v[12].
// No finalization flag is set here (v[14] is the plain iv6).
uint64_t v[16] = { h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], Blake2b_IV::iv0, Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4 ^ t, Blake2b_IV::iv5, Blake2b_IV::iv6, Blake2b_IV::iv7 };
227+
228+
BLAKE2B_ROUNDS();
229+
230+
// Feed-forward: fold both halves of the working vector back into the chaining value.
h[0] ^= v[0] ^ v[8];
231+
h[1] ^= v[1] ^ v[9];
232+
h[2] ^= v[2] ^ v[10];
233+
h[3] ^= v[3] ^ v[11];
234+
h[4] ^= v[4] ^ v[12];
235+
h[5] ^= v[5] ^ v[13];
236+
h[6] ^= v[6] ^ v[14];
237+
h[7] ^= v[7] ^ v[15];
238+
}
239+
240+
// k = number of message bytes in the final block; a remainder of 0 is treated as a full 128-byte block.
uint32_t k = in_len & 127;
241+
if (k == 0) k = 128;
242+
243+
// Load only the words that contain at least one valid byte; the rest of the block is zero padding.
uint64_t m[16] = {
244+
(k > 0) ? in[0] : 0,
245+
(k > 8) ? in[1] : 0,
246+
(k > 16) ? in[2] : 0,
247+
(k > 24) ? in[3] : 0,
248+
(k > 32) ? in[4] : 0,
249+
(k > 40) ? in[5] : 0,
250+
(k > 48) ? in[6] : 0,
251+
(k > 56) ? in[7] : 0,
252+
(k > 64) ? in[8] : 0,
253+
(k > 72) ? in[9] : 0,
254+
(k > 80) ? in[10] : 0,
255+
(k > 88) ? in[11] : 0,
256+
(k > 96) ? in[12] : 0,
257+
(k > 104) ? in[13] : 0,
258+
(k > 112) ? in[14] : 0,
259+
(k > 120) ? in[15] : 0
260+
};
261+
262+
// Byte offset at which the final block starts.
const uint32_t t = in_len - k;
263+
264+
// Nonce byte positions relative to the final block (same unsigned wrap-around trick as in the loop).
const uint32_t k0 = nonce_offset + 0 - t;
265+
const uint32_t k1 = nonce_offset + 1 - t;
266+
const uint32_t k2 = nonce_offset + 2 - t;
267+
const uint32_t k3 = nonce_offset + 3 - t;
268+
269+
// Only nonce bytes that lie inside the k valid bytes of the final block are inserted.
if (k0 < k) m[k0 / 8] |= (uint64_t)((nonce >> 0) & 255) << ((k0 % 8) * 8);
270+
if (k1 < k) m[k1 / 8] |= (uint64_t)((nonce >> 8) & 255) << ((k1 % 8) * 8);
271+
if (k2 < k) m[k2 / 8] |= (uint64_t)((nonce >> 16) & 255) << ((k2 % 8) * 8);
272+
if (k3 < k) m[k3 / 8] |= (uint64_t)((nonce >> 24) & 255) << ((k3 % 8) * 8);
273+
274+
// When the message ends mid-word, keep only the low (k % 8) bytes of that word so that
// whatever followed the message in memory does not leak into the padding.
if (k % 8) {
275+
m[k / 8] &= (uint64_t)(-1) >> (64 - (k % 8) * 8);
276+
}
277+
278+
// Final compression: the counter is the total message length and v[14] is inverted (~iv6),
// which marks the last block.
uint64_t v[16] = { h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], Blake2b_IV::iv0, Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4 ^ in_len, Blake2b_IV::iv5, ~Blake2b_IV::iv6, Blake2b_IV::iv7 };
279+
280+
BLAKE2B_ROUNDS();
281+
282+
// Emit the digest; out_len is a template constant, so unused words are simply not written.
if (out_len > 0) out[0] = h[0] ^ v[0] ^ v[8];
283+
if (out_len > 8) out[1] = h[1] ^ v[1] ^ v[9];
284+
if (out_len > 16) out[2] = h[2] ^ v[2] ^ v[10];
285+
if (out_len > 24) out[3] = h[3] ^ v[3] ^ v[11];
286+
if (out_len > 32) out[4] = h[4] ^ v[4] ^ v[12];
287+
if (out_len > 40) out[5] = h[5] ^ v[5] ^ v[13];
288+
if (out_len > 48) out[6] = h[6] ^ v[6] ^ v[14];
289+
if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15];
290+
}
291+
208292
#undef G
209293
#undef ROUND
210294
#undef BLAKE2B_ROUNDS
@@ -280,6 +364,25 @@ __global__ void blake2b_initial_hash_double(void* out, const void* blockTemplate
280364
t[7] = hash[7];
281365
}
282366

367+
// Kernel: each thread computes one 64-byte BLAKE2b digest of the block template with its own
// nonce (start_nonce + flat thread index) inserted at byte offset nonce_offset.
//
// Parameters:
//   out               - output buffer; thread i writes 8 64-bit words starting at word i * 8.
//   blockTemplate     - message to hash, read as 64-bit words.
//   blockTemplateSize - message length in bytes.
//   start_nonce       - nonce used by the thread with flat index 0.
//   nonce_offset      - byte offset of the nonce within the template.
//
// There is no bounds check on the thread index, so `out` must have room for 64 bytes per
// launched thread (1D launch; index is blockIdx.x * blockDim.x + threadIdx.x).
__global__ void blake2b_initial_hash_big(void* out, const void* blockTemplate, uint32_t blockTemplateSize, uint32_t start_nonce, uint32_t nonce_offset)
368+
{
369+
const uint32_t global_index = blockIdx.x * blockDim.x + threadIdx.x;
370+
const uint64_t* p = (const uint64_t*)blockTemplate;
371+
372+
// Hash into a local array first, then copy the result out below.
uint64_t hash[8];
373+
blake2b_512_process_big_block<64>(hash, p, blockTemplateSize, start_nonce + global_index, nonce_offset);
374+
375+
// Each thread owns a contiguous 8-word (64-byte) slot in the output buffer.
uint64_t* t = ((uint64_t*) out) + global_index * 8;
376+
t[0] = hash[0];
377+
t[1] = hash[1];
378+
t[2] = hash[2];
379+
t[3] = hash[3];
380+
t[4] = hash[4];
381+
t[5] = hash[5];
382+
t[6] = hash[6];
383+
t[7] = hash[7];
384+
}
385+
283386
template<uint32_t registers_len, uint32_t registers_stride, uint32_t out_len>
284387
__global__ void blake2b_hash_registers(void *out, const void* in)
285388
{

src/RandomX/hash.hpp

+2-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ __global__ void find_shares(const void* hashes, uint64_t target, uint32_t* share
3232
}
3333
}
3434

35-
void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)
35+
void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)
3636
{
3737
if (ctx->inputlen <= 128) {
3838
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
@@ -41,8 +41,7 @@ void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, ui
4141
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_double << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
4242
}
4343
else {
44-
*rescount = 0;
45-
return;
44+
CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_big << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce, nonce_offset));
4645
}
4746

4847
CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<RANDOMX_SCRATCHPAD_L3, false, 64><<<batch_size / 32, 32 * 4>>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size));

src/RandomX/keva/configuration.h src/RandomX/yada/configuration.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3232
#define RANDOMX_ARGON_MEMORY 262144
3333

3434
//Number of Argon2d iterations for Cache initialization.
35-
#define RANDOMX_ARGON_ITERATIONS 3
35+
#define RANDOMX_ARGON_ITERATIONS 4
3636

3737
//Number of parallel lanes for Cache initialization.
3838
#define RANDOMX_ARGON_LANES 1
3939

4040
//Argon2d salt
41-
#define RANDOMX_ARGON_SALT "RandomKV\x01"
41+
#define RANDOMX_ARGON_SALT "RandomXYadaCoin\x03"
4242

4343
//Number of random Cache accesses per Dataset item. Minimum is 2.
4444
#define RANDOMX_CACHE_ACCESSES 8
4545

4646
//Target latency for SuperscalarHash (in cycles of the reference CPU).
47-
#define RANDOMX_SUPERSCALAR_LATENCY 170
47+
#define RANDOMX_SUPERSCALAR_LATENCY 150
4848

4949
//Dataset base size in bytes. Must be a power of 2.
5050
#define RANDOMX_DATASET_BASE_SIZE 2147483648
@@ -62,10 +62,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6262
#define RANDOMX_PROGRAM_COUNT 8
6363

6464
//Scratchpad L3 size in bytes. Must be a power of 2.
65-
#define RANDOMX_SCRATCHPAD_L3 1048576
65+
#define RANDOMX_SCRATCHPAD_L3 2097152
6666

6767
//Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3.
68-
#define RANDOMX_SCRATCHPAD_L2 131072
68+
#define RANDOMX_SCRATCHPAD_L2 262144
6969

7070
//Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2.
7171
#define RANDOMX_SCRATCHPAD_L1 16384

src/RandomX/keva/randomx_keva.cu src/RandomX/yada/randomx_yada.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ along with RandomX CUDA. If not, see<http://www.gnu.org/licenses/>.
2626
#include <cstdint>
2727

2828

29-
namespace RandomX_Keva {
29+
namespace RandomX_Yada {
3030
#include "configuration.h"
3131
#define fillAes4Rx4 fillAes4Rx4_v104
3232
#include "RandomX/common.hpp"

src/crypto/common/Algorithm.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id)
4343
CN_UPX2,
4444
# endif
4545
# ifdef XMRIG_ALGO_RANDOMX
46-
RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA,
46+
RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_YADA,
4747
# endif
4848
# ifdef XMRIG_ALGO_ARGON2
4949
AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ,

src/crypto/common/Algorithm.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class Algorithm
5959
RX_ARQ = 0x72121061, // "rx/arq" RandomARQ (Arqma).
6060
RX_GRAFT = 0x72151267, // "rx/graft" RandomGRAFT (Graft).
6161
RX_SFX = 0x72151273, // "rx/sfx" RandomSFX (Safex Cash).
62-
RX_KEVA = 0x7214116b, // "rx/keva" RandomKEVA (Keva).
62+
RX_YADA = 0x72151279, // "rx/yada" RandomYada (YadaCoin).
6363
AR2_CHUKWA = 0x61130000, // "argon2/chukwa" Argon2id (Chukwa).
6464
AR2_CHUKWA_V2 = 0x61140000, // "argon2/chukwav2" Argon2id (Chukwa v2).
6565
AR2_WRKZ = 0x61120000, // "argon2/wrkz" Argon2id (WRKZ)

src/cryptonight.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,11 @@ void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t ta
121121
void cuda_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
122122
void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size);
123123

124-
namespace RandomX_Arqma { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
125-
namespace RandomX_Monero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
126-
namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
127-
namespace RandomX_Keva { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
128-
namespace RandomX_Graft { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
124+
namespace RandomX_Arqma { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
125+
namespace RandomX_Monero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
126+
namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
127+
namespace RandomX_Graft { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
128+
namespace RandomX_Yada { void hash(nvid_ctx* ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t* rescount, uint32_t* resnonce, uint32_t batch_size); }
129129

130130
#ifdef XMRIG_ALGO_KAWPOW
131131
void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes);

src/xmrig-cuda.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -185,23 +185,23 @@ bool rxHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *resco
185185
switch (ctx->algorithm.id()) {
186186
case Algorithm::RX_0:
187187
case Algorithm::RX_SFX:
188-
RandomX_Monero::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
188+
RandomX_Monero::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
189189
break;
190190

191191
case Algorithm::RX_WOW:
192-
RandomX_Wownero::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
192+
RandomX_Wownero::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
193193
break;
194194

195195
case Algorithm::RX_ARQ:
196-
RandomX_Arqma::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
196+
RandomX_Arqma::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
197197
break;
198198

199-
case Algorithm::RX_KEVA:
200-
RandomX_Keva::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
199+
case Algorithm::RX_YADA:
200+
RandomX_Yada::hash(ctx, startNonce, 147, target, rescount, resnonce, ctx->rx_batch_size);
201201
break;
202202

203203
case Algorithm::RX_GRAFT:
204-
RandomX_Graft::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
204+
RandomX_Graft::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
205205
break;
206206

207207
default:

0 commit comments

Comments
 (0)