Merge pull request #201 from SChernykh/dev

xmrig · web-flow · commit cef32f1ceea5 · 2024-08-07T19:56:12.000+07:00
Added rx/yada support
diff --git a/cmake/CUDA.cmake b/cmake/CUDA.cmake
@@ -57,6 +57,7 @@ if (NOT CUDA_VERSION VERSION_LESS 11.5)
 endif()
 
 if (NOT CUDA_VERSION VERSION_LESS 11.8)
+    list(APPEND DEFAULT_CUDA_ARCH "89")
     list(APPEND DEFAULT_CUDA_ARCH "90")
 endif()
 list(SORT DEFAULT_CUDA_ARCH)
@@ -228,14 +229,14 @@ if (WITH_RANDOMX)
         src/RandomX/graft/configuration.h
         src/RandomX/graft/randomx_graft.cu
         src/RandomX/hash.hpp
-        src/RandomX/keva/configuration.h
-        src/RandomX/keva/randomx_keva.cu
         src/RandomX/monero/configuration.h
         src/RandomX/monero/randomx_monero.cu
         src/RandomX/randomx_cuda.hpp
         src/RandomX/randomx.cu
         src/RandomX/wownero/configuration.h
         src/RandomX/wownero/randomx_wownero.cu
+        src/RandomX/yada/configuration.h
+        src/RandomX/yada/randomx_yada.cu
     )
 else()
     set(CUDA_RANDOMX_SOURCES "")
diff --git a/src/RandomX/blake2b_cuda.hpp b/src/RandomX/blake2b_cuda.hpp
@@ -205,6 +205,90 @@ __device__ void blake2b_512_process_double_block(uint64_t *out, uint64_t* m, con
 	if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15];
 }
 
+template<uint32_t out_len>
+__device__ void blake2b_512_process_big_block(uint64_t* out, const uint64_t* in, uint32_t in_len, uint32_t nonce, uint32_t nonce_offset)
+{
+	uint64_t h[8] = { Blake2b_IV::iv0 ^ (0x01010000u | out_len), Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4, Blake2b_IV::iv5, Blake2b_IV::iv6, Blake2b_IV::iv7 };
+
+	for (uint32_t t = 128; t < in_len; t += 128, in += 16) {
+		uint64_t m[16] = { in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in[8], in[9], in[10], in[11],  in[12], in[13], in[14], in[15] };
+
+		const uint32_t k0 = (nonce_offset + 0) - (t - 128);
+		const uint32_t k1 = (nonce_offset + 1) - (t - 128);
+		const uint32_t k2 = (nonce_offset + 2) - (t - 128);
+		const uint32_t k3 = (nonce_offset + 3) - (t - 128);
+
+		if (k0 < 128) m[k0 / 8] |= (uint64_t)((nonce >> 0) & 255) << ((k0 % 8) * 8);
+		if (k1 < 128) m[k1 / 8] |= (uint64_t)((nonce >> 8) & 255) << ((k1 % 8) * 8);
+		if (k2 < 128) m[k2 / 8] |= (uint64_t)((nonce >> 16) & 255) << ((k2 % 8) * 8);
+		if (k3 < 128) m[k3 / 8] |= (uint64_t)((nonce >> 24) & 255) << ((k3 % 8) * 8);
+
+		uint64_t v[16] = { h[0],  h[1],  h[2],  h[3],  h[4],  h[5],  h[6],  h[7],   Blake2b_IV::iv0,   Blake2b_IV::iv1,    Blake2b_IV::iv2,    Blake2b_IV::iv3, Blake2b_IV::iv4 ^ t,    Blake2b_IV::iv5,    Blake2b_IV::iv6,    Blake2b_IV::iv7 };
+
+		BLAKE2B_ROUNDS();
+
+		h[0] ^= v[0] ^ v[8];
+		h[1] ^= v[1] ^ v[9];
+		h[2] ^= v[2] ^ v[10];
+		h[3] ^= v[3] ^ v[11];
+		h[4] ^= v[4] ^ v[12];
+		h[5] ^= v[5] ^ v[13];
+		h[6] ^= v[6] ^ v[14];
+		h[7] ^= v[7] ^ v[15];
+	}
+
+	uint32_t k = in_len & 127;
+	if (k == 0) k = 128;
+
+	uint64_t m[16] = {
+		(k > 0) ? in[0] : 0,
+		(k > 8) ? in[1] : 0,
+		(k > 16) ? in[2] : 0,
+		(k > 24) ? in[3] : 0,
+		(k > 32) ? in[4] : 0,
+		(k > 40) ? in[5] : 0,
+		(k > 48) ? in[6] : 0,
+		(k > 56) ? in[7] : 0,
+		(k > 64) ? in[8] : 0,
+		(k > 72) ? in[9] : 0,
+		(k > 80) ? in[10] : 0,
+		(k > 88) ? in[11] : 0,
+		(k > 96) ? in[12] : 0,
+		(k > 104) ? in[13] : 0,
+		(k > 112) ? in[14] : 0,
+		(k > 120) ? in[15] : 0
+	};
+
+	const uint32_t t = in_len - k;
+
+	const uint32_t k0 = nonce_offset + 0 - t;
+	const uint32_t k1 = nonce_offset + 1 - t;
+	const uint32_t k2 = nonce_offset + 2 - t;
+	const uint32_t k3 = nonce_offset + 3 - t;
+
+	if (k0 < k) m[k0 / 8] |= (uint64_t)((nonce >> 0) & 255) << ((k0 % 8) * 8);
+	if (k1 < k) m[k1 / 8] |= (uint64_t)((nonce >> 8) & 255) << ((k1 % 8) * 8);
+	if (k2 < k) m[k2 / 8] |= (uint64_t)((nonce >> 16) & 255) << ((k2 % 8) * 8);
+	if (k3 < k) m[k3 / 8] |= (uint64_t)((nonce >> 24) & 255) << ((k3 % 8) * 8);
+
+	if (k % 8) {
+		m[k / 8] &= (uint64_t)(-1) >> (64 - (k % 8) * 8);
+	}
+
+	uint64_t v[16] = { h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], Blake2b_IV::iv0, Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4 ^ in_len, Blake2b_IV::iv5, ~Blake2b_IV::iv6, Blake2b_IV::iv7 };
+
+	BLAKE2B_ROUNDS();
+
+	if (out_len > 0) out[0] = h[0] ^ v[0] ^ v[8];
+	if (out_len > 8) out[1] = h[1] ^ v[1] ^ v[9];
+	if (out_len > 16) out[2] = h[2] ^ v[2] ^ v[10];
+	if (out_len > 24) out[3] = h[3] ^ v[3] ^ v[11];
+	if (out_len > 32) out[4] = h[4] ^ v[4] ^ v[12];
+	if (out_len > 40) out[5] = h[5] ^ v[5] ^ v[13];
+	if (out_len > 48) out[6] = h[6] ^ v[6] ^ v[14];
+	if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15];
+}
+
 #undef G
 #undef ROUND
 #undef BLAKE2B_ROUNDS
@@ -280,6 +364,25 @@ __global__ void blake2b_initial_hash_double(void* out, const void* blockTemplate
 	t[7] = hash[7];
 }
 
+__global__ void blake2b_initial_hash_big(void* out, const void* blockTemplate, uint32_t blockTemplateSize, uint32_t start_nonce, uint32_t nonce_offset)
+{
+	const uint32_t global_index = blockIdx.x * blockDim.x + threadIdx.x;
+	const uint64_t* p = (const uint64_t*)blockTemplate;
+
+	uint64_t hash[8];
+	blake2b_512_process_big_block<64>(hash, p, blockTemplateSize, start_nonce + global_index, nonce_offset);
+
+	uint64_t* t = ((uint64_t*) out) + global_index * 8;
+	t[0] = hash[0];
+	t[1] = hash[1];
+	t[2] = hash[2];
+	t[3] = hash[3];
+	t[4] = hash[4];
+	t[5] = hash[5];
+	t[6] = hash[6];
+	t[7] = hash[7];
+}
+
 template<uint32_t registers_len, uint32_t registers_stride, uint32_t out_len>
 __global__ void blake2b_hash_registers(void *out, const void* in)
 {
diff --git a/src/RandomX/hash.hpp b/src/RandomX/hash.hpp
@@ -32,7 +32,7 @@ __global__ void find_shares(const void* hashes, uint64_t target, uint32_t* share
     }
 }
 
-void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)
+void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)
 {
     if (ctx->inputlen <= 128) {
         CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
@@ -41,8 +41,7 @@ void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, ui
         CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_double << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));
     }
     else {
-        *rescount = 0;
-        return;
+        CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_big << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce, nonce_offset));
     }
 
     CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<RANDOMX_SCRATCHPAD_L3, false, 64><<<batch_size / 32, 32 * 4>>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size));
diff --git a/src/RandomX/yada/configuration.h b/src/RandomX/yada/configuration.h
@@ -32,19 +32,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_ARGON_MEMORY       262144
 
 //Number of Argon2d iterations for Cache initialization.
-#define RANDOMX_ARGON_ITERATIONS   3
+#define RANDOMX_ARGON_ITERATIONS   4
 
 //Number of parallel lanes for Cache initialization.
 #define RANDOMX_ARGON_LANES        1
 
 //Argon2d salt
-#define RANDOMX_ARGON_SALT         "RandomKV\x01"
+#define RANDOMX_ARGON_SALT         "RandomXYadaCoin\x03"
 
 //Number of random Cache accesses per Dataset item. Minimum is 2.
 #define RANDOMX_CACHE_ACCESSES     8
 
 //Target latency for SuperscalarHash (in cycles of the reference CPU).
-#define RANDOMX_SUPERSCALAR_LATENCY   170
+#define RANDOMX_SUPERSCALAR_LATENCY   150
 
 //Dataset base size in bytes. Must be a power of 2.
 #define RANDOMX_DATASET_BASE_SIZE  2147483648
@@ -62,10 +62,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_PROGRAM_COUNT      8
 
 //Scratchpad L3 size in bytes. Must be a power of 2.
-#define RANDOMX_SCRATCHPAD_L3      1048576
+#define RANDOMX_SCRATCHPAD_L3      2097152
 
 //Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3.
-#define RANDOMX_SCRATCHPAD_L2      131072
+#define RANDOMX_SCRATCHPAD_L2      262144
 
 //Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2.
 #define RANDOMX_SCRATCHPAD_L1      16384
diff --git a/src/RandomX/yada/randomx_yada.cu b/src/RandomX/yada/randomx_yada.cu
@@ -26,7 +26,7 @@ along with RandomX CUDA.  If not, see<http://www.gnu.org/licenses/>.
 #include <cstdint>
 
 
-namespace RandomX_Keva {
+namespace RandomX_Yada {
     #include "configuration.h"
     #define fillAes4Rx4 fillAes4Rx4_v104
     #include "RandomX/common.hpp"
diff --git a/src/crypto/common/Algorithm.cpp b/src/crypto/common/Algorithm.cpp
@@ -43,7 +43,7 @@ xmrig_cuda::Algorithm::Id xmrig_cuda::Algorithm::parse(uint32_t id)
         CN_UPX2,
 #       endif
 #       ifdef XMRIG_ALGO_RANDOMX
-        RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_KEVA,
+        RX_0, RX_WOW, RX_ARQ, RX_GRAFT, RX_SFX, RX_YADA,
 #       endif
 #       ifdef XMRIG_ALGO_ARGON2
         AR2_CHUKWA, AR2_CHUKWA_V2, AR2_WRKZ,
diff --git a/src/crypto/common/Algorithm.h b/src/crypto/common/Algorithm.h
@@ -59,7 +59,7 @@ class Algorithm
         RX_ARQ          = 0x72121061,   // "rx/arq"           RandomARQ (Arqma).
         RX_GRAFT        = 0x72151267,   // "rx/graft"         RandomGRAFT (Graft).
         RX_SFX          = 0x72151273,   // "rx/sfx"           RandomSFX (Safex Cash).
-        RX_KEVA         = 0x7214116b,   // "rx/keva"          RandomKEVA (Keva).
+        RX_YADA         = 0x72151279,   // "rx/yada"          RandomYada (YadaCoin).
         AR2_CHUKWA      = 0x61130000,   // "argon2/chukwa"    Argon2id (Chukwa).
         AR2_CHUKWA_V2   = 0x61140000,   // "argon2/chukwav2"  Argon2id (Chukwa v2).
         AR2_WRKZ        = 0x61120000,   // "argon2/wrkz"      Argon2id (WRKZ)
diff --git a/src/cryptonight.h b/src/cryptonight.h
@@ -121,11 +121,11 @@ void cryptonight_extra_cpu_final(nvid_ctx *ctx, uint32_t startNonce, uint64_t ta
 void cuda_extra_cpu_set_data(nvid_ctx *ctx, const void *data, size_t len);
 void randomx_prepare(nvid_ctx *ctx, const void *dataset, size_t dataset_size, uint32_t batch_size);
 
-namespace RandomX_Arqma   { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Monero  { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Keva    { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
-namespace RandomX_Graft   { void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Arqma   { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Monero  { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Wownero { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Graft   { void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size); }
+namespace RandomX_Yada    { void hash(nvid_ctx* ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t* rescount, uint32_t* resnonce, uint32_t batch_size); }
 
 #ifdef XMRIG_ALGO_KAWPOW
 void kawpow_prepare(nvid_ctx *ctx, const void* cache, size_t cache_size, const void* dag_precalc, size_t dag_size, uint32_t height, const uint64_t* dag_sizes);
diff --git a/src/xmrig-cuda.cpp b/src/xmrig-cuda.cpp
@@ -185,23 +185,23 @@ bool rxHash(nvid_ctx *ctx, uint32_t startNonce, uint64_t target, uint32_t *resco
         switch (ctx->algorithm.id()) {
         case Algorithm::RX_0:
         case Algorithm::RX_SFX:
-            RandomX_Monero::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
+            RandomX_Monero::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
             break;
 
         case Algorithm::RX_WOW:
-            RandomX_Wownero::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
+            RandomX_Wownero::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
             break;
 
         case Algorithm::RX_ARQ:
-            RandomX_Arqma::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
+            RandomX_Arqma::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
             break;
 
-        case Algorithm::RX_KEVA:
-            RandomX_Keva::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
+        case Algorithm::RX_YADA:
+            RandomX_Yada::hash(ctx, startNonce, 147, target, rescount, resnonce, ctx->rx_batch_size);
             break;
 
         case Algorithm::RX_GRAFT:
-            RandomX_Graft::hash(ctx, startNonce, target, rescount, resnonce, ctx->rx_batch_size);
+            RandomX_Graft::hash(ctx, startNonce, 39, target, rescount, resnonce, ctx->rx_batch_size);
             break;
 
         default:

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ __global__ void find_shares(const void* hashes, uint64_t target, uint32_t* share`
`32`	`32`	`}`
`33`	`33`	`}`
`34`	`34`
`35`		`-void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)`
	`35`	`+void hash(nvid_ctx *ctx, uint32_t nonce, uint32_t nonce_offset, uint64_t target, uint32_t *rescount, uint32_t *resnonce, uint32_t batch_size)`
`36`	`36`	`{`
`37`	`37`	`if (ctx->inputlen <= 128) {`
`38`	`38`	`CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));`
`@@ -41,8 +41,7 @@ void hash(nvid_ctx *ctx, uint32_t nonce, uint64_t target, uint32_t *rescount, ui`
`41`	`41`	`CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_double << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce));`
`42`	`42`	`}`
`43`	`43`	`else {`
`44`		`- *rescount = 0;`
`45`		`- return;`
	`44`	`+ CUDA_CHECK_KERNEL(ctx->device_id, blake2b_initial_hash_big << <batch_size / 32, 32 >> > (ctx->d_rx_hashes, ctx->d_input, ctx->inputlen, nonce, nonce_offset));`
`46`	`45`	`}`
`47`	`46`
`48`	`47`	`CUDA_CHECK_KERNEL(ctx->device_id, fillAes1Rx4<RANDOMX_SCRATCHPAD_L3, false, 64><<<batch_size / 32, 32 * 4>>>(ctx->d_rx_hashes, ctx->d_long_state, batch_size));`