@@ -205,6 +205,90 @@ __device__ void blake2b_512_process_double_block(uint64_t *out, uint64_t* m, con
205
205
if (out_len > 56 ) out[7 ] = h[7 ] ^ v[7 ] ^ v[15 ];
206
206
}
207
207
208
+ template <uint32_t out_len>
209
+ __device__ void blake2b_512_process_big_block (uint64_t * out, const uint64_t * in, uint32_t in_len, uint32_t nonce, uint32_t nonce_offset)
210
+ {
211
+ uint64_t h[8 ] = { Blake2b_IV::iv0 ^ (0x01010000u | out_len), Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4, Blake2b_IV::iv5, Blake2b_IV::iv6, Blake2b_IV::iv7 };
212
+
213
+ for (uint32_t t = 128 ; t < in_len; t += 128 , in += 16 ) {
214
+ uint64_t m[16 ] = { in[0 ], in[1 ], in[2 ], in[3 ], in[4 ], in[5 ], in[6 ], in[7 ], in[8 ], in[9 ], in[10 ], in[11 ], in[12 ], in[13 ], in[14 ], in[15 ] };
215
+
216
+ const uint32_t k0 = (nonce_offset + 0 ) - (t - 128 );
217
+ const uint32_t k1 = (nonce_offset + 1 ) - (t - 128 );
218
+ const uint32_t k2 = (nonce_offset + 2 ) - (t - 128 );
219
+ const uint32_t k3 = (nonce_offset + 3 ) - (t - 128 );
220
+
221
+ if (k0 < 128 ) m[k0 / 8 ] |= (uint64_t )((nonce >> 0 ) & 255 ) << ((k0 % 8 ) * 8 );
222
+ if (k1 < 128 ) m[k1 / 8 ] |= (uint64_t )((nonce >> 8 ) & 255 ) << ((k1 % 8 ) * 8 );
223
+ if (k2 < 128 ) m[k2 / 8 ] |= (uint64_t )((nonce >> 16 ) & 255 ) << ((k2 % 8 ) * 8 );
224
+ if (k3 < 128 ) m[k3 / 8 ] |= (uint64_t )((nonce >> 24 ) & 255 ) << ((k3 % 8 ) * 8 );
225
+
226
+ uint64_t v[16 ] = { h[0 ], h[1 ], h[2 ], h[3 ], h[4 ], h[5 ], h[6 ], h[7 ], Blake2b_IV::iv0, Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4 ^ t, Blake2b_IV::iv5, Blake2b_IV::iv6, Blake2b_IV::iv7 };
227
+
228
+ BLAKE2B_ROUNDS ();
229
+
230
+ h[0 ] ^= v[0 ] ^ v[8 ];
231
+ h[1 ] ^= v[1 ] ^ v[9 ];
232
+ h[2 ] ^= v[2 ] ^ v[10 ];
233
+ h[3 ] ^= v[3 ] ^ v[11 ];
234
+ h[4 ] ^= v[4 ] ^ v[12 ];
235
+ h[5 ] ^= v[5 ] ^ v[13 ];
236
+ h[6 ] ^= v[6 ] ^ v[14 ];
237
+ h[7 ] ^= v[7 ] ^ v[15 ];
238
+ }
239
+
240
+ uint32_t k = in_len & 127 ;
241
+ if (k == 0 ) k = 128 ;
242
+
243
+ uint64_t m[16 ] = {
244
+ (k > 0 ) ? in[0 ] : 0 ,
245
+ (k > 8 ) ? in[1 ] : 0 ,
246
+ (k > 16 ) ? in[2 ] : 0 ,
247
+ (k > 24 ) ? in[3 ] : 0 ,
248
+ (k > 32 ) ? in[4 ] : 0 ,
249
+ (k > 40 ) ? in[5 ] : 0 ,
250
+ (k > 48 ) ? in[6 ] : 0 ,
251
+ (k > 56 ) ? in[7 ] : 0 ,
252
+ (k > 64 ) ? in[8 ] : 0 ,
253
+ (k > 72 ) ? in[9 ] : 0 ,
254
+ (k > 80 ) ? in[10 ] : 0 ,
255
+ (k > 88 ) ? in[11 ] : 0 ,
256
+ (k > 96 ) ? in[12 ] : 0 ,
257
+ (k > 104 ) ? in[13 ] : 0 ,
258
+ (k > 112 ) ? in[14 ] : 0 ,
259
+ (k > 120 ) ? in[15 ] : 0
260
+ };
261
+
262
+ const uint32_t t = in_len - k;
263
+
264
+ const uint32_t k0 = nonce_offset + 0 - t;
265
+ const uint32_t k1 = nonce_offset + 1 - t;
266
+ const uint32_t k2 = nonce_offset + 2 - t;
267
+ const uint32_t k3 = nonce_offset + 3 - t;
268
+
269
+ if (k0 < k) m[k0 / 8 ] |= (uint64_t )((nonce >> 0 ) & 255 ) << ((k0 % 8 ) * 8 );
270
+ if (k1 < k) m[k1 / 8 ] |= (uint64_t )((nonce >> 8 ) & 255 ) << ((k1 % 8 ) * 8 );
271
+ if (k2 < k) m[k2 / 8 ] |= (uint64_t )((nonce >> 16 ) & 255 ) << ((k2 % 8 ) * 8 );
272
+ if (k3 < k) m[k3 / 8 ] |= (uint64_t )((nonce >> 24 ) & 255 ) << ((k3 % 8 ) * 8 );
273
+
274
+ if (k % 8 ) {
275
+ m[k / 8 ] &= (uint64_t )(-1 ) >> (64 - (k % 8 ) * 8 );
276
+ }
277
+
278
+ uint64_t v[16 ] = { h[0 ], h[1 ], h[2 ], h[3 ], h[4 ], h[5 ], h[6 ], h[7 ], Blake2b_IV::iv0, Blake2b_IV::iv1, Blake2b_IV::iv2, Blake2b_IV::iv3, Blake2b_IV::iv4 ^ in_len, Blake2b_IV::iv5, ~Blake2b_IV::iv6, Blake2b_IV::iv7 };
279
+
280
+ BLAKE2B_ROUNDS ();
281
+
282
+ if (out_len > 0 ) out[0 ] = h[0 ] ^ v[0 ] ^ v[8 ];
283
+ if (out_len > 8 ) out[1 ] = h[1 ] ^ v[1 ] ^ v[9 ];
284
+ if (out_len > 16 ) out[2 ] = h[2 ] ^ v[2 ] ^ v[10 ];
285
+ if (out_len > 24 ) out[3 ] = h[3 ] ^ v[3 ] ^ v[11 ];
286
+ if (out_len > 32 ) out[4 ] = h[4 ] ^ v[4 ] ^ v[12 ];
287
+ if (out_len > 40 ) out[5 ] = h[5 ] ^ v[5 ] ^ v[13 ];
288
+ if (out_len > 48 ) out[6 ] = h[6 ] ^ v[6 ] ^ v[14 ];
289
+ if (out_len > 56 ) out[7 ] = h[7 ] ^ v[7 ] ^ v[15 ];
290
+ }
291
+
208
292
#undef G
209
293
#undef ROUND
210
294
#undef BLAKE2B_ROUNDS
@@ -280,6 +364,25 @@ __global__ void blake2b_initial_hash_double(void* out, const void* blockTemplate
280
364
t[7 ] = hash[7 ];
281
365
}
282
366
367
+ __global__ void blake2b_initial_hash_big (void * out, const void * blockTemplate, uint32_t blockTemplateSize, uint32_t start_nonce, uint32_t nonce_offset)
368
+ {
369
+ const uint32_t global_index = blockIdx.x * blockDim.x + threadIdx.x ;
370
+ const uint64_t * p = (const uint64_t *)blockTemplate;
371
+
372
+ uint64_t hash[8 ];
373
+ blake2b_512_process_big_block<64 >(hash, p, blockTemplateSize, start_nonce + global_index, nonce_offset);
374
+
375
+ uint64_t * t = ((uint64_t *) out) + global_index * 8 ;
376
+ t[0 ] = hash[0 ];
377
+ t[1 ] = hash[1 ];
378
+ t[2 ] = hash[2 ];
379
+ t[3 ] = hash[3 ];
380
+ t[4 ] = hash[4 ];
381
+ t[5 ] = hash[5 ];
382
+ t[6 ] = hash[6 ];
383
+ t[7 ] = hash[7 ];
384
+ }
385
+
283
386
template <uint32_t registers_len, uint32_t registers_stride, uint32_t out_len>
284
387
__global__ void blake2b_hash_registers (void *out, const void * in)
285
388
{
0 commit comments