From a2ae3604d0f96ca8ce6a111bff742d3e98689ada Mon Sep 17 00:00:00 2001 From: Monkins1010 Date: Tue, 17 Dec 2019 20:46:04 +0000 Subject: [PATCH] prefetch --- Makefile.am | 3 +- ccminer.cpp | 9 +- configure.sh | 2 +- equi/equi-stratum.cpp | 9 +- verus/haraka.c | 83 +--- verus/haraka_portable.c | 745 ++++++++++++++----------------- verus/haraka_portable.h | 59 +-- verus/verus_clhash.cpp | 185 ++++---- verus/verus_clhash.h | 146 +++++- verus/verus_clhash_portable.cpp | 763 +++++++++++++++++++++----------- verus/verus_hash.cpp | 699 +++++++++++++++++++++++------ verus/verus_hash.h | 246 +++++++++- verus/verusscan.cpp | 115 +++-- 13 files changed, 1986 insertions(+), 1078 deletions(-) diff --git a/Makefile.am b/Makefile.am index 9a0f0942de..f8c7d66a70 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,7 +22,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ ccminer.cpp pools.cpp util.cpp bench.cpp \ api.cpp hashlog.cpp stats.cpp sysinfos.cpp \ equi/equi-stratum.cpp verus/verusscan.cpp \ - verus/haraka_portable.c verus/verus_clhash_portable.cpp + verus/haraka.c verus/verus_clhash.cpp + if HAVE_WINDOWS diff --git a/ccminer.cpp b/ccminer.cpp index 52db9fa7aa..9eebf27b5d 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1,4 +1,4 @@ -/* +/* * Copyright 2010 Jeff Garzik * Copyright 2012-2014 pooler * Copyright 2014-2017 tpruvot @@ -1758,7 +1758,7 @@ static bool wanna_mine(int thr_id) float temp = gpu_temp(cgpu); if (temp > opt_max_temp) { if (!conditional_state[thr_id] && !opt_quiet) - gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); + gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); state = false; } else if (opt_max_temp > 0. && opt_resume_temp > 0. && conditional_state[thr_id] && temp > opt_resume_temp) { if (!thr_id && opt_debug) @@ -2312,6 +2312,11 @@ static void *miner_thread(void *userdata) work.valid_nonces = 0; + if (abort_flag) + break; // time to leave the mining loop... + + if (work_restart[thr_id].restart) + continue; /* scan nonces for a proof-of-work hash */ switch (opt_algo) { diff --git a/configure.sh b/configure.sh index b23070c515..d7f7fa5f0a 100755 --- a/configure.sh +++ b/configure.sh @@ -2,5 +2,5 @@ extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16" -./configure CXXFLAGS="-O3 $extracflags" +./configure CXXFLAGS="-O2 $extracflags" diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp index 0ab052344a..f8553ff810 100644 --- a/equi/equi-stratum.cpp +++ b/equi/equi-stratum.cpp @@ -129,9 +129,8 @@ bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params) coinb2 = json_string_value(json_array_get(params, p++)); //blank (reserved) stime = json_string_value(json_array_get(params, p++)); nbits = json_string_value(json_array_get(params, p++)); p++; - solution = json_string_value(json_array_get(params, p++)); clean = json_is_true(json_array_get(params, p)); p++; - + solution = json_string_value(json_array_get(params, p)); if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || strlen(prevhash) != 64 || strlen(version) != 8 || strlen(coinb1) != 64 || strlen(coinb2) != 64 || @@ -178,9 +177,9 @@ bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params) hex2bin(sctx->job.nbits, nbits, 4); hex2bin(sctx->job.ntime, stime, 4); - if(solution) - sctx->job.nreward[0] = solution[0]; //just copy the version - else sctx->job.nreward[0] = 0; +// TODO, parse solution hex into nreward for future PBaaS merged mining support + if(solution) sctx->job.nreward[0] = 3; //verushash v2.1 + else sctx->job.nreward[0] = 1; // verushash v2 sctx->job.clean = clean; diff --git a/verus/haraka.c b/verus/haraka.c index 7a6cac8106..f325718872 100644 --- a/verus/haraka.c +++ b/verus/haraka.c @@ -113,31 +113,31 @@ void test_implementations() { } void haraka256(unsigned char *out, const unsigned char *in) { - __m128i s[2], tmp; + __m128i s[2], tmp; - s[0] = LOAD(in); - s[1] = LOAD(in + 16); + s[0] = LOAD(in); + s[1] = LOAD(in + 16); - AES2(s[0], s[1], 0); - MIX2(s[0], s[1]); + AES2(s[0], s[1], 0); + MIX2(s[0], s[1]); - AES2(s[0], s[1], 4); - MIX2(s[0], s[1]); + AES2(s[0], s[1], 4); + MIX2(s[0], s[1]); - AES2(s[0], s[1], 8); - MIX2(s[0], s[1]); + AES2(s[0], s[1], 8); + MIX2(s[0], s[1]); - AES2(s[0], s[1], 12); - MIX2(s[0], s[1]); + AES2(s[0], s[1], 12); + MIX2(s[0], s[1]); - AES2(s[0], s[1], 16); - MIX2(s[0], s[1]); + AES2(s[0], s[1], 16); + MIX2(s[0], s[1]); - s[0] = _mm_xor_si128(s[0], LOAD(in)); - s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); + s[0] = _mm_xor_si128(s[0], LOAD(in)); + s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - STORE(out, s[0]); - STORE(out + 16, s[1]); + STORE(out, s[0]); + STORE(out + 16, s[1]); } void haraka256_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) { @@ -146,20 +146,6 @@ void haraka256_keyed(unsigned char *out, const unsigned char *in, const u128 *rc s[0] = LOAD(in); s[1] = LOAD(in + 16); - AES2(s[0], s[1], 0); - MIX2(s[0], s[1]); - - AES2(s[0], s[1], 4); - MIX2(s[0], s[1]); - - AES2(s[0], s[1], 8); - MIX2(s[0], s[1]); - - AES2(s[0], s[1], 12); - MIX2(s[0], s[1]); - - AES2(s[0], s[1], 16); - MIX2(s[0], s[1]); s[0] = _mm_xor_si128(s[0], LOAD(in)); s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); @@ -178,42 +164,7 @@ void haraka256_4x(unsigned char *out, const unsigned char *in) { s[2][0] = LOAD(in + 64); s[2][1] = LOAD(in + 80); s[3][0] = LOAD(in + 96); - s[3][1] = LOAD(in + 112); - - // Round 1 - AES2_4x(s[0], s[1], s[2], s[3], 0); - - MIX2(s[0][0], s[0][1]); - MIX2(s[1][0], s[1][1]); - MIX2(s[2][0], s[2][1]); - MIX2(s[3][0], s[3][1]); - - // Round 2 - AES2_4x(s[0], s[1], s[2], s[3], 4); - - MIX2(s[0][0], s[0][1]); - MIX2(s[1][0], s[1][1]); - MIX2(s[2][0], s[2][1]); - MIX2(s[3][0], s[3][1]); - - // Round 3 - AES2_4x(s[0], s[1], s[2], s[3], 8); - - MIX2(s[0][0], s[0][1]); - MIX2(s[1][0], s[1][1]); - MIX2(s[2][0], s[2][1]); - MIX2(s[3][0], s[3][1]); - - // Round 4 - AES2_4x(s[0], s[1], s[2], s[3], 12); - - MIX2(s[0][0], s[0][1]); - MIX2(s[1][0], s[1][1]); - MIX2(s[2][0], s[2][1]); - MIX2(s[3][0], s[3][1]); - // Round 5 - AES2_4x(s[0], s[1], s[2], s[3], 16); MIX2(s[0][0], s[0][1]); MIX2(s[1][0], s[1][1]); diff --git a/verus/haraka_portable.c b/verus/haraka_portable.c index 5c437121d5..dde77aec00 100644 --- a/verus/haraka_portable.c +++ b/verus/haraka_portable.c @@ -10,46 +10,46 @@ Plain C implementation of the Haraka256 and Haraka512 permutations. #define HARAKAS_RATE 32 static const unsigned char haraka_rc[40][16] = { - { 0x9d, 0x7b, 0x81, 0x75, 0xf0, 0xfe, 0xc5, 0xb2, 0x0a, 0xc0, 0x20, 0xe6, 0x4c, 0x70, 0x84, 0x06 }, - { 0x17, 0xf7, 0x08, 0x2f, 0xa4, 0x6b, 0x0f, 0x64, 0x6b, 0xa0, 0xf3, 0x88, 0xe1, 0xb4, 0x66, 0x8b }, - { 0x14, 0x91, 0x02, 0x9f, 0x60, 0x9d, 0x02, 0xcf, 0x98, 0x84, 0xf2, 0x53, 0x2d, 0xde, 0x02, 0x34 }, - { 0x79, 0x4f, 0x5b, 0xfd, 0xaf, 0xbc, 0xf3, 0xbb, 0x08, 0x4f, 0x7b, 0x2e, 0xe6, 0xea, 0xd6, 0x0e }, - { 0x44, 0x70, 0x39, 0xbe, 0x1c, 0xcd, 0xee, 0x79, 0x8b, 0x44, 0x72, 0x48, 0xcb, 0xb0, 0xcf, 0xcb }, - { 0x7b, 0x05, 0x8a, 0x2b, 0xed, 0x35, 0x53, 0x8d, 0xb7, 0x32, 0x90, 0x6e, 0xee, 0xcd, 0xea, 0x7e }, - { 0x1b, 0xef, 0x4f, 0xda, 0x61, 0x27, 0x41, 0xe2, 0xd0, 0x7c, 0x2e, 0x5e, 0x43, 0x8f, 0xc2, 0x67 }, - { 0x3b, 0x0b, 0xc7, 0x1f, 0xe2, 0xfd, 0x5f, 0x67, 0x07, 0xcc, 0xca, 0xaf, 0xb0, 0xd9, 0x24, 0x29 }, - { 0xee, 0x65, 0xd4, 0xb9, 0xca, 0x8f, 0xdb, 0xec, 0xe9, 0x7f, 0x86, 0xe6, 0xf1, 0x63, 0x4d, 0xab }, - { 0x33, 0x7e, 0x03, 0xad, 0x4f, 0x40, 0x2a, 0x5b, 0x64, 0xcd, 0xb7, 0xd4, 0x84, 0xbf, 0x30, 0x1c }, - { 0x00, 0x98, 0xf6, 0x8d, 0x2e, 0x8b, 0x02, 0x69, 0xbf, 0x23, 0x17, 0x94, 0xb9, 0x0b, 0xcc, 0xb2 }, - { 0x8a, 0x2d, 0x9d, 0x5c, 0xc8, 0x9e, 0xaa, 0x4a, 0x72, 0x55, 0x6f, 0xde, 0xa6, 0x78, 0x04, 0xfa }, - { 0xd4, 0x9f, 0x12, 0x29, 0x2e, 0x4f, 0xfa, 0x0e, 0x12, 0x2a, 0x77, 0x6b, 0x2b, 0x9f, 0xb4, 0xdf }, - { 0xee, 0x12, 0x6a, 0xbb, 0xae, 0x11, 0xd6, 0x32, 0x36, 0xa2, 0x49, 0xf4, 0x44, 0x03, 0xa1, 0x1e }, - { 0xa6, 0xec, 0xa8, 0x9c, 0xc9, 0x00, 0x96, 0x5f, 0x84, 0x00, 0x05, 0x4b, 0x88, 0x49, 0x04, 0xaf }, - { 0xec, 0x93, 0xe5, 0x27, 0xe3, 0xc7, 0xa2, 0x78, 0x4f, 0x9c, 0x19, 0x9d, 0xd8, 0x5e, 0x02, 0x21 }, - { 0x73, 0x01, 0xd4, 0x82, 0xcd, 0x2e, 0x28, 0xb9, 0xb7, 0xc9, 0x59, 0xa7, 0xf8, 0xaa, 0x3a, 0xbf }, - { 0x6b, 0x7d, 0x30, 0x10, 0xd9, 0xef, 0xf2, 0x37, 0x17, 0xb0, 0x86, 0x61, 0x0d, 0x70, 0x60, 0x62 }, - { 0xc6, 0x9a, 0xfc, 0xf6, 0x53, 0x91, 0xc2, 0x81, 0x43, 0x04, 0x30, 0x21, 0xc2, 0x45, 0xca, 0x5a }, - { 0x3a, 0x94, 0xd1, 0x36, 0xe8, 0x92, 0xaf, 0x2c, 0xbb, 0x68, 0x6b, 0x22, 0x3c, 0x97, 0x23, 0x92 }, - { 0xb4, 0x71, 0x10, 0xe5, 0x58, 0xb9, 0xba, 0x6c, 0xeb, 0x86, 0x58, 0x22, 0x38, 0x92, 0xbf, 0xd3 }, - { 0x8d, 0x12, 0xe1, 0x24, 0xdd, 0xfd, 0x3d, 0x93, 0x77, 0xc6, 0xf0, 0xae, 0xe5, 0x3c, 0x86, 0xdb }, - { 0xb1, 0x12, 0x22, 0xcb, 0xe3, 0x8d, 0xe4, 0x83, 0x9c, 0xa0, 0xeb, 0xff, 0x68, 0x62, 0x60, 0xbb }, - { 0x7d, 0xf7, 0x2b, 0xc7, 0x4e, 0x1a, 0xb9, 0x2d, 0x9c, 0xd1, 0xe4, 0xe2, 0xdc, 0xd3, 0x4b, 0x73 }, - { 0x4e, 0x92, 0xb3, 0x2c, 0xc4, 0x15, 0x14, 0x4b, 0x43, 0x1b, 0x30, 0x61, 0xc3, 0x47, 0xbb, 0x43 }, - { 0x99, 0x68, 0xeb, 0x16, 0xdd, 0x31, 0xb2, 0x03, 0xf6, 0xef, 0x07, 0xe7, 0xa8, 0x75, 0xa7, 0xdb }, - { 0x2c, 0x47, 0xca, 0x7e, 0x02, 0x23, 0x5e, 0x8e, 0x77, 0x59, 0x75, 0x3c, 0x4b, 0x61, 0xf3, 0x6d }, - { 0xf9, 0x17, 0x86, 0xb8, 0xb9, 0xe5, 0x1b, 0x6d, 0x77, 0x7d, 0xde, 0xd6, 0x17, 0x5a, 0xa7, 0xcd }, - { 0x5d, 0xee, 0x46, 0xa9, 0x9d, 0x06, 0x6c, 0x9d, 0xaa, 0xe9, 0xa8, 0x6b, 0xf0, 0x43, 0x6b, 0xec }, - { 0xc1, 0x27, 0xf3, 0x3b, 0x59, 0x11, 0x53, 0xa2, 0x2b, 0x33, 0x57, 0xf9, 0x50, 0x69, 0x1e, 0xcb }, - { 0xd9, 0xd0, 0x0e, 0x60, 0x53, 0x03, 0xed, 0xe4, 0x9c, 0x61, 0xda, 0x00, 0x75, 0x0c, 0xee, 0x2c }, - { 0x50, 0xa3, 0xa4, 0x63, 0xbc, 0xba, 0xbb, 0x80, 0xab, 0x0c, 0xe9, 0x96, 0xa1, 0xa5, 0xb1, 0xf0 }, - { 0x39, 0xca, 0x8d, 0x93, 0x30, 0xde, 0x0d, 0xab, 0x88, 0x29, 0x96, 0x5e, 0x02, 0xb1, 0x3d, 0xae }, - { 0x42, 0xb4, 0x75, 0x2e, 0xa8, 0xf3, 0x14, 0x88, 0x0b, 0xa4, 0x54, 0xd5, 0x38, 0x8f, 0xbb, 0x17 }, - { 0xf6, 0x16, 0x0a, 0x36, 0x79, 0xb7, 0xb6, 0xae, 0xd7, 0x7f, 0x42, 0x5f, 0x5b, 0x8a, 0xbb, 0x34 }, - { 0xde, 0xaf, 0xba, 0xff, 0x18, 0x59, 0xce, 0x43, 0x38, 0x54, 0xe5, 0xcb, 0x41, 0x52, 0xf6, 0x26 }, - { 0x78, 0xc9, 0x9e, 0x83, 0xf7, 0x9c, 0xca, 0xa2, 0x6a, 0x02, 0xf3, 0xb9, 0x54, 0x9a, 0xe9, 0x4c }, - { 0x35, 0x12, 0x90, 0x22, 0x28, 0x6e, 0xc0, 0x40, 0xbe, 0xf7, 0xdf, 0x1b, 0x1a, 0xa5, 0x51, 0xae }, - { 0xcf, 0x59, 0xa6, 0x48, 0x0f, 0xbc, 0x73, 0xc1, 0x2b, 0xd2, 0x7e, 0xba, 0x3c, 0x61, 0xc1, 0xa0 }, - { 0xa1, 0x9d, 0xc5, 0xe9, 0xfd, 0xbd, 0xd6, 0x4a, 0x88, 0x82, 0x28, 0x02, 0x03, 0xcc, 0x6a, 0x75 } + {0x9d, 0x7b, 0x81, 0x75, 0xf0, 0xfe, 0xc5, 0xb2, 0x0a, 0xc0, 0x20, 0xe6, 0x4c, 0x70, 0x84, 0x06}, + {0x17, 0xf7, 0x08, 0x2f, 0xa4, 0x6b, 0x0f, 0x64, 0x6b, 0xa0, 0xf3, 0x88, 0xe1, 0xb4, 0x66, 0x8b}, + {0x14, 0x91, 0x02, 0x9f, 0x60, 0x9d, 0x02, 0xcf, 0x98, 0x84, 0xf2, 0x53, 0x2d, 0xde, 0x02, 0x34}, + {0x79, 0x4f, 0x5b, 0xfd, 0xaf, 0xbc, 0xf3, 0xbb, 0x08, 0x4f, 0x7b, 0x2e, 0xe6, 0xea, 0xd6, 0x0e}, + {0x44, 0x70, 0x39, 0xbe, 0x1c, 0xcd, 0xee, 0x79, 0x8b, 0x44, 0x72, 0x48, 0xcb, 0xb0, 0xcf, 0xcb}, + {0x7b, 0x05, 0x8a, 0x2b, 0xed, 0x35, 0x53, 0x8d, 0xb7, 0x32, 0x90, 0x6e, 0xee, 0xcd, 0xea, 0x7e}, + {0x1b, 0xef, 0x4f, 0xda, 0x61, 0x27, 0x41, 0xe2, 0xd0, 0x7c, 0x2e, 0x5e, 0x43, 0x8f, 0xc2, 0x67}, + {0x3b, 0x0b, 0xc7, 0x1f, 0xe2, 0xfd, 0x5f, 0x67, 0x07, 0xcc, 0xca, 0xaf, 0xb0, 0xd9, 0x24, 0x29}, + {0xee, 0x65, 0xd4, 0xb9, 0xca, 0x8f, 0xdb, 0xec, 0xe9, 0x7f, 0x86, 0xe6, 0xf1, 0x63, 0x4d, 0xab}, + {0x33, 0x7e, 0x03, 0xad, 0x4f, 0x40, 0x2a, 0x5b, 0x64, 0xcd, 0xb7, 0xd4, 0x84, 0xbf, 0x30, 0x1c}, + {0x00, 0x98, 0xf6, 0x8d, 0x2e, 0x8b, 0x02, 0x69, 0xbf, 0x23, 0x17, 0x94, 0xb9, 0x0b, 0xcc, 0xb2}, + {0x8a, 0x2d, 0x9d, 0x5c, 0xc8, 0x9e, 0xaa, 0x4a, 0x72, 0x55, 0x6f, 0xde, 0xa6, 0x78, 0x04, 0xfa}, + {0xd4, 0x9f, 0x12, 0x29, 0x2e, 0x4f, 0xfa, 0x0e, 0x12, 0x2a, 0x77, 0x6b, 0x2b, 0x9f, 0xb4, 0xdf}, + {0xee, 0x12, 0x6a, 0xbb, 0xae, 0x11, 0xd6, 0x32, 0x36, 0xa2, 0x49, 0xf4, 0x44, 0x03, 0xa1, 0x1e}, + {0xa6, 0xec, 0xa8, 0x9c, 0xc9, 0x00, 0x96, 0x5f, 0x84, 0x00, 0x05, 0x4b, 0x88, 0x49, 0x04, 0xaf}, + {0xec, 0x93, 0xe5, 0x27, 0xe3, 0xc7, 0xa2, 0x78, 0x4f, 0x9c, 0x19, 0x9d, 0xd8, 0x5e, 0x02, 0x21}, + {0x73, 0x01, 0xd4, 0x82, 0xcd, 0x2e, 0x28, 0xb9, 0xb7, 0xc9, 0x59, 0xa7, 0xf8, 0xaa, 0x3a, 0xbf}, + {0x6b, 0x7d, 0x30, 0x10, 0xd9, 0xef, 0xf2, 0x37, 0x17, 0xb0, 0x86, 0x61, 0x0d, 0x70, 0x60, 0x62}, + {0xc6, 0x9a, 0xfc, 0xf6, 0x53, 0x91, 0xc2, 0x81, 0x43, 0x04, 0x30, 0x21, 0xc2, 0x45, 0xca, 0x5a}, + {0x3a, 0x94, 0xd1, 0x36, 0xe8, 0x92, 0xaf, 0x2c, 0xbb, 0x68, 0x6b, 0x22, 0x3c, 0x97, 0x23, 0x92}, + {0xb4, 0x71, 0x10, 0xe5, 0x58, 0xb9, 0xba, 0x6c, 0xeb, 0x86, 0x58, 0x22, 0x38, 0x92, 0xbf, 0xd3}, + {0x8d, 0x12, 0xe1, 0x24, 0xdd, 0xfd, 0x3d, 0x93, 0x77, 0xc6, 0xf0, 0xae, 0xe5, 0x3c, 0x86, 0xdb}, + {0xb1, 0x12, 0x22, 0xcb, 0xe3, 0x8d, 0xe4, 0x83, 0x9c, 0xa0, 0xeb, 0xff, 0x68, 0x62, 0x60, 0xbb}, + {0x7d, 0xf7, 0x2b, 0xc7, 0x4e, 0x1a, 0xb9, 0x2d, 0x9c, 0xd1, 0xe4, 0xe2, 0xdc, 0xd3, 0x4b, 0x73}, + {0x4e, 0x92, 0xb3, 0x2c, 0xc4, 0x15, 0x14, 0x4b, 0x43, 0x1b, 0x30, 0x61, 0xc3, 0x47, 0xbb, 0x43}, + {0x99, 0x68, 0xeb, 0x16, 0xdd, 0x31, 0xb2, 0x03, 0xf6, 0xef, 0x07, 0xe7, 0xa8, 0x75, 0xa7, 0xdb}, + {0x2c, 0x47, 0xca, 0x7e, 0x02, 0x23, 0x5e, 0x8e, 0x77, 0x59, 0x75, 0x3c, 0x4b, 0x61, 0xf3, 0x6d}, + {0xf9, 0x17, 0x86, 0xb8, 0xb9, 0xe5, 0x1b, 0x6d, 0x77, 0x7d, 0xde, 0xd6, 0x17, 0x5a, 0xa7, 0xcd}, + {0x5d, 0xee, 0x46, 0xa9, 0x9d, 0x06, 0x6c, 0x9d, 0xaa, 0xe9, 0xa8, 0x6b, 0xf0, 0x43, 0x6b, 0xec}, + {0xc1, 0x27, 0xf3, 0x3b, 0x59, 0x11, 0x53, 0xa2, 0x2b, 0x33, 0x57, 0xf9, 0x50, 0x69, 0x1e, 0xcb}, + {0xd9, 0xd0, 0x0e, 0x60, 0x53, 0x03, 0xed, 0xe4, 0x9c, 0x61, 0xda, 0x00, 0x75, 0x0c, 0xee, 0x2c}, + {0x50, 0xa3, 0xa4, 0x63, 0xbc, 0xba, 0xbb, 0x80, 0xab, 0x0c, 0xe9, 0x96, 0xa1, 0xa5, 0xb1, 0xf0}, + {0x39, 0xca, 0x8d, 0x93, 0x30, 0xde, 0x0d, 0xab, 0x88, 0x29, 0x96, 0x5e, 0x02, 0xb1, 0x3d, 0xae}, + {0x42, 0xb4, 0x75, 0x2e, 0xa8, 0xf3, 0x14, 0x88, 0x0b, 0xa4, 0x54, 0xd5, 0x38, 0x8f, 0xbb, 0x17}, + {0xf6, 0x16, 0x0a, 0x36, 0x79, 0xb7, 0xb6, 0xae, 0xd7, 0x7f, 0x42, 0x5f, 0x5b, 0x8a, 0xbb, 0x34}, + {0xde, 0xaf, 0xba, 0xff, 0x18, 0x59, 0xce, 0x43, 0x38, 0x54, 0xe5, 0xcb, 0x41, 0x52, 0xf6, 0x26}, + {0x78, 0xc9, 0x9e, 0x83, 0xf7, 0x9c, 0xca, 0xa2, 0x6a, 0x02, 0xf3, 0xb9, 0x54, 0x9a, 0xe9, 0x4c}, + {0x35, 0x12, 0x90, 0x22, 0x28, 0x6e, 0xc0, 0x40, 0xbe, 0xf7, 0xdf, 0x1b, 0x1a, 0xa5, 0x51, 0xae}, + {0xcf, 0x59, 0xa6, 0x48, 0x0f, 0xbc, 0x73, 0xc1, 0x2b, 0xd2, 0x7e, 0xba, 0x3c, 0x61, 0xc1, 0xa0}, + {0xa1, 0x9d, 0xc5, 0xe9, 0xfd, 0xbd, 0xd6, 0x4a, 0x88, 0x82, 0x28, 0x02, 0x03, 0xcc, 0x6a, 0x75} }; static unsigned char rc[40][16]; @@ -58,439 +58,374 @@ static unsigned char rc_sseed[40][16]; static const unsigned char sbox[256] = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, -0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, -0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, -0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, -0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, -0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, -0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, -0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, -0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, -0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, -0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, -0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, -0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, -0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, -0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, -0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, -0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, -0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, -0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, -0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; + 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, + 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, + 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, + 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, + 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, + 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, + 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, + 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, + 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, + 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, + 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, + 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, + 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, + 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, + 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, + 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, + 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, + 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; #define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) // Simulate _mm_aesenc_si128 instructions from AESNI -void aesenc(unsigned char *s, const unsigned char *rk) +void aesenc(unsigned char *s, const unsigned char *rk) { - - __m128i tmp1, tmp2, tmp3; - // uint8_t s[16] = { i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7], - // i[8], i[9], i[10], i[11], i[12], i[13], i[14], i[15] }; - - tmp1 = _mm_load_si128(s); - tmp2 = _mm_load_si128(rk); - ((__m128i*)&s[0])[0] = _mm_aesenc_si128(tmp1, tmp2); - -// ((__m128i*)&s[0])[0] = _mm_load_si128(&tmp3); - - - /* unsigned char i, t, u, v[4][4]; - for (i = 0; i < 16; ++i) { - v[((i / 4) + 4 - (i%4) ) % 4][i % 4] = sbox[s[i]]; - } - for (i = 0; i < 4; ++i) { - t = v[i][0]; - u = v[i][0] ^ v[i][1] ^ v[i][2] ^ v[i][3]; - v[i][0] ^= u ^ XT(v[i][0] ^ v[i][1]); - v[i][1] ^= u ^ XT(v[i][1] ^ v[i][2]); - v[i][2] ^= u ^ XT(v[i][2] ^ v[i][3]); - v[i][3] ^= u ^ XT(v[i][3] ^ t); - } - for (i = 0; i < 16; ++i) { - s[i] = v[i / 4][i % 4] ^ rk[i]; - } */ + unsigned char i, t, u, v[4][4]; + for (i = 0; i < 16; ++i) { + v[((i / 4) + 4 - (i%4) ) % 4][i % 4] = sbox[s[i]]; + } + for (i = 0; i < 4; ++i) { + t = v[i][0]; + u = v[i][0] ^ v[i][1] ^ v[i][2] ^ v[i][3]; + v[i][0] ^= u ^ XT(v[i][0] ^ v[i][1]); + v[i][1] ^= u ^ XT(v[i][1] ^ v[i][2]); + v[i][2] ^= u ^ XT(v[i][2] ^ v[i][3]); + v[i][3] ^= u ^ XT(v[i][3] ^ t); + } + for (i = 0; i < 16; ++i) { + s[i] = v[i / 4][i % 4] ^ rk[i]; + } } // Simulate _mm_unpacklo_epi32 -void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) +void unpacklo32(unsigned char *t, unsigned char *a, unsigned char *b) { - unsigned char tmp[16]; - memcpy(tmp, a, 4); - memcpy(tmp + 4, b, 4); - memcpy(tmp + 8, a + 4, 4); - memcpy(tmp + 12, b + 4, 4); - memcpy(t, tmp, 16); + unsigned char tmp[16]; + memcpy(tmp, a, 4); + memcpy(tmp + 4, b, 4); + memcpy(tmp + 8, a + 4, 4); + memcpy(tmp + 12, b + 4, 4); + memcpy(t, tmp, 16); } // Simulate _mm_unpackhi_epi32 -void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b) +void unpackhi32(unsigned char *t, unsigned char *a, unsigned char *b) { - unsigned char tmp[16]; - memcpy(tmp, a + 8, 4); - memcpy(tmp + 4, b + 8, 4); - memcpy(tmp + 8, a + 12, 4); - memcpy(tmp + 12, b + 12, 4); - memcpy(t, tmp, 16); + unsigned char tmp[16]; + memcpy(tmp, a + 8, 4); + memcpy(tmp + 4, b + 8, 4); + memcpy(tmp + 8, a + 12, 4); + memcpy(tmp + 12, b + 12, 4); + memcpy(t, tmp, 16); } void load_constants_port() { - /* Use the standard constants to generate tweaked ones. */ - memcpy(rc, haraka_rc, 40 * 16); + /* Use the standard constants to generate tweaked ones. */ + memcpy(rc, haraka_rc, 40*16); } void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed, - unsigned long long seed_length) + unsigned long long seed_length) { - unsigned char buf[40 * 16]; + unsigned char buf[40*16]; - /* Use the standard constants to generate tweaked ones. */ - memcpy(rc, haraka_rc, 40 * 16); + /* Use the standard constants to generate tweaked ones. */ + memcpy(rc, haraka_rc, 40*16); - /* Constants for sk.seed */ - if (sk_seed != NULL) { - haraka_S(buf, 40 * 16, sk_seed, seed_length); - memcpy(rc_sseed, buf, 40 * 16); - } + /* Constants for sk.seed */ + if (sk_seed != NULL) { + haraka_S(buf, 40*16, sk_seed, seed_length); + memcpy(rc_sseed, buf, 40*16); + } - /* Constants for pk.seed */ - haraka_S(buf, 40 * 16, pk_seed, seed_length); - memcpy(rc, buf, 40 * 16); + /* Constants for pk.seed */ + haraka_S(buf, 40*16, pk_seed, seed_length); + memcpy(rc, buf, 40*16); } -static void haraka_S_absorb(unsigned char *s, - const unsigned char *m, unsigned long long mlen, - unsigned char p) +static void haraka_S_absorb(unsigned char *s, + const unsigned char *m, unsigned long long mlen, + unsigned char p) { - unsigned long long i; - - unsigned char t[2]; - - - - while (mlen >= 32) { - // XOR block to state - for (i = 0; i < 32; ++i) { - s[i] ^= m[i]; - } - haraka512_perm(s, s); - mlen -= 32; - m += 32; - } - - for (i = 0; i < 32; ++i) { - t[i] = 0; - } - for (i = 0; i < mlen; ++i) { - t[i] = m[i]; - } - t[i] = p; - t[32 - 1] |= 128; - for (i = 0; i < 32; ++i) { - s[i] ^= t[i]; - } + unsigned long long i; + + unsigned char t[2]; + + + + while (mlen >= 32) { + // XOR block to state + for (i = 0; i < 32; ++i) { + s[i] ^= m[i]; + } + haraka512_perm(s, s); + mlen -= 32; + m += 32; + } + + for (i = 0; i < 32; ++i) { + t[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t[i] = m[i]; + } + t[i] = p; + t[32 - 1] |= 128; + for (i = 0; i < 32; ++i) { + s[i] ^= t[i]; + } } static void haraka_S_squeezeblocks(unsigned char *h, unsigned long long nblocks, - unsigned char *s, unsigned int r) + unsigned char *s, unsigned int r) { - while (nblocks > 0) { - haraka512_perm(s, s); - memcpy(h, s, HARAKAS_RATE); - h += r; - nblocks--; - } + while (nblocks > 0) { + haraka512_perm(s, s); + memcpy(h, s, HARAKAS_RATE); + h += r; + nblocks--; + } } void haraka_S(unsigned char *out, unsigned long long outlen, - const unsigned char *in, unsigned long long inlen) + const unsigned char *in, unsigned long long inlen) { - unsigned long long i; - unsigned char s[64]; - unsigned char d[32]; - - for (i = 0; i < 64; i++) { - s[i] = 0; - } - haraka_S_absorb(s, in, inlen, 0x1F); - - haraka_S_squeezeblocks(out, outlen / 32, s, 32); - out += (outlen / 32) * 32; - - if (outlen % 32) { - haraka_S_squeezeblocks(d, 1, s, 32); - for (i = 0; i < outlen % 32; i++) { - out[i] = d[i]; - } - } + unsigned long long i; + unsigned char s[64]; + unsigned char d[32]; + + for (i = 0; i < 64; i++) { + s[i] = 0; + } + haraka_S_absorb(s, in, inlen, 0x1F); + + haraka_S_squeezeblocks(out, outlen / 32, s, 32); + out += (outlen / 32) * 32; + + if (outlen % 32) { + haraka_S_squeezeblocks(d, 1, s, 32); + for (i = 0; i < outlen % 32; i++) { + out[i] = d[i]; + } + } } -void haraka512_perm(unsigned char *out, const unsigned char *in) +void haraka512_perm(unsigned char *out, const unsigned char *in) { - int i, j; - - unsigned char s[64], tmp[16]; - - memcpy(s, in, 16); - memcpy(s + 16, in + 16, 16); - memcpy(s + 32, in + 32, 16); - memcpy(s + 48, in + 48, 16); - - for (i = 0; i < 5; ++i) { - // aes round(s) - for (j = 0; j < 2; ++j) { - aesenc(s, rc[4 * 2 * i + 4 * j]); - aesenc(s + 16, rc[4 * 2 * i + 4 * j + 1]); - aesenc(s + 32, rc[4 * 2 * i + 4 * j + 2]); - aesenc(s + 48, rc[4 * 2 * i + 4 * j + 3]); - } - - // mixing - unpacklo32(tmp, s, s + 16); - unpackhi32(s, s, s + 16); - unpacklo32(s + 16, s + 32, s + 48); - unpackhi32(s + 32, s + 32, s + 48); - unpacklo32(s + 48, s, s + 32); - unpackhi32(s, s, s + 32); - unpackhi32(s + 32, s + 16, tmp); - unpacklo32(s + 16, s + 16, tmp); - } - - memcpy(out, s, 64); + int i, j; + + unsigned char s[64], tmp[16]; + + memcpy(s, in, 16); + memcpy(s + 16, in + 16, 16); + memcpy(s + 32, in + 32, 16); + memcpy(s + 48, in + 48, 16); + + for (i = 0; i < 5; ++i) { + // aes round(s) + for (j = 0; j < 2; ++j) { + aesenc(s, rc[4*2*i + 4*j]); + aesenc(s + 16, rc[4*2*i + 4*j + 1]); + aesenc(s + 32, rc[4*2*i + 4*j + 2]); + aesenc(s + 48, rc[4*2*i + 4*j + 3]); + } + + // mixing + unpacklo32(tmp, s, s + 16); + unpackhi32(s, s, s + 16); + unpacklo32(s + 16, s + 32, s + 48); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32(s + 16, s + 16, tmp); + } + + memcpy(out, s, 64); } -void haraka512_perm_keyed(unsigned char *out, const unsigned char *in, const __m128i *rc) +void haraka512_perm_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) { - int i, j; - - unsigned char s[64], tmp[16]; - - memcpy(s, in, 16); - memcpy(s + 16, in + 16, 16); - memcpy(s + 32, in + 32, 16); - memcpy(s + 48, in + 48, 16); - - for (i = 0; i < 5; ++i) { - // aes round(s) - for (j = 0; j < 2; ++j) { - aesenc(s, (const unsigned char *)&rc[4 * 2 * i + 4 * j]); - aesenc(s + 16, (const unsigned char *)&rc[4 * 2 * i + 4 * j + 1]); - aesenc(s + 32, (const unsigned char *)&rc[4 * 2 * i + 4 * j + 2]); - aesenc(s + 48, (const unsigned char *)&rc[4 * 2 * i + 4 * j + 3]); - } - - // mixing - unpacklo32(tmp, s, s + 16); - unpackhi32(s, s, s + 16); - unpacklo32(s + 16, s + 32, s + 48); - unpackhi32(s + 32, s + 32, s + 48); - unpacklo32(s + 48, s, s + 32); - unpackhi32(s, s, s + 32); - unpackhi32(s + 32, s + 16, tmp); - unpacklo32(s + 16, s + 16, tmp); - } - - memcpy(out, s, 64); + int i, j; + + unsigned char s[64], tmp[16]; + + memcpy(s, in, 16); + memcpy(s + 16, in + 16, 16); + memcpy(s + 32, in + 32, 16); + memcpy(s + 48, in + 48, 16); + + for (i = 0; i < 5; ++i) { + // aes round(s) + for (j = 0; j < 2; ++j) { + aesenc(s, (const unsigned char *)&rc[4*2*i + 4*j]); + aesenc(s + 16, (const unsigned char *)&rc[4*2*i + 4*j + 1]); + aesenc(s + 32, (const unsigned char *)&rc[4*2*i + 4*j + 2]); + aesenc(s + 48, (const unsigned char *)&rc[4*2*i + 4*j + 3]); + } + + // mixing + unpacklo32(tmp, s, s + 16); + unpackhi32(s, s, s + 16); + unpacklo32(s + 16, s + 32, s + 48); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32(s + 16, s + 16, tmp); + } + + memcpy(out, s, 64); } void haraka512_port(unsigned char *out, const unsigned char *in) { - int i; + int i; - unsigned char buf[64]; + unsigned char buf[64]; - haraka512_perm(buf, in); - /* Feed-forward */ - for (i = 0; i < 64; i++) { - buf[i] = buf[i] ^ in[i]; - } + haraka512_perm(buf, in); + /* Feed-forward */ + for (i = 0; i < 64; i++) { + buf[i] = buf[i] ^ in[i]; + } - /* Truncated */ - memcpy(out, buf + 8, 8); - memcpy(out + 8, buf + 24, 8); - memcpy(out + 16, buf + 32, 8); - memcpy(out + 24, buf + 48, 8); + /* Truncated */ + memcpy(out, buf + 8, 8); + memcpy(out + 8, buf + 24, 8); + memcpy(out + 16, buf + 32, 8); + memcpy(out + 24, buf + 48, 8); } -#define AES4(s0, s1, s2, s3, rci) \ - s0 = _mm_aesenc_si128(s0, rc[rci]); \ - s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \ - s2 = _mm_aesenc_si128(s2, rc[rci + 2]); \ - s3 = _mm_aesenc_si128(s3, rc[rci + 3]); \ - s0 = _mm_aesenc_si128(s0, rc[rci + 4]); \ - s1 = _mm_aesenc_si128(s1, rc[rci + 5]); \ - s2 = _mm_aesenc_si128(s2, rc[rci + 6]); \ - s3 = _mm_aesenc_si128(s3, rc[rci + 7]); \ - -#define MIX4(s0, s1, s2, s3) \ - tmp = _mm_unpacklo_epi32(s0, s1); \ - s0 = _mm_unpackhi_epi32(s0, s1); \ - s1 = _mm_unpacklo_epi32(s2, s3); \ - s2 = _mm_unpackhi_epi32(s2, s3); \ - s3 = _mm_unpacklo_epi32(s0, s2); \ - s0 = _mm_unpackhi_epi32(s0, s2); \ - s2 = _mm_unpackhi_epi32(s1, tmp); \ - s1 = _mm_unpacklo_epi32(s1, tmp); - -#define MIX4_OPP(s0, s1, s2, s3) \ - tmp = _mm_unpacklo_epi32(s0, s1); \ - s0 = _mm_unpackhi_epi32(s0, s1); \ - s1 = _mm_unpacklo_epi32(s2, s3); \ - s2 = _mm_unpackhi_epi32(s2, s3); \ - s3 = _mm_unpacklo_epi32(s0, s2); \ - s0 = _mm_unpackhi_epi32(s0, s2); \ - s2 = _mm_unpackhi_epi32(s1, tmp); \ - s1 = _mm_unpacklo_epi32(s1, tmp); - -#define MIX4_LAST(s0, s1, s2, s3) \ - tmp = _mm_unpacklo_epi32(s0, s1); \ - s1 = _mm_unpacklo_epi32(s2, s3); \ - s2 = _mm_unpackhi_epi32(s1, tmp); - -#define AES4_LAST(s0, s1, s2, s3, rci) \ - s2 = _mm_aesenc_si128(s2, rc[rci + 2]); \ - s2 = _mm_aesenc_si128(s2, rc[rci + 6]); - -#define LOAD(src) _mm_load_si128((__m128i *)(src)) -#define STORE(dest,src) _mm_storeu_si128((__m128i *)(dest),src) - -void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const __m128i *rc) -{ - __m128i s[4], tmp; - - s[0] = LOAD(in); - s[1] = LOAD(in + 16); - s[2] = LOAD(in + 32); - s[3] = LOAD(in + 48); - - AES4(s[0], s[1], s[2], s[3], 0); - MIX4(s[0], s[1], s[2], s[3]); - AES4(s[0], s[1], s[2], s[3], 8); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 16); - MIX4(s[0], s[1], s[2], s[3]); - - AES4(s[0], s[1], s[2], s[3], 24); - MIX4_LAST(s[0], s[1], s[2], s[3]); +void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) +{ + int i; - AES4_LAST(s[0], s[1], s[2], s[3], 32); + unsigned char buf[64]; + haraka512_perm_keyed(buf, in, rc); + /* Feed-forward */ + for (i = 0; i < 64; i++) { + buf[i] = buf[i] ^ in[i]; + } - // s[0] = _mm_xor_si128(s[0], LOAD(in)); - // s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - // s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); - // s[3] = _mm_xor_si128(s[0], LOAD(in + 48)); - ((uint32_t*)&out[0])[7] = ((uint32_t*)&s[0])[10] ^ ((uint32_t*)&in[52])[0]; + /* Truncated */ + memcpy(out, buf + 8, 8); + memcpy(out + 8, buf + 24, 8); + memcpy(out + 16, buf + 32, 8); + memcpy(out + 24, buf + 48, 8); } -void haraka512_perm_zero(unsigned char *out, const unsigned char *in) +void haraka512_perm_zero(unsigned char *out, const unsigned char *in) { - int i, j; - - unsigned char s[64], tmp[16]; - - memcpy(s, in, 16); - memcpy(s + 16, in + 16, 16); - memcpy(s + 32, in + 32, 16); - memcpy(s + 48, in + 48, 16); - - for (i = 0; i < 5; ++i) { - // aes round(s) - for (j = 0; j < 2; ++j) { - aesenc(s, rc0[4 * 2 * i + 4 * j]); - aesenc(s + 16, rc0[4 * 2 * i + 4 * j + 1]); - aesenc(s + 32, rc0[4 * 2 * i + 4 * j + 2]); - aesenc(s + 48, rc0[4 * 2 * i + 4 * j + 3]); - } - - // mixing - unpacklo32(tmp, s, s + 16); - unpackhi32(s, s, s + 16); - unpacklo32(s + 16, s + 32, s + 48); - unpackhi32(s + 32, s + 32, s + 48); - unpacklo32(s + 48, s, s + 32); - unpackhi32(s, s, s + 32); - unpackhi32(s + 32, s + 16, tmp); - unpacklo32(s + 16, s + 16, tmp); - } - - memcpy(out, s, 64); + int i, j; + + unsigned char s[64], tmp[16]; + + memcpy(s, in, 16); + memcpy(s + 16, in + 16, 16); + memcpy(s + 32, in + 32, 16); + memcpy(s + 48, in + 48, 16); + + for (i = 0; i < 5; ++i) { + // aes round(s) + for (j = 0; j < 2; ++j) { + aesenc(s, rc0[4*2*i + 4*j]); + aesenc(s + 16, rc0[4*2*i + 4*j + 1]); + aesenc(s + 32, rc0[4*2*i + 4*j + 2]); + aesenc(s + 48, rc0[4*2*i + 4*j + 3]); + } + + // mixing + unpacklo32(tmp, s, s + 16); + unpackhi32(s, s, s + 16); + unpacklo32(s + 16, s + 32, s + 48); + unpackhi32(s + 32, s + 32, s + 48); + unpacklo32(s + 48, s, s + 32); + unpackhi32(s, s, s + 32); + unpackhi32(s + 32, s + 16, tmp); + unpacklo32(s + 16, s + 16, tmp); + } + + memcpy(out, s, 64); } void haraka512_port_zero(unsigned char *out, const unsigned char *in) { - int i; + int i; - unsigned char buf[64]; + unsigned char buf[64]; - haraka512_perm_zero(buf, in); - /* Feed-forward */ - for (i = 0; i < 64; i++) { - buf[i] = buf[i] ^ in[i]; - } + haraka512_perm_zero(buf, in); + /* Feed-forward */ + for (i = 0; i < 64; i++) { + buf[i] = buf[i] ^ in[i]; + } - /* Truncated */ - memcpy(out, buf + 8, 8); - memcpy(out + 8, buf + 24, 8); - memcpy(out + 16, buf + 32, 8); - memcpy(out + 24, buf + 48, 8); + /* Truncated */ + memcpy(out, buf + 8, 8); + memcpy(out + 8, buf + 24, 8); + memcpy(out + 16, buf + 32, 8); + memcpy(out + 24, buf + 48, 8); } -void haraka256_port(unsigned char *out, const unsigned char *in) +void haraka256_port(unsigned char *out, const unsigned char *in) { - int i, j; - - unsigned char s[32], tmp[16]; - - memcpy(s, in, 16); - memcpy(s + 16, in + 16, 16); - - for (i = 0; i < 5; ++i) { - // aes round(s) - for (j = 0; j < 2; ++j) { - aesenc(s, rc[2 * 2 * i + 2 * j]); - aesenc(s + 16, rc[2 * 2 * i + 2 * j + 1]); - } - - // mixing - unpacklo32(tmp, s, s + 16); - unpackhi32(s + 16, s, s + 16); - memcpy(s, tmp, 16); - } - - /* Feed-forward */ - for (i = 0; i < 32; i++) { - out[i] = in[i] ^ s[i]; - } + int i, j; + + unsigned char s[32], tmp[16]; + + memcpy(s, in, 16); + memcpy(s + 16, in + 16, 16); + + for (i = 0; i < 5; ++i) { + // aes round(s) + for (j = 0; j < 2; ++j) { + aesenc(s, rc[2*2*i + 2*j]); + aesenc(s + 16, rc[2*2*i + 2*j + 1]); + } + + // mixing + unpacklo32(tmp, s, s + 16); + unpackhi32(s + 16, s, s + 16); + memcpy(s, tmp, 16); + } + + /* Feed-forward */ + for (i = 0; i < 32; i++) { + out[i] = in[i] ^ s[i]; + } } void haraka256_sk(unsigned char *out, const unsigned char *in) { - int i, j; - - unsigned char s[32], tmp[16]; - - memcpy(s, in, 16); - memcpy(s + 16, in + 16, 16); - - for (i = 0; i < 5; ++i) { - // aes round(s) - for (j = 0; j < 2; ++j) { - aesenc(s, rc_sseed[2 * 2 * i + 2 * j]); - aesenc(s + 16, rc_sseed[2 * 2 * i + 2 * j + 1]); - } - - // mixing - unpacklo32(tmp, s, s + 16); - unpackhi32(s + 16, s, s + 16); - memcpy(s, tmp, 16); - } - - /* Feed-forward */ - for (i = 0; i < 32; i++) { - out[i] = in[i] ^ s[i]; - } -} \ No newline at end of file + int i, j; + + unsigned char s[32], tmp[16]; + + memcpy(s, in, 16); + memcpy(s + 16, in + 16, 16); + + for (i = 0; i < 5; ++i) { + // aes round(s) + for (j = 0; j < 2; ++j) { + aesenc(s, rc_sseed[2*2*i + 2*j]); + aesenc(s + 16, rc_sseed[2*2*i + 2*j + 1]); + } + + // mixing + unpacklo32(tmp, s, s + 16); + unpackhi32(s + 16, s, s + 16); + memcpy(s, tmp, 16); + } + + /* Feed-forward */ + for (i = 0; i < 32; i++) { + out[i] = in[i] ^ s[i]; + } +} diff --git a/verus/haraka_portable.h b/verus/haraka_portable.h index 35eaa97947..2ef8a9e4bc 100644 --- a/verus/haraka_portable.h +++ b/verus/haraka_portable.h @@ -2,9 +2,6 @@ #define SPX_HARAKA_H #include "immintrin.h" -//#include "SSE2NEON.h" -//#include "arm_neon.h" -//typedef int32x4_t __m128i; #define NUMROUNDS 5 @@ -13,9 +10,10 @@ typedef unsigned long long u64; #else typedef unsigned long u64; #endif +typedef __m128i u128; + +extern void aesenc(unsigned char *s, const unsigned char *rk); -//extern void aesenc(unsigned char *s, const unsigned char *rk); -void aesenc(unsigned char *s, const unsigned char *rk); #define AES2_EMU(s0, s1, rci) \ aesenc((unsigned char *)&s0, (unsigned char *)&(rc[rci])); \ aesenc((unsigned char *)&s1, (unsigned char *)&(rc[rci + 1])); \ @@ -26,49 +24,41 @@ typedef unsigned int uint32_t; static inline __m128i _mm_unpacklo_epi32_emu(__m128i a, __m128i b) { - uint32_t result[4]; - uint32_t *tmp1 = (uint32_t *)&a, *tmp2 = (uint32_t *)&b; - result[0] = tmp1[0]; - result[1] = tmp2[0]; - result[2] = tmp1[1]; - result[3] = tmp2[1]; - return *(__m128i *)result; + uint32_t result[4]; + uint32_t *tmp1 = (uint32_t *)&a, *tmp2 = (uint32_t *)&b; + result[0] = tmp1[0]; + result[1] = tmp2[0]; + result[2] = tmp1[1]; + result[3] = tmp2[1]; + return *(__m128i *)result; } static inline __m128i _mm_unpackhi_epi32_emu(__m128i a, __m128i b) { - uint32_t result[4]; - uint32_t *tmp1 = (uint32_t *)&a, *tmp2 = (uint32_t *)&b; - result[0] = tmp1[2]; - result[1] = tmp2[2]; - result[2] = tmp1[3]; - result[3] = tmp2[3]; - return *(__m128i *)result; + uint32_t result[4]; + uint32_t *tmp1 = (uint32_t *)&a, *tmp2 = (uint32_t *)&b; + result[0] = tmp1[2]; + result[1] = tmp2[2]; + result[2] = tmp1[3]; + result[3] = tmp2[3]; + return *(__m128i *)result; } #define MIX2_EMU(s0, s1) \ - tmp = _mm_unpacklo_epi32(s0, s1); \ - s1 = _mm_unpackhi_epi32(s0, s1); \ + tmp = _mm_unpacklo_epi32_emu(s0, s1); \ + s1 = _mm_unpackhi_epi32_emu(s0, s1); \ s0 = tmp; -#define MIX2_EMU_REV(s0, s1) \ - temp1 = _mm_unpacklo_epi32(s0, s1); \ - s1 = _mm_unpackhi_epi32(s0, s1); - /* load constants */ - - - - void load_constants_port(); /* Tweak constants with seed */ -void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed, - unsigned long long seed_length); +void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed, + unsigned long long seed_length); /* Haraka Sponge */ void haraka_S(unsigned char *out, unsigned long long outlen, - const unsigned char *in, unsigned long long inlen); + const unsigned char *in, unsigned long long inlen); /* Applies the 512-bit Haraka permutation to in. */ void haraka512_perm(unsigned char *out, const unsigned char *in); @@ -77,7 +67,7 @@ void haraka512_perm(unsigned char *out, const unsigned char *in); void haraka512_port(unsigned char *out, const unsigned char *in); /* Implementation of Haraka-512 */ -void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const __m128i *rc); +void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const u128 *rc); /* Applies the 512-bit Haraka permutation to in, using zero key. */ void haraka512_perm_zero(unsigned char *out, const unsigned char *in); @@ -91,5 +81,4 @@ void haraka256_port(unsigned char *out, const unsigned char *in); /* Implementation of Haraka-256 using sk.seed constants */ void haraka256_sk(unsigned char *out, const unsigned char *in); - -#endif \ No newline at end of file +#endif diff --git a/verus/verus_clhash.cpp b/verus/verus_clhash.cpp index 48a8a6fe32..93b7a6e17b 100644 --- a/verus/verus_clhash.cpp +++ b/verus/verus_clhash.cpp @@ -18,36 +18,18 @@ **/ -#include "verus_hash.h" +#include "verus_clhash.h" -//#include "./boost/thread.hpp" #include #include -#include +//#include //#include "cpu_verushash.hpp" #ifdef _WIN32 #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) #endif -thread_local thread_specific_ptr verusclhasher_key; -thread_local thread_specific_ptr verusclhasher_descr; - -#ifdef _WIN32 -// attempt to workaround horrible mingw/gcc destructor bug on Windows, which passes garbage in the this pointer -// we use the opportunity of control here to clean up all of our tls variables. we could keep a list, but this is a quick hack -thread_specific_ptr::~thread_specific_ptr() { - if (verusclhasher_key.ptr) - { - verusclhasher_key.reset(); - } - if (verusclhasher_descr.ptr) - { - verusclhasher_descr.reset(); - } -} -#endif int __cpuverusoptimized = 0x80; @@ -62,7 +44,7 @@ static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { static inline __m128i precompReduction64_si128( __m128i A) { //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0) - const __m128i C = _mm_cvtsi32_si128(27); + const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01); __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), _mm_srli_si128(Q2,8)); @@ -76,33 +58,27 @@ static inline uint64_t precompReduction64( __m128i A) { } // verus intermediate hash extra -static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict randomsource, const __m128i * __restrict buf, uint64_t keyMask, uint32_t * __restrict fixrand, uint32_t * __restrict fixrandex) +static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, + uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) { - __m128i const *pbuf; + __m128i *pbuf; + __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; // divide key mask by 16 from bytes to __m128i keyMask >>= 4; __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); -#ifdef VERUSHASHDEBUG - printf("[CPU]BUF ito verusclmulithout C++ : "); - for (int i = 0; i < 64; i++) - printf("%02x", ((uint8_t*)buf)[i]); - printf("\n"); - printf("[CPU]KEy ito verusclmulithout C++ : "); - for (int i = 0; i < 64; i++) - printf("%02x", ((uint8_t*)&randomsource[0])[i]); - printf("\n"); - printf("[CPU]ACC ito verusclmulithout C++ : "); - for (int i = 0; i < 16; i++) - printf("%02x", ((uint8_t*)&acc)[i]); - printf("\n"); -#endif + // the random buffer must have at least 32 16 byte dwords after the keymask to work with this // algorithm. we take the value from the last element inside the keyMask + 2, as that will never // be used to xor into the accumulator before it is hashed with other values first +#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) + +#define LIONELK_FETCH_DIST 0 + +#pragma unroll 32 - for (uint64_t i = 0; i < 32; i++) + for (uint64_t i = 0; i < 32; i++) { const uint64_t selector = _mm_cvtsi128_si64(acc); @@ -112,38 +88,17 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict __m128i *prandex = randomsource + ((selector >> 32) & keyMask); // select random start and order of pbuf processing - pbuf = buf + (selector & 3); + pbuf = pbuf_copy + (selector & 3); uint32_t prand_idx = (selector >> 5) & keyMask; uint32_t prandex_idx = (selector >>32) & keyMask; - -#ifdef VERUSHASHDEBUG - uint64_t case_v; - case_v = selector & 0x1cu; - uint64_t egg, nog, salad; - printf("[CPU]*****LOOP[%d]**********\n", i); - egg = selector & 0x03u; - nog = ((selector >> 32) & keyMask); - salad = ((selector >> 5) & keyMask); - printf("[CPU]selector: %llx\n case: %llx selector &3: ", selector, case_v); - printf("%llx \n", egg); - printf("[CPU]((selector >> 32) & keyMask) %d", nog); - printf("[CPU]((selector >> 5) & keyMask) %d", salad); - printf("\nacc : "); - printf("%016llx%016llx", ((uint64_t*)&acc)[0], ((uint64_t*)&acc)[1]); - printf("\n"); - - printf("[CPU]prand : "); - //for (int e = 0; e < 4; e++) - printf("%016llx%016llx", ((uint64_t*)prand)[0], ((uint64_t*)prand)[1]); - printf("\n"); - printf("[CPU]prandex : "); - //for (int e = 0; e < 16; e++) - printf("%016llx%016llx", ((uint64_t*)prandex)[0], ((uint64_t*)prandex)[1]); - printf("\n"); + g_prand[i] = prand[0]; + g_prandex[i] = prandex[0]; + fixrand[i] = prand_idx; + fixrandex[i] = prandex_idx; -#endif switch (selector & 0x1c) { + case 0: { const __m128i temp1 = _mm_load_si128(prandex); @@ -166,6 +121,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prandex, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + break; } case 4: @@ -191,6 +149,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prand, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + break; } case 8: @@ -216,6 +177,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); _mm_store_si128(prandex, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + break; } case 0xc: @@ -258,6 +222,8 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict _mm_store_si128(prandex, tempa2); _mm_store_si128(prand, tempb3); } + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } @@ -289,6 +255,8 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict const __m128i tempa4 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa3); _mm_store_si128(prand, tempa4); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } @@ -305,7 +273,7 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict do { - loop_c = selector & (0x10000000 << rounds); + loop_c = selector & (((uint64_t)0x10000000) << rounds); if (loop_c) { onekey = _mm_load_si128(rc++); @@ -337,24 +305,50 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict const __m128i tempa4 = _mm_load_si128(prandex); _mm_store_si128(prandex, tempa3); _mm_store_si128(prand, tempa4); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } case 0x18: { - const __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - const __m128i temp2 = _mm_load_si128(prand); - const __m128i add1 = _mm_xor_si128(temp1, temp2); - const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - - const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); - const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); - - const __m128i tempb3 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - _mm_store_si128(prand, tempb3); - break; + __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + uint64_t aesroundoffset = 0; + __m128i onekey; + + do + { + if (selector & (((uint64_t)0x10000000) << rounds)) + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + __m128i add1 = _mm_xor_si128(onekey, temp2); + // cannot be zero here, may be negative + int32_t divisor = (uint32_t)selector; + int64_t dividend = _mm_cvtsi128_si64(add1); + __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + __m128i add1 = _mm_xor_si128(onekey, temp2); + __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1); + acc = _mm_xor_si128(clprod2, acc); + } + } while (rounds--); + + __m128i tempa3 = _mm_load_si128(prandex); + __m128i tempa4 = _mm_xor_si128(tempa3, acc); + _mm_store_si128(prandex, tempa4); + _mm_store_si128(prand, onekey); + break; } case 0x1c: { @@ -389,13 +383,15 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); _mm_store_si128(prandex, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); break; } } + - fixrand[i] = prand_idx; - fixrandex[i] = prandex_idx; + } return acc; @@ -403,10 +399,10 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, // returning a 64 bit hash value -uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t * __restrict fixrand, uint32_t * __restrict fixrandex) { - const __m128i lazy = _mm_cvtsi32_si128( 0x00010000); - __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex); - acc = _mm_xor_si128(acc, lazy); +uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex) { + __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex, g_prand, g_prandex); + acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); return precompReduction64(acc); @@ -430,20 +426,15 @@ inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, c MIX4(s[0], s[1], s[2], s[3]); AES4(s[0], s[1], s[2], s[3], 24); + MIX4(s[0], s[1], s[2], s[3]); - //MIX4_LASTBUT1(s[0], s[1], s[2], s[3]); - - // AES4_LAST(s[2], 32); - -// MIX4(s[0], s[1], s[2], s[3]); - - // AES4(s[0], s[1], s[2], s[3], 32); - // MIX4LAST(s[0], s[1], s[2], s[3]); + AES4(s[0], s[1], s[2], s[3], 32); + MIX4(s[0], s[1], s[2], s[3]); - // s[0] = _mm_xor_si128(s[0], LOAD(in)); - // s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); - s[2] = _mm_xor_si128(s[2], LOAD(in + 46)); - // s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); + s[0] = _mm_xor_si128(s[0], LOAD(in)); + s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); + s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); + s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); // TRUNCSTORE(out, s[0], s[1], s[2], s[3]); } diff --git a/verus/verus_clhash.h b/verus/verus_clhash.h index 2b28fae8d2..ea3e94b3a2 100644 --- a/verus/verus_clhash.h +++ b/verus/verus_clhash.h @@ -1,2 +1,146 @@ +/* + * This uses veriations of the clhash algorithm for Verus Coin, licensed + * with the Apache-2.0 open source license. + * + * Copyright (c) 2018 Michael Toutonghi + * Distributed under the Apache 2.0 software license, available in the original form for clhash + * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a + * + * CLHash is a very fast hashing function that uses the + * carry-less multiplication and SSE instructions. + * + * Original CLHash code (C) 2017, 2018 Daniel Lemire and Owen Kaser + * Faster 64-bit universal hashing + * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) + * + * Best used on recent x64 processors (Haswell or better). + * + **/ -uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex); +#ifndef INCLUDE_VERUS_CLHASH_H +#define INCLUDE_VERUS_CLHASH_H + + +//#include + +#ifndef _WIN32 +#include +#else +#include +#endif // !WIN32 + + +#include +#include +#include +#include +//#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) + + typedef unsigned char u_char; + +typedef unsigned char u_char; + +#endif +#include "haraka.h" +#include "haraka_portable.h" +enum { + // Verus Key size must include the equivalent size of a Haraka key + // after the first part. + // Any excess over a power of 2 will not get mutated, and any excess over + // power of 2 + Haraka sized key will not be used + VERUSKEYSIZE = 1024 * 8 + (40 * 16), + VERUSHHASH_SOLUTION_VERSION = 1 +}; + + + +extern int __cpuverusoptimized; + +inline bool IsCPUVerusOptimized() +{ + +#ifndef _WIN32 + unsigned int eax, ebx, ecx, edx; + + if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) + { + return false; + } + return ((ecx & (bit_AVX | bit_AES)) == (bit_AVX | bit_AES)); +#else + + // https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/cpuid.h +#define bit_AVX (1 << 28) +#define bit_AES (1 << 25) + // https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/ + // bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false; + + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + return ((cpuInfo[2] & (bit_AVX | bit_AES)) == (bit_AVX | bit_AES)); + +#endif + + + if (__cpuverusoptimized & 0x80) + { +#ifdef _WIN32 + #define bit_AVX (1 << 28) + #define bit_AES (1 << 25) + #define bit_PCLMUL (1 << 1) + // https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/ + // bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false; + + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + __cpuverusoptimized = ((cpuInfo[2] & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL)); +#else + unsigned int eax,ebx,ecx,edx; + + if (!__get_cpuid(1,&eax,&ebx,&ecx,&edx)) + { + __cpuverusoptimized = false; + } + else + { + __cpuverusoptimized = ((ecx & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL)); + } +#endif //WIN32 + } + return __cpuverusoptimized; + +}; + +inline void ForceCPUVerusOptimized(bool trueorfalse) +{ + __cpuverusoptimized = trueorfalse; +}; + +uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex); +uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex); + +void *alloc_aligned_buffer(uint64_t bufSize); + +#ifdef __cplusplus +} // extern "C" +#endif + +#ifdef __cplusplus + +#include +#include + +// special high speed hasher for VerusHash 2.0 + +#endif // #ifdef __cplusplus + +#endif // INCLUDE_VERUS_CLHASH_H diff --git a/verus/verus_clhash_portable.cpp b/verus/verus_clhash_portable.cpp index 06a402de85..7738cf0c9c 100644 --- a/verus/verus_clhash_portable.cpp +++ b/verus/verus_clhash_portable.cpp @@ -1,40 +1,29 @@ /* -* This uses veriations of the clhash algorithm for Verus Coin, licensed -* with the Apache-2.0 open source license. -* -* Copyright (c) 2018 Michael Toutonghi -* Distributed under the Apache 2.0 software license, available in the original form for clhash -* here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a -* -* Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser -* Faster 64-bit universal hashing -* using carry-less multiplications, Journal of Cryptographic Engineering (to appear) -* -* Best used on recent x64 processors (Haswell or better). -* -* This implements an intermediate step in the last part of a Verus block hash. The intent of this step -* is to more effectively equalize FPGAs over GPUs and CPUs. -* -**/ - - -#include "haraka_portable.h" -#include "stdint.h" + * This uses veriations of the clhash algorithm for Verus Coin, licensed + * with the Apache-2.0 open source license. + * + * Copyright (c) 2018 Michael Toutonghi + * Distributed under the Apache 2.0 software license, available in the original form for clhash + * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a + * + * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser + * Faster 64-bit universal hashing + * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) + * + * Best used on recent x64 processors (Haswell or better). + * + * This implements an intermediate step in the last part of a Verus block hash. The intent of this step + * is to more effectively equalize FPGAs over GPUs and CPUs. + * + **/ + + +#include "verus_hash.h" + #include #include -#include -#include "miner.h" - -#if defined(__GNUC__) || defined(__clang__) -# pragma push_macro("FORCE_INLINE") -# pragma push_macro("ALIGN_STRUCT") -# define FORCE_INLINE static inline __attribute__((always_inline)) -# define ALIGN_STRUCT(x) __attribute__((aligned(x))) -#else -# define FORCE_INLINE static inline -# define ALIGN_STRUCT(x) __declspec(align(x)) -#endif +//#include #ifdef __APPLE__ @@ -43,70 +32,325 @@ #ifdef _WIN32 #pragma warning (disable : 4146) -#include +#include #else #include -//#include "arm_neon.h" +#endif //WIN32 -//# include "SSE2NEON.h" -//#include "softaesnc.h" -//typedef int32x4_t __m128i; +void clmul64(uint64_t a, uint64_t b, uint64_t* r) +{ + uint8_t s = 4, i; //window size + uint64_t two_s = 1 << s; //2^s + uint64_t smask = two_s - 1; //s 1 bits + uint64_t u[16]; + uint64_t tmp; + uint64_t ifmask; + //Precomputation + u[0] = 0; + u[1] = b; + for (i = 2; i < two_s; i += 2) { + u[i] = u[i >> 1] << 1; //even indices: left shift + u[i + 1] = u[i] ^ b; //odd indices: xor b + } + //Multiply + r[0] = u[a & smask]; //first window only affects lower word + r[1] = 0; + for (i = s; i < 64; i += s) { + tmp = u[a >> i & smask]; + r[0] ^= tmp << i; + r[1] ^= tmp >> (64 - i); + } + //Repair + uint64_t m = 0xEEEEEEEEEEEEEEEE; //s=4 => 16 times 1110 + for (i = 1; i < s; i++) { + tmp = ((a & m) >> i); + m &= m << 1; //shift mask to exclude all bit j': j' mod s = i + ifmask = -((b >> (64 - i)) & 1); //if the (64-i)th bit of b is 1 + r[1] ^= (tmp & ifmask); + } +} -#endif //WIN32 +u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm) +{ + uint64_t result[2]; + clmul64(*((uint64_t*)&a + (imm & 1)), *((uint64_t*)&b + ((imm & 0x10) >> 4)), result); + + /* + // TEST + const __m128i tmp1 = _mm_load_si128(&a); + const __m128i tmp2 = _mm_load_si128(&b); + imm = imm & 0x11; + const __m128i testresult = (imm == 0x10) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x10) : ((imm == 0x01) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x01) : ((imm == 0x00) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x00) : _mm_clmulepi64_si128(tmp1, tmp2, 0x11))); + if (!memcmp(&testresult, &result, 16)) + { + printf("_mm_clmulepi64_si128_emu: Portable version passed!\n"); + } + else + { + printf("_mm_clmulepi64_si128_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, imm: %x, emu: %lxh %lxl, intrin: %lxh %lxl\n", + *((uint64_t *)&a + 1), *(uint64_t *)&a, + *((uint64_t *)&b + 1), *(uint64_t *)&b, + imm, + *((uint64_t *)result + 1), *(uint64_t *)result, + *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult); + return testresult; + } + */ + + return *(__m128i *)result; +} + +u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b) +{ + int16_t result[8]; + int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b; + for (int i = 0; i < 8; i++) + { + result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15); + } + + /* + const __m128i testresult = _mm_mulhrs_epi16(_a, _b); + if (!memcmp(&testresult, &result, 16)) + { + printf("_mm_mulhrs_epi16_emu: Portable version passed!\n"); + } + else + { + printf("_mm_mulhrs_epi16_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, emu: %lxh %lxl, intrin: %lxh %lxl\n", + *((uint64_t *)&a + 1), *(uint64_t *)&a, + *((uint64_t *)&b + 1), *(uint64_t *)&b, + *((uint64_t *)result + 1), *(uint64_t *)result, + *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult); + } + */ + + return *(__m128i *)result; +} + +inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo) +{ + __m128i result; + ((uint64_t *)&result)[0] = lo; + ((uint64_t *)&result)[1] = hi; + return result; +} + +inline u128 _mm_cvtsi64_si128_emu(uint64_t lo) +{ + __m128i result; + ((uint64_t *)&result)[0] = lo; + ((uint64_t *)&result)[1] = 0; + return result; +} -#define AES2(s0, s1, rci) \ - s0 = _mm_aesenc_si128(s0, rc[rci]); \ - s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \ - s0 = _mm_aesenc_si128(s0, rc[rci + 2]); \ - s1 = _mm_aesenc_si128(s1, rc[rci + 3]); +inline int64_t _mm_cvtsi128_si64_emu(__m128i &a) +{ + return *(int64_t *)&a; +} + +inline int32_t _mm_cvtsi128_si32_emu(__m128i &a) +{ + return *(int32_t *)&a; +} + +inline u128 _mm_cvtsi32_si128_emu(uint32_t lo) +{ + __m128i result; + ((uint32_t *)&result)[0] = lo; + ((uint32_t *)&result)[1] = 0; + ((uint64_t *)&result)[1] = 0; + + /* + const __m128i testresult = _mm_cvtsi32_si128(lo); + if (!memcmp(&testresult, &result, 16)) + { + printf("_mm_cvtsi32_si128_emu: Portable version passed!\n"); + } + else + { + printf("_mm_cvtsi32_si128_emu: Portable version failed!\n"); + } + */ + + return result; +} + +u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15) +{ + __m128i result; + ((uint8_t *)&result)[0] = c0; + ((uint8_t *)&result)[1] = c1; + ((uint8_t *)&result)[2] = c2; + ((uint8_t *)&result)[3] = c3; + ((uint8_t *)&result)[4] = c4; + ((uint8_t *)&result)[5] = c5; + ((uint8_t *)&result)[6] = c6; + ((uint8_t *)&result)[7] = c7; + ((uint8_t *)&result)[8] = c8; + ((uint8_t *)&result)[9] = c9; + ((uint8_t *)&result)[10] = c10; + ((uint8_t *)&result)[11] = c11; + ((uint8_t *)&result)[12] = c12; + ((uint8_t *)&result)[13] = c13; + ((uint8_t *)&result)[14] = c14; + ((uint8_t *)&result)[15] = c15; + + /* + const __m128i testresult = _mm_setr_epi8(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15); + if (!memcmp(&testresult, &result, 16)) + { + printf("_mm_setr_epi8_emu: Portable version passed!\n"); + } + else + { + printf("_mm_setr_epi8_emu: Portable version failed!\n"); + } + */ + + return result; +} + +inline __m128i _mm_srli_si128_emu(__m128i a, int imm8) +{ + unsigned char result[16]; + uint8_t shift = imm8 & 0xff; + if (shift > 15) shift = 16; + + int i; + for (i = 0; i < (16 - shift); i++) + { + result[i] = ((unsigned char *)&a)[shift + i]; + } + for (; i < 16; i++) + { + result[i] = 0; + } + + /* + const __m128i tmp1 = _mm_load_si128(&a); + __m128i testresult = _mm_srli_si128(tmp1, imm8); + if (!memcmp(&testresult, result, 16)) + { + printf("_mm_srli_si128_emu: Portable version passed!\n"); + } + else + { + printf("_mm_srli_si128_emu: Portable version failed! val: %lx%lx imm: %x emu: %lx%lx, intrin: %lx%lx\n", + *((uint64_t *)&a + 1), *(uint64_t *)&a, + imm8, + *((uint64_t *)result + 1), *(uint64_t *)result, + *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult); + } + */ + + return *(__m128i *)result; +} + +inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b) +{ +#ifdef _WIN32 + uint64_t result[2]; + result[0] = *(uint64_t *)&a ^ *(uint64_t *)&b; + result[1] = *((uint64_t *)&a + 1) ^ *((uint64_t *)&b + 1); + return *(__m128i *)result; +#else + return a ^ b; +#endif +} + +inline __m128i _mm_load_si128_emu(const void *p) +{ + return *(__m128i *)p; +} + +inline void _mm_store_si128_emu(void *p, __m128i val) +{ + *(__m128i *)p = val; +} + +__m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b) +{ + __m128i result; + for (int i = 0; i < 16; i++) + { + if (((uint8_t *)&b)[i] & 0x80) + { + ((uint8_t *)&result)[i] = 0; + } + else + { + ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf]; + } + } + + /* + const __m128i tmp1 = _mm_load_si128(&a); + const __m128i tmp2 = _mm_load_si128(&b); + __m128i testresult = _mm_shuffle_epi8(tmp1, tmp2); + if (!memcmp(&testresult, &result, 16)) + { + printf("_mm_shuffle_epi8_emu: Portable version passed!\n"); + } + else + { + printf("_mm_shuffle_epi8_emu: Portable version failed!\n"); + } + */ + + return result; +} // portable -__m128i lazyLengthHash_port (uint64_t keylength, uint64_t length) { - const __m128i lengthvector = _mm_set_epi64x(keylength, length); - const __m128i clprod1 = _mm_clmulepi64_si128(lengthvector, lengthvector, 0x10); +static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) { + const __m128i lengthvector = _mm_set_epi64x_emu(keylength, length); + const __m128i clprod1 = _mm_clmulepi64_si128_emu(lengthvector, lengthvector, 0x10); return clprod1; } // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 -__m128i precompReduction64_si128_port(__m128i A) { +static inline __m128i precompReduction64_si128_port(__m128i A) { //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0) - const __m128i C = _mm_cvtsi64_si128((1U << 4) + (1U << 3) + (1U << 1) + (1U << 0)); - __m128i Q2 = _mm_clmulepi64_si128(A, C, 0x01); - __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), - _mm_srli_si128(Q2, 8)); - __m128i Q4 = _mm_xor_si128(Q2, A); - const __m128i final = _mm_xor_si128(Q3, Q4); + const __m128i C = _mm_cvtsi64_si128_emu((1U << 4) + (1U << 3) + (1U << 1) + (1U << 0)); + __m128i Q2 = _mm_clmulepi64_si128_emu(A, C, 0x01); + __m128i Q3 = _mm_shuffle_epi8_emu(_mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), + _mm_srli_si128_emu(Q2, 8)); + __m128i Q4 = _mm_xor_si128_emu(Q2, A); + const __m128i final = _mm_xor_si128_emu(Q3, Q4); return final;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE } -uint64_t precompReduction64_port(__m128i A) { +static inline uint64_t precompReduction64_port(__m128i A) { __m128i tmp = precompReduction64_si128_port(A); - return _mm_cvtsi128_si64(tmp); + return _mm_cvtsi128_si64_emu(tmp); } - - - // verus intermediate hash extra -__inline __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex) +static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask) { - __m128i *pbuf; + __m128i const *pbuf; + + /* + std::cout << "Random key start: "; + std::cout << LEToHex(*randomsource) << ", "; + std::cout << LEToHex(*(randomsource + 1)); + std::cout << std::endl; + */ - __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; // divide key mask by 16 from bytes to __m128i keyMask >>= 4; // the random buffer must have at least 32 16 byte dwords after the keymask to work with this // algorithm. we take the value from the last element inside the keyMask + 2, as that will never // be used to xor into the accumulator before it is hashed with other values first - __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); + __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2)); for (int64_t i = 0; i < 32; i++) { //std::cout << "LOOP " << i << " acc: " << LEToHex(acc) << std::endl; - uint64_t selector = _mm_cvtsi128_si64(acc); + const uint64_t selector = _mm_cvtsi128_si64_emu(acc); // get two random locations in the key, which will be mutated and swapped __m128i *prand = randomsource + ((selector >> 5) & keyMask); @@ -115,281 +359,290 @@ __inline __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand // select random start and order of pbuf processing - pbuf = pbuf_copy + (selector & 3); - uint32_t prand_idx = (selector >> 5) & keyMask; - uint32_t prandex_idx = (selector >> 32) & keyMask; + pbuf = buf + (selector & 3); - // printf("[i]=%d \t acc = %08x, prand_idx = %d\t, prandex_idx = %d\t selector = %d prand %08x, prandex %08x\n", i, _mm_cvtsi128_si64(acc), prand_idx, prandex_idx, (selector & 0x1c)>>2, _mm_cvtsi128_si64(prand[0]), _mm_cvtsi128_si64(prandex[0])); - //printf("pbuf %08x%08x%08x%08x\n", _mm_cvtsi128_si64(buf[0]), _mm_cvtsi128_si64(buf[1]), _mm_cvtsi128_si64(buf[2]), _mm_cvtsi128_si64(buf[3])); switch (selector & 0x1c) { case 0: { - __m128i temp1 = _mm_load_si128(prandex); - __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i add1 = _mm_xor_si128(temp1, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - - __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - __m128i temp12 = _mm_load_si128(prand); - _mm_store_si128(prand, tempa2); - - __m128i temp22 = _mm_load_si128(pbuf); - __m128i add12 = _mm_xor_si128(temp12, temp22); - __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - - __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prandex, tempb2); + const __m128i temp1 = _mm_load_si128_emu(prandex); + const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); + acc = _mm_xor_si128_emu(clprod1, acc); + + /* + std::cout << "temp1: " << LEToHex(temp1) << std::endl; + std::cout << "temp2: " << LEToHex(temp2) << std::endl; + std::cout << "add1: " << LEToHex(add1) << std::endl; + std::cout << "clprod1: " << LEToHex(clprod1) << std::endl; + std::cout << "acc: " << LEToHex(acc) << std::endl; + */ + + const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); + const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128_emu(prand); + _mm_store_si128_emu(prand, tempa2); + + const __m128i temp22 = _mm_load_si128_emu(pbuf); + const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); + acc = _mm_xor_si128_emu(clprod12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); + const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); + _mm_store_si128_emu(prandex, tempb2); break; } case 4: { - __m128i temp1 = _mm_load_si128(prand); - __m128i temp2 = _mm_load_si128(pbuf); - __m128i add1 = _mm_xor_si128(temp1, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); - __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); - acc = _mm_xor_si128(clprod2, acc); - - __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - __m128i temp12 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - - __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i add12 = _mm_xor_si128(temp12, temp22); - acc = _mm_xor_si128(add12, acc); - - __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prand, tempb2); + const __m128i temp1 = _mm_load_si128_emu(prand); + const __m128i temp2 = _mm_load_si128_emu(pbuf); + const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); + acc = _mm_xor_si128_emu(clprod1, acc); + const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10); + acc = _mm_xor_si128_emu(clprod2, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); + const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128_emu(prandex); + _mm_store_si128_emu(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); + acc = _mm_xor_si128_emu(add12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); + const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); + _mm_store_si128_emu(prand, tempb2); break; } case 8: { - __m128i temp1 = _mm_load_si128(prandex); - __m128i temp2 = _mm_load_si128(pbuf); - __m128i add1 = _mm_xor_si128(temp1, temp2); - acc = _mm_xor_si128(add1, acc); - - __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - __m128i tempa2 = _mm_xor_si128(tempa1, temp1); - - __m128i temp12 = _mm_load_si128(prand); - _mm_store_si128(prand, tempa2); - - __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i add12 = _mm_xor_si128(temp12, temp22); - __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); - acc = _mm_xor_si128(clprod22, acc); - - __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prandex, tempb2); + const __m128i temp1 = _mm_load_si128_emu(prandex); + const __m128i temp2 = _mm_load_si128_emu(pbuf); + const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); + acc = _mm_xor_si128_emu(add1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); + const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128_emu(prand); + _mm_store_si128_emu(prand, tempa2); + + const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); + acc = _mm_xor_si128_emu(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10); + acc = _mm_xor_si128_emu(clprod22, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); + const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); + _mm_store_si128_emu(prandex, tempb2); break; } case 0xc: { - __m128i temp1 = _mm_load_si128(prand); - __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i temp1 = _mm_load_si128_emu(prand); + const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); // cannot be zero here - int32_t divisor = (uint32_t)selector; + const int32_t divisor = (uint32_t)selector; - acc = _mm_xor_si128(add1, acc); + acc = _mm_xor_si128_emu(add1, acc); - int64_t dividend = _mm_cvtsi128_si64(acc); - __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); - acc = _mm_xor_si128(modulo, acc); + const int64_t dividend = _mm_cvtsi128_si64_emu(acc); + const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor); + acc = _mm_xor_si128_emu(modulo, acc); - __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); - __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1); + const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1); if (dividend & 1) { - __m128i temp12 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - - __m128i temp22 = _mm_load_si128(pbuf); - __m128i add12 = _mm_xor_si128(temp12, temp22); - __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); - acc = _mm_xor_si128(clprod12, acc); - __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); - acc = _mm_xor_si128(clprod22, acc); - - __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); - __m128i tempb2 = _mm_xor_si128(tempb1, temp12); - _mm_store_si128(prand, tempb2); + const __m128i temp12 = _mm_load_si128_emu(prandex); + _mm_store_si128_emu(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128_emu(pbuf); + const __m128i add12 = _mm_xor_si128_emu(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); + acc = _mm_xor_si128_emu(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10); + acc = _mm_xor_si128_emu(clprod22, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12); + const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12); + _mm_store_si128_emu(prand, tempb2); } else { - __m128i tempb3 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa2); - _mm_store_si128(prand, tempb3); + const __m128i tempb3 = _mm_load_si128_emu(prandex); + _mm_store_si128_emu(prandex, tempa2); + _mm_store_si128_emu(prand, tempb3); } break; } case 0x10: { // a few AES operations - __m128i *rc = prand; + const __m128i *rc = prand; __m128i tmp; - __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); - __m128i temp2 = _mm_load_si128(pbuf); + __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + __m128i temp2 = _mm_load_si128_emu(pbuf); - AES2(temp1, temp2, 0); + AES2_EMU(temp1, temp2, 0); MIX2_EMU(temp1, temp2); - AES2(temp1, temp2, 4); + AES2_EMU(temp1, temp2, 4); MIX2_EMU(temp1, temp2); - AES2(temp1, temp2, 8); + AES2_EMU(temp1, temp2, 8); MIX2_EMU(temp1, temp2); - acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); + acc = _mm_xor_si128_emu(temp1, acc); + acc = _mm_xor_si128_emu(temp2, acc); - __m128i tempa1 = _mm_load_si128(prand); - __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + const __m128i tempa1 = _mm_load_si128_emu(prand); + const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2); - __m128i tempa4 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa3); - _mm_store_si128(prand, tempa4); + const __m128i tempa4 = _mm_load_si128_emu(prandex); + _mm_store_si128_emu(prandex, tempa3); + _mm_store_si128_emu(prand, tempa4); break; } case 0x14: { - // we'll just call this one the monkins loop, inspired by Chris - modified to cast to uint64_t on shift for more variability in the loop - __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + // we'll just call this one the monkins loop, inspired by Chris + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); __m128i tmp; // used by MIX2 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times __m128i *rc = prand; - uint64_t aesroundoffset = 0; + uint64_t aesround = 0; __m128i onekey; do { - if (selector & (((uint64_t)0x10000000) << rounds)) + //std::cout << "acc: " << LEToHex(acc) << ", round check: " << LEToHex((selector & (0x10000000 << rounds))) << std::endl; + + // note that due to compiler and CPUs, we expect this to do: + // if (selector & ((0x10000000 << rounds) & 0xffffffff) if rounds != 3 else selector & 0xffffffff80000000): + if (selector & (0x10000000 << rounds)) { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); - __m128i add1 = _mm_xor_si128(onekey, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); + onekey = _mm_load_si128_emu(rc++); + const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp); + const __m128i add1 = _mm_xor_si128_emu(onekey, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); + acc = _mm_xor_si128_emu(clprod1, acc); } else { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); - AES2(onekey, temp2, aesroundoffset); - aesroundoffset += 4; + onekey = _mm_load_si128_emu(rc++); + __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf); + const uint64_t roundidx = aesround++ << 2; + AES2_EMU(onekey, temp2, roundidx); + + /* + std::cout << " onekey1: " << LEToHex(onekey) << std::endl; + std::cout << " temp21: " << LEToHex(temp2) << std::endl; + std::cout << "roundkey: " << LEToHex(rc[roundidx]) << std::endl; + + aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx])); + + std::cout << "onekey2: " << LEToHex(onekey) << std::endl; + std::cout << "roundkey: " << LEToHex(rc[roundidx + 1]) << std::endl; + + aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 1])); + + std::cout << " temp22: " << LEToHex(temp2) << std::endl; + std::cout << "roundkey: " << LEToHex(rc[roundidx + 2]) << std::endl; + + aesenc((unsigned char *)&onekey, (unsigned char *)&(rc[roundidx + 2])); + + std::cout << "onekey2: " << LEToHex(onekey) << std::endl; + + aesenc((unsigned char *)&temp2, (unsigned char *)&(rc[roundidx + 3])); + + std::cout << " temp22: " << LEToHex(temp2) << std::endl; + */ + MIX2_EMU(onekey, temp2); - acc = _mm_xor_si128(onekey, acc); - acc = _mm_xor_si128(temp2, acc); + + /* + std::cout << "onekey3: " << LEToHex(onekey) << std::endl; + */ + + acc = _mm_xor_si128_emu(onekey, acc); + acc = _mm_xor_si128_emu(temp2, acc); } } while (rounds--); - __m128i tempa1 = _mm_load_si128(prand); - __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); - __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + const __m128i tempa1 = _mm_load_si128_emu(prand); + const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2); - __m128i tempa4 = _mm_load_si128(prandex); - _mm_store_si128(prandex, tempa3); - _mm_store_si128(prand, tempa4); + const __m128i tempa4 = _mm_load_si128_emu(prandex); + _mm_store_si128_emu(prandex, tempa3); + _mm_store_si128_emu(prand, tempa4); break; } case 0x18: { - __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); - __m128i tmp; // used by MIX2 - - uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times - __m128i *rc = prand; - uint64_t aesroundoffset = 0; - __m128i onekey; - - do - { - if (selector & (((uint64_t)0x10000000) << rounds)) - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); - __m128i add1 = _mm_xor_si128(onekey, temp2); - // cannot be zero here, may be negative - int32_t divisor = (uint32_t)selector; - int64_t dividend = _mm_cvtsi128_si64(add1); - __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); - acc = _mm_xor_si128(modulo, acc); - } - else - { - onekey = _mm_load_si128(rc++); - __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); - __m128i add1 = _mm_xor_si128(onekey, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1); - acc = _mm_xor_si128(clprod2, acc); - } - } while (rounds--); - - __m128i tempa3 = _mm_load_si128(prandex); - __m128i tempa4 = _mm_xor_si128(tempa3, acc); - _mm_store_si128(prandex, tempa4); - _mm_store_si128(prand, onekey); + const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const __m128i temp2 = _mm_load_si128_emu(prand); + const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); + acc = _mm_xor_si128_emu(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2); + const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2); + + const __m128i tempb3 = _mm_load_si128_emu(prandex); + _mm_store_si128_emu(prandex, tempa2); + _mm_store_si128_emu(prand, tempb3); break; } case 0x1c: { - __m128i temp1 = _mm_load_si128(pbuf); - __m128i temp2 = _mm_load_si128(prandex); - __m128i add1 = _mm_xor_si128(temp1, temp2); - __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); - acc = _mm_xor_si128(clprod1, acc); + const __m128i temp1 = _mm_load_si128_emu(pbuf); + const __m128i temp2 = _mm_load_si128_emu(prandex); + const __m128i add1 = _mm_xor_si128_emu(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); + acc = _mm_xor_si128_emu(clprod1, acc); - __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); - __m128i tempa2 = _mm_xor_si128(tempa1, temp2); + const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2); + const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2); - __m128i tempa3 = _mm_load_si128(prand); - _mm_store_si128(prand, tempa2); + const __m128i tempa3 = _mm_load_si128_emu(prand); + _mm_store_si128_emu(prand, tempa2); - acc = _mm_xor_si128(tempa3, acc); + acc = _mm_xor_si128_emu(tempa3, acc); - __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); - __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); - _mm_store_si128(prandex, tempb2); + const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3); + const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3); + _mm_store_si128_emu(prandex, tempb2); break; } } - fixrand[i] = prand_idx; - fixrandex[i] = prandex_idx; - } -// printf("acc = %08x\n", _mm_cvtsi128_si64(acc)); - -// exit(0); return acc; } // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, // returning a 64 bit hash value -uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex) { - const unsigned int m = 128;// we process the data in chunks of 16 cache lines - __m128i * rs64 = (__m128i *)random; - const __m128i * string = (const __m128i *) buf; - - __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask, fixrand, fixrandex); - acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64)); - return precompReduction64_port(acc); -} \ No newline at end of file +uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) { + const unsigned int m = 128;// we process the data in chunks of 16 cache lines + __m128i * rs64 = (__m128i *)random; + const __m128i * string = (const __m128i *) buf; + + __m128i acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask); + acc = _mm_xor_si128_emu(acc, lazyLengthHash_port(1024, 64)); + return precompReduction64_port(acc); +} diff --git a/verus/verus_hash.cpp b/verus/verus_hash.cpp index 8b153e83cd..93b7a6e17b 100644 --- a/verus/verus_hash.cpp +++ b/verus/verus_hash.cpp @@ -1,181 +1,590 @@ -// (C) 2018 The Verus Developers -// Distributed under the MIT software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. - /* -This provides the PoW hash function for Verus, a CPU-optimized hash -function with a Haraka V2 core. Unlike Haraka, which is made for short -inputs only, Verus Hash takes any length of input and produces a 256 -bit output. -*/ + * This uses veriations of the clhash algorithm for Verus Coin, licensed + * with the Apache-2.0 open source license. + * + * Copyright (c) 2018 Michael Toutonghi + * Distributed under the Apache 2.0 software license, available in the original form for clhash + * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a + * + * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser + * Faster 64-bit universal hashing + * using carry-less multiplications, Journal of Cryptographic Engineering (to appear) + * + * Best used on recent x64 processors (Haswell or better). + * + * This implements an intermediate step in the last part of a Verus block hash. The intent of this step + * is to more effectively equalize FPGAs over GPUs and CPUs. + * + **/ + + +#include "verus_clhash.h" + + +#include #include -//#include "common.h" -#include "verus_hash.h" +//#include +//#include "cpu_verushash.hpp" -void (*CVerusHash::haraka512Function)(unsigned char *out, const unsigned char *in); +#ifdef _WIN32 +#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) +#endif -void CVerusHash::Hash(void *result, const void *data, size_t _len) -{ - unsigned char buf[128]; - unsigned char *bufPtr = buf; - int nextOffset = 64; - uint32_t pos = 0, len = _len; - unsigned char *bufPtr2 = bufPtr + nextOffset; - unsigned char *ptr = (unsigned char *)data; - - // put our last result or zero at beginning of buffer each time - memset(bufPtr, 0, 32); - - // digest up to 32 bytes at a time - for ( ; pos < len; pos += 32) - { - if (len - pos >= 32) - { - memcpy(bufPtr + 32, ptr + pos, 32); - } - else - { - int i = (int)(len - pos); - memcpy(bufPtr + 32, ptr + pos, i); - memset(bufPtr + 32 + i, 0, 32 - i); - } - (*haraka512Function)(bufPtr2, bufPtr); - bufPtr2 = bufPtr; - bufPtr += nextOffset; - nextOffset *= -1; - } - memcpy(result, bufPtr, 32); -}; -void CVerusHash::init() -{ - - haraka512Function = &haraka512_port_zero; - +int __cpuverusoptimized = 0x80; + +// multiply the length and the some key, no modulo +static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) { + const __m128i lengthvector = _mm_set_epi64x(keylength,length); + const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10); + return clprod1; +} + +// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64 +static inline __m128i precompReduction64_si128( __m128i A) { + + //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0) + const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); + __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01); + __m128i Q3 = _mm_shuffle_epi8(_mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153), + _mm_srli_si128(Q2,8)); + __m128i Q4 = _mm_xor_si128(Q2,A); + const __m128i final = _mm_xor_si128(Q3,Q4); + return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE +} + +static inline uint64_t precompReduction64( __m128i A) { + return _mm_cvtsi128_si64(precompReduction64_si128(A)); } -CVerusHash &CVerusHash::Write(const unsigned char *data, size_t _len) +// verus intermediate hash extra +static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, + uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) { - unsigned char *tmp; - uint32_t pos, len = _len; + __m128i *pbuf; + __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] }; + + // divide key mask by 16 from bytes to __m128i + keyMask >>= 4; + + __m128i acc = _mm_load_si128(randomsource + (keyMask + 2)); + + // the random buffer must have at least 32 16 byte dwords after the keymask to work with this + // algorithm. we take the value from the last element inside the keyMask + 2, as that will never + // be used to xor into the accumulator before it is hashed with other values first +#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) - // digest up to 32 bytes at a time - for ( pos = 0; pos < len; ) +#define LIONELK_FETCH_DIST 0 + +#pragma unroll 32 + + for (uint64_t i = 0; i < 32; i++) { - uint32_t room = 32 - curPos; + + const uint64_t selector = _mm_cvtsi128_si64(acc); - if (len - pos >= room) - { - memcpy(curBuf + 32 + curPos, data + pos, room); - (*haraka512Function)(result, curBuf); - tmp = curBuf; - curBuf = result; - result = tmp; - pos += room; - curPos = 0; - } - else + // get two random locations in the key, which will be mutated and swapped + __m128i *prand = randomsource + ((selector >> 5) & keyMask); + __m128i *prandex = randomsource + ((selector >> 32) & keyMask); + + // select random start and order of pbuf processing + pbuf = pbuf_copy + (selector & 3); + uint32_t prand_idx = (selector >> 5) & keyMask; + uint32_t prandex_idx = (selector >>32) & keyMask; + g_prand[i] = prand[0]; + g_prandex[i] = prandex[0]; + fixrand[i] = prand_idx; + fixrandex[i] = prandex_idx; + + switch (selector & 0x1c) { - memcpy(curBuf + 32 + curPos, data + pos, len - pos); - curPos += len - pos; - pos = len; + + case 0: + { + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prandex, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } + case 4: + { + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10); + acc = _mm_xor_si128(clprod2, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + acc = _mm_xor_si128(add12, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prand, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } + case 8: + { + const __m128i temp1 = _mm_load_si128(prandex); + const __m128i temp2 = _mm_load_si128(pbuf); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + acc = _mm_xor_si128(add1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + const __m128i temp12 = _mm_load_si128(prand); + _mm_store_si128(prand, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + acc = _mm_xor_si128(clprod22, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prandex, tempb2); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } + case 0xc: + { + const __m128i temp1 = _mm_load_si128(prand); + const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + + // cannot be zero here + const int32_t divisor = (uint32_t)selector; + + acc = _mm_xor_si128(add1, acc); + + const int64_t dividend = _mm_cvtsi128_si64(acc); + const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp1); + + if (dividend & 1) + { + const __m128i temp12 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + + const __m128i temp22 = _mm_load_si128(pbuf); + const __m128i add12 = _mm_xor_si128(temp12, temp22); + const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10); + acc = _mm_xor_si128(clprod12, acc); + const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10); + acc = _mm_xor_si128(clprod22, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12); + const __m128i tempb2 = _mm_xor_si128(tempb1, temp12); + _mm_store_si128(prand, tempb2); + } + else + { + const __m128i tempb3 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa2); + _mm_store_si128(prand, tempb3); + } + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } + case 0x10: + { + // a few AES operations + const __m128i *rc = prand; + __m128i tmp; + + __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1)); + __m128i temp2 = _mm_load_si128(pbuf); + + AES2(temp1, temp2, 0); + + MIX2(temp1, temp2); + + AES2(temp1, temp2, 4); + MIX2(temp1, temp2); + + AES2(temp1, temp2, 8); + MIX2(temp1, temp2); + + acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc)); + + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa3); + _mm_store_si128(prand, tempa4); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } + case 0x14: + { + // we'll just call this one the monkins loop, inspired by Chris + const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + uint64_t aesroundoffset = 0,loop_c; + __m128i onekey; + + do + { + loop_c = selector & (((uint64_t)0x10000000) << rounds); + if (loop_c) + { + onekey = _mm_load_si128(rc++); + const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + const __m128i add1 = _mm_xor_si128(onekey, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + + AES2(onekey, temp2, aesroundoffset); + + aesroundoffset += 4; + MIX2(onekey, temp2); + + acc = _mm_xor_si128(onekey, acc); + acc = _mm_xor_si128(temp2, acc); + } + + } while (rounds--); + + const __m128i tempa1 = _mm_load_si128(prand); + const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1); + const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2); + + const __m128i tempa4 = _mm_load_si128(prandex); + _mm_store_si128(prandex, tempa3); + _mm_store_si128(prand, tempa4); + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } + case 0x18: + { + __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1); + __m128i tmp; // used by MIX2 + + uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times + __m128i *rc = prand; + uint64_t aesroundoffset = 0; + __m128i onekey; + + do + { + if (selector & (((uint64_t)0x10000000) << rounds)) + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp); + __m128i add1 = _mm_xor_si128(onekey, temp2); + // cannot be zero here, may be negative + int32_t divisor = (uint32_t)selector; + int64_t dividend = _mm_cvtsi128_si64(add1); + __m128i modulo = _mm_cvtsi32_si128(dividend % divisor); + acc = _mm_xor_si128(modulo, acc); + } + else + { + onekey = _mm_load_si128(rc++); + __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf); + __m128i add1 = _mm_xor_si128(onekey, temp2); + __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1); + acc = _mm_xor_si128(clprod2, acc); + } + } while (rounds--); + + __m128i tempa3 = _mm_load_si128(prandex); + __m128i tempa4 = _mm_xor_si128(tempa3, acc); + _mm_store_si128(prandex, tempa4); + _mm_store_si128(prand, onekey); + break; + } + case 0x1c: + { + const __m128i temp1 = _mm_load_si128(pbuf); + const __m128i temp2 = _mm_load_si128(prandex); + const __m128i add1 = _mm_xor_si128(temp1, temp2); + const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10); + acc = _mm_xor_si128(clprod1, acc); + + const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2); + const __m128i tempa2 = _mm_xor_si128(tempa1, temp2); + + const __m128i tempa3 = _mm_load_si128(prand); +#ifdef VERUSHASHDEBUGo + + printf("[cpu] tempa1 : "); + printf("%016llx%016llx", ((uint64_t*)&tempa1)[0], ((uint64_t*)&tempa1)[1]); + printf("\n"); + printf("[cpu] tempa2 : "); + printf("%016llx%016llx", ((uint64_t*)&tempa2)[0], ((uint64_t*)&tempa2)[1]); + printf("\n"); + printf("[cpu] tempa3 : "); + printf("%016llx%016llx", ((uint64_t*)&tempa3)[0], ((uint64_t*)&tempa3)[1]); + printf("\n"); + +#endif + _mm_store_si128(prand, tempa2); + + acc = _mm_xor_si128(tempa3, acc); + + const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3); + const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3); + _mm_store_si128(prandex, tempb2); + + PREFETCH_T0(randomsource + ((selector >> 5) & keyMask), LIONELK_FETCH_DIST); + PREFETCH_T0(randomsource + ((selector >> 32) & keyMask), LIONELK_FETCH_DIST); + + break; + } } + + + } - return *this; -} -// to be declared and accessed from C -void verus_hash(void *result, const void *data, size_t len) -{ - return CVerusHash::Hash(result, data, len); + return acc; } -void (*CVerusHashV2::haraka512Function)(unsigned char *out, const unsigned char *in); -void (*CVerusHashV2::haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc); -void (*CVerusHashV2::haraka256Function)(unsigned char *out, const unsigned char *in); +// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, +// returning a 64 bit hash value +uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex, + u128 *g_prand, u128 *g_prandex) { + __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex, g_prand, g_prandex); + acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64)); -void CVerusHashV2::init() -{ - if (IsCPUVerusOptimized()) - { - load_constants(); - haraka512Function = &haraka512; - haraka512KeyedFunction = &haraka512_keyed; - haraka256Function = &haraka256; - } - else - { - // load the haraka constants - load_constants_port(); - haraka512Function = &haraka512_port; - haraka512KeyedFunction = &haraka512_port_keyed; - haraka256Function = &haraka256_port; - } + + return precompReduction64(acc); } -void CVerusHashV2::Hash(void *result, const void *data, size_t len) +inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) { + u128 s[4], tmp; + + s[0] = LOAD(in); + s[1] = LOAD(in + 16); + s[2] = LOAD(in + 32); + s[3] = LOAD(in + 48); + + AES4(s[0], s[1], s[2], s[3], 0); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 8); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 16); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 24); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 32); + MIX4(s[0], s[1], s[2], s[3]); + + s[0] = _mm_xor_si128(s[0], LOAD(in)); + s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); + s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); + s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); + + // TRUNCSTORE(out, s[0], s[1], s[2], s[3]); +} +/* +void cpu_verushash::solve_verus_v2_opt(CBlockHeader &bh, + arith_uint256 &target, + std::function cancelf, + std::function&, size_t, const unsigned char*)> solutionf, + std::function hashdonef, + cpu_verushash &device_context) { - unsigned char buf[128]; - unsigned char *bufPtr = buf; - int pos = 0, nextOffset = 64; - unsigned char *bufPtr2 = bufPtr + nextOffset; - unsigned char *ptr = (unsigned char *)data; + CVerusHashV2bWriter &vhw = *(device_context.pVHW2b); + CVerusHashV2 &vh = vhw.GetState(); + verusclhasher &vclh = vh.vclh; + + alignas(32) uint256 curHash, curTarget = ArithToUint256(target); + + const uint64_t *compResult = (uint64_t *)&curHash; + const uint64_t *compTarget = (uint64_t *)&curTarget; + + u128 *hashKey = (u128 *)verusclhasher_key.get(); + verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); + void *hasherrefresh = ((unsigned char *)hashKey) + pdesc->keySizeInBytes; + const int keyrefreshsize = vclh.keyrefreshsize(); // number of 256 bit blocks + + bh.nSolution = std::vector(1344); + bh.nSolution[0] = VERUSHHASH_SOLUTION_VERSION; // earliest VerusHash 2.0 solution version + + // prepare the hash state + vhw.Reset(); + vhw << bh; - // put our last result or zero at beginning of buffer each time - memset(bufPtr, 0, 32); + int64_t *extraPtr = vhw.xI64p(); + unsigned char *curBuf = vh.CurBuffer(); - // digest up to 32 bytes at a time - for ( ; pos < len; pos += 32) + // skip keygen if it is the current key + if (pdesc->seed != *((uint256 *)curBuf)) { - if (len - pos >= 32) + // generate a new key by chain hashing with Haraka256 from the last curbuf + // assume 256 bit boundary + int n256blks = pdesc->keySizeInBytes >> 5; + unsigned char *pkey = ((unsigned char *)hashKey); + unsigned char *psrc = curBuf; + for (int i = 0; i < n256blks; i++) { - memcpy(bufPtr + 32, ptr + pos, 32); + haraka256(pkey, psrc); + psrc = pkey; + pkey += 32; } - else - { - int i = (int)(len - pos); - memcpy(bufPtr + 32, ptr + pos, i); - memset(bufPtr + 32 + i, 0, 32 - i); - } - (*haraka512Function)(bufPtr2, bufPtr); - bufPtr2 = bufPtr; - bufPtr += nextOffset; - nextOffset *= -1; + pdesc->seed = *((uint256 *)curBuf); + memcpy(hasherrefresh, hashKey, pdesc->keySizeInBytes); } - memcpy(result, bufPtr, 32); -}; -CVerusHashV2 &CVerusHashV2::Write(const unsigned char *data, size_t len) -{ - unsigned char *tmp; + const __m128i shuf1 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0); + const __m128i fill1 = _mm_shuffle_epi8(_mm_load_si128((u128 *)curBuf), shuf1); + const __m128i shuf2 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0); + unsigned char ch = curBuf[0]; - // digest up to 32 bytes at a time - for ( int pos = 0; pos < len; ) - { - int room = 32 - curPos; + // loop the requested number of times or until canceled. determine if we + // found a winner, and send all winners found as solutions. count only one hash. + // hashrate is determined by multiplying hash by VERUSHASHES_PER_SOLVE, with VerusHash, only + // hashrate and sharerate are valid, solutionrate will equal sharerate + for (int64_t i = 0; i < VERUSHASHES_PER_SOLVE; i++) + { + *extraPtr = i; - if (len - pos >= room) - { - memcpy(curBuf + 32 + curPos, data + pos, room); - (*haraka512Function)(result, curBuf); - tmp = curBuf; - curBuf = result; - result = tmp; - pos += room; - curPos = 0; - } - else + // prepare the buffer + _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill1); + curBuf[32 + 15] = ch; + + // run verusclhash on the buffer + const uint64_t intermediate = vclh(curBuf, hashKey); + + // fill buffer to the end with the result and final hash + __m128i fill2 = _mm_shuffle_epi8(_mm_loadl_epi64((u128 *)&intermediate), shuf2); + _mm_store_si128((u128 *)(&curBuf[32 + 16]), fill2); + curBuf[32 + 15] = *((unsigned char *)&intermediate); + + haraka512_keyed_local((unsigned char *)&curHash, curBuf, hashKey + vh.IntermediateTo128Offset(intermediate)); + + if (compResult[3] > compTarget[3] || (compResult[3] == compTarget[3] && compResult[2] > compTarget[2]) || + (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] > compTarget[1]) || + (compResult[3] == compTarget[3] && compResult[2] == compTarget[2] && compResult[1] == compTarget[1] && compResult[0] > compTarget[0])) { - memcpy(curBuf + 32 + curPos, data + pos, len - pos); - curPos += len - pos; - pos = len; + // refresh the key + memcpy(hashKey, hasherrefresh, keyrefreshsize); + continue; } - } - return *this; + + std::vector solution = bh.nSolution; + int extraSpace = (solution.size() % 32) + 15; + assert(solution.size() > 32); + *((int64_t *)&(solution.data()[solution.size() - extraSpace])) = i; + + solutionf(std::vector(0), solution.size(), solution.data()); + if (cancelf()) return; + + // refresh the key + memcpy(hashKey, hasherrefresh, keyrefreshsize); + } + hashdonef(); } -// to be declared and accessed from C -void verus_hash_v2(void *result, const void *data, size_t len) + +void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) { + u128 s[4], tmp; + + s[0] = LOAD(in); + s[1] = LOAD(in + 16); + s[2] = LOAD(in + 32); + s[3] = LOAD(in + 48); + + AES4(s[0], s[1], s[2], s[3], 0); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 8); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 16); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 24); + MIX4(s[0], s[1], s[2], s[3]); + + AES4(s[0], s[1], s[2], s[3], 32); + MIX4(s[0], s[1], s[2], s[3]); + + s[0] = _mm_xor_si128(s[0], LOAD(in)); + s[1] = _mm_xor_si128(s[1], LOAD(in + 16)); + s[2] = _mm_xor_si128(s[2], LOAD(in + 32)); + s[3] = _mm_xor_si128(s[3], LOAD(in + 48)); + + TRUNCSTORE(out, s[0], s[1], s[2], s[3]); +} +*/ + +#ifdef _WIN32 + +#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) +#endif + +void *alloc_aligned_buffer(uint64_t bufSize) { - return CVerusHashV2::Hash(result, data, len); + void *answer = NULL; + if (posix_memalign(&answer, sizeof(__m256i), bufSize)) + { + return NULL; + } + else + { + return answer; + } } diff --git a/verus/verus_hash.h b/verus/verus_hash.h index 3805719279..70ef6071e0 100644 --- a/verus/verus_hash.h +++ b/verus/verus_hash.h @@ -1,3 +1,245 @@ -#include +// (C) 2018 Michael Toutonghi +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. -uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t * __restrict fixrand, uint32_t * __restrict fixrandex); +/* +This provides the PoW hash function for Verus, enabling CPU mining. +*/ +#ifndef VERUS_HASH_H_ +#define VERUS_HASH_H_ + +// verbose output when defined +//#define VERUSHASHDEBUG 1 + +#include +#include + +#include "uint256.h" +#include "verus_clhash.h" + +extern "C" +{ +#include "haraka.h" +#include "haraka_portable.h" + +} + +class CVerusHash +{ + public: + static void Hash(void *result, const void *data, size_t len); + static void (*haraka512Function)(unsigned char *out, const unsigned char *in); + + static void init(); + + CVerusHash() { } + + CVerusHash &Write(const unsigned char *data, size_t len); + + CVerusHash &Reset() + { + curBuf = buf1; + result = buf2; + curPos = 0; + std::fill(buf1, buf1 + sizeof(buf1), 0); + return *this; + } + + int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); } + void ClearExtra() + { + if (curPos) + { + std::fill(curBuf + 32 + curPos, curBuf + 64, 0); + } + } + void ExtraHash(unsigned char hash[32]) { (*haraka512Function)(hash, curBuf); } + + void Finalize(unsigned char hash[32]) + { + if (curPos) + { + std::fill(curBuf + 32 + curPos, curBuf + 64, 0); + (*haraka512Function)(hash, curBuf); + } + else + std::memcpy(hash, curBuf, 32); + } + + private: + // only buf1, the first source, needs to be zero initialized + unsigned char buf1[64] = {0}, buf2[64]; + unsigned char *curBuf = buf1, *result = buf2; + size_t curPos = 0; +}; + +class CVerusHashV2 +{ + public: + static void Hash(void *result, const void *data, size_t len); + static void (*haraka512Function)(unsigned char *out, const unsigned char *in); + static void (*haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc); + static void (*haraka256Function)(unsigned char *out, const unsigned char *in); + + static void init(); + + verusclhasher vclh; + + CVerusHashV2() : vclh() { + // we must have allocated key space, or can't run + if (!verusclhasher_key.get()) + { + printf("ERROR: failed to allocate hash buffer - terminating\n"); + assert(false); + } + } + + CVerusHashV2 &Write(const unsigned char *data, size_t len); + + inline CVerusHashV2 &Reset() + { + curBuf = buf1; + result = buf2; + curPos = 0; + std::fill(buf1, buf1 + sizeof(buf1), 0); + + return *this; + + return *this; + + } + + inline int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); } + inline void ClearExtra() + { + if (curPos) + { + std::fill(curBuf + 32 + curPos, curBuf + 64, 0); + } + } + + template + inline void FillExtra(const T *_data) + { + unsigned char *data = (unsigned char *)_data; + int pos = curPos; + int left = 32 - pos; + do + { + int len = left > sizeof(T) ? sizeof(T) : left; + std::memcpy(curBuf + 32 + pos, data, len); + pos += len; + left -= len; + } while (left > 0); + } + inline void ExtraHash(unsigned char hash[32]) { (*haraka512Function)(hash, curBuf); } + inline void ExtraHashKeyed(unsigned char hash[32], u128 *key) { (*haraka512KeyedFunction)(hash, curBuf, key); } + + void Finalize(unsigned char hash[32]) + { + if (curPos) + { + std::fill(curBuf + 32 + curPos, curBuf + 64, 0); + (*haraka512Function)(hash, curBuf); + } + else + std::memcpy(hash, curBuf, 32); + } + + // chains Haraka256 from 32 bytes to fill the key + static u128 *GenNewCLKey(unsigned char *seedBytes32) + { + + unsigned char *key = (unsigned char *)verusclhasher_key.get(); + verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get(); + // skip keygen if it is the current key + if (pdesc->seed != *((uint256 *)seedBytes32)) + { + // generate a new key by chain hashing with Haraka256 from the last curbuf + int n256blks = pdesc->keySizeInBytes >> 5; + int nbytesExtra = pdesc->keySizeInBytes & 0x1f; + unsigned char *pkey = key + pdesc->keySizeInBytes; + unsigned char *psrc = seedBytes32; + for (int i = 0; i < n256blks; i++) + { + (*haraka256Function)(pkey, psrc); + + psrc = pkey; + pkey += 32; + } + if (nbytesExtra) + { + unsigned char buf[32]; + (*haraka256Function)(buf, psrc); + memcpy(pkey, buf, nbytesExtra); + } + pdesc->seed = *((uint256 *)seedBytes32); + } + memcpy(key, key + pdesc->keySizeInBytes, pdesc->keySizeInBytes); + return (u128 *)key; + } + + inline uint64_t IntermediateTo128Offset(uint64_t intermediate) + { + // the mask is where we wrap + uint64_t mask = vclh.keyMask >> 4; + return intermediate & mask; + } + + void Finalize2b(unsigned char hash[32]) + { + // fill buffer to the end with the beginning of it to prevent any foreknowledge of + // bits that may contain zero + //uint8_t temp[64] = { 0x0c, 0x4b, 0x23, 0x67, 0x8e, 0x9d, 0xc3, 0x5e, 0xaa, 0xed, 0x49, 0x3e, 0x32, 0x27, 0x3b, 0x24, 0x3b, 0xae, 0xc9, 0x7b, 0x9a, 0xcc, 0x02, 0x72, 0x38, 0x61, 0xb0, 0xc6, 0x58, 0x30, 0x23, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x4b, 0x23, 0x67, 0x8e, 0x9d, 0xc3, 0x5e, 0xaa, 0xed, 0x49, 0x3e, 0x32, 0x27, 0x3b, 0x24, 0x0c }; + + // memcpy(curBuf, temp, 64); + FillExtra((u128 *)curBuf); + + u128 *key = GenNewCLKey(curBuf); + + uint64_t intermediate = vclh(curBuf, key); + + FillExtra(&intermediate); + + + // get the final hash with a mutated dynamic key for each hash result + (*haraka512KeyedFunction)(hash, curBuf, key + IntermediateTo128Offset(intermediate)); +#ifdef VERUSHASHDEBUG + printf("[cpu]Final hash : "); + for (int i = 0; i < 32; i++) + printf("%02x", ((uint8_t*)&hash[0])[i]); + printf("\n"); +#endif + /* + // TEST BEGIN + // test against the portable version + uint256 testHash1 = *(uint256 *)hash, testHash2; + FillExtra((u128 *)curBuf); + u128 *hashKey = ((u128 *)vclh.gethashkey()); + uint64_t temp = verusclhash_port(key, curBuf, vclh.keyMask); + FillExtra(&temp); + haraka512_keyed((unsigned char *)&testHash2, curBuf, hashKey + IntermediateTo128Offset(intermediate)); + if (testHash1 != testHash2) + { + printf("Portable version failed! intermediate1: %lx, intermediate2: %lx\n", intermediate, temp); + } + // END TEST + */ + } + + inline unsigned char *CurBuffer() + { + return curBuf; + } + + private: + // only buf1, the first source, needs to be zero initialized + alignas(32) unsigned char buf1[64] = {0}, buf2[64]; + unsigned char *curBuf = buf1, *result = buf2; + size_t curPos = 0; +}; + +extern void verus_hash(void *result, const void *data, size_t len); +extern void verus_hash_v2(void *result, const void *data, size_t len); + +#endif diff --git a/verus/verusscan.cpp b/verus/verusscan.cpp index 93156519a0..4d5c2d536c 100644 --- a/verus/verusscan.cpp +++ b/verus/verusscan.cpp @@ -12,20 +12,16 @@ #define VERUS_KEY_SIZE128 552 #include #include - #include "verus_clhash.h" #include "uint256.h" //#include "hash.h" #include //#include "primitives/block.h" +//extern "C" +//{ +//#include "haraka.h" -//#include "SSE2NEON.h" -extern "C" -{ - //#include "haraka.h" -#include "haraka_portable.h" -} - +//} enum { // primary actions @@ -51,7 +47,7 @@ static __thread uint32_t throughput = 0; #define htobe32(x) swab32(x) #endif -void GenNewCLKey(unsigned char *seedBytes32, __m128i *keyback) +extern "C" void GenNewCLKey(unsigned char *seedBytes32, u128 *keyback) { // generate a new key by chain hashing with Haraka256 from the last curbuf int n256blks = VERUS_KEY_SIZE >> 5; //8832 >> 5 @@ -60,7 +56,7 @@ void GenNewCLKey(unsigned char *seedBytes32, __m128i *keyback) unsigned char *psrc = seedBytes32; for (int i = 0; i < n256blks; i++) { - haraka256_port(pkey, psrc); + haraka256(pkey, psrc); psrc = pkey; pkey += 32; @@ -68,17 +64,19 @@ void GenNewCLKey(unsigned char *seedBytes32, __m128i *keyback) if (nbytesExtra) { unsigned char buf[32]; - haraka256_port(buf, psrc); + haraka256(buf, psrc); memcpy(pkey, buf, nbytesExtra); } } -extern "C" void FixKey(uint16_t *fixrand, uint16_t *fixrandex, __m128i *keyback, __m128i *keyback_master) +extern "C" void FixKey(uint32_t *fixrand, uint32_t *fixrandex, u128 *keyback, + u128 * g_prand, u128 *g_prandex) { - for (int i = 0; i < 32; i++) + u128 buf1, buf2; + for (int i = 31; i > -1; i--) { - keyback[fixrand[i]] = keyback_master[fixrand[i]]; - keyback[fixrandex[i]] = keyback_master[fixrandex[i]]; + keyback[fixrandex[i]] = g_prandex[i]; + keyback[fixrand[i]] = g_prand[i]; } } @@ -97,7 +95,7 @@ extern "C" void VerusHashHalf(void *result2, unsigned char *data, size_t len) unsigned char *tmp; - load_constants_port(); + load_constants(); // digest up to 32 bytes at a time for (int pos = 0; pos < len; ) @@ -107,7 +105,7 @@ extern "C" void VerusHashHalf(void *result2, unsigned char *data, size_t len) if (len - pos >= room) { memcpy(curBuf + 32 + curPos, data + pos, room); - haraka512_port(result, curBuf); + haraka512(result, curBuf); tmp = curBuf; curBuf = result; result = tmp; @@ -124,51 +122,43 @@ extern "C" void VerusHashHalf(void *result2, unsigned char *data, size_t len) memcpy(curBuf + 47, curBuf, 16); memcpy(curBuf + 63, curBuf, 1); - // FillExtra((__m128i *)curBuf); + // FillExtra((u128 *)curBuf); memcpy(result2, curBuf, 64); }; -__inline void Verus2hash(unsigned char *hash, unsigned char *curBuf, uint32_t nonce, - __m128i * __restrict data_key, uint8_t *gpu_init, uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex, __m128i * __restrict data_key_master) + + +extern "C" void Verus2hash(unsigned char *hash, unsigned char *curBuf, uint32_t nonce, + u128 *data_key, uint8_t *gpu_init, uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex) { - uint64_t mask = VERUS_KEY_SIZE128; //552 - if (!gpu_init[0]) { - GenNewCLKey(curBuf, data_key); //data_key a global static 2D array data_key[16][8832]; - memcpy(data_key_master, data_key, VERUS_KEY_SIZE); - gpu_init[0] = 1; + //uint64_t mask = VERUS_KEY_SIZE128; //552 - } memcpy(curBuf + 47, curBuf, 16); memcpy(curBuf + 63, curBuf, 1); - // FillExtra((__m128i *)curBuf); - - ((uint32_t*)&curBuf[0])[8] = nonce; - uint64_t intermediate = verusclhash_port(data_key, curBuf, 8191, fixrand, fixrandex); - //FillExtra + // FillExtra((u128 *)curBuf); + + ((uint32_t*)&curBuf[0])[8] = nonce; + uint64_t intermediate = verusclhash(data_key,curBuf, 8191, fixrand, fixrandex, g_prand, g_prandex); + //FillExtra memcpy(curBuf + 47, &intermediate, 8); memcpy(curBuf + 55, &intermediate, 8); memcpy(curBuf + 63, &intermediate, 1); intermediate &= 511; - haraka512_port_keyed(hash, curBuf, data_key + intermediate); - //printf("%08x\n", ((uint32_t*)hash)[7]); exit(0); - FixKey(fixrand, fixrandex, data_key, data_key_master); + haraka512_keyed(hash, curBuf, data_key + intermediate); + FixKey(fixrand, fixrandex, data_key, g_prand, g_prandex); } #ifdef _WIN32 -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) +#define posix_memalign(p, a, s) (((*(p)) = (u128*) _aligned_malloc((s), (a))), *(p) ?0 :errno) #endif -// char *testt = "04000100ffee80b5b3adcc4191edb2f0fd8657b08e4458503cd6bcd54b17f2c2978ac6c571d8c6e549d23f9f5d860a6b72665d4210a6d54401c985c54ebdc7d82e6b93757ec28fb834e1e7db86d3ba2400ef28989318486c035597ad2fe236d67bbc1348ca14ef5d5e4b031dffffff4600000000000000000000000000000000000001000000000000000000fd4005010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000cb9443000000000000000000000000"; - extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) { // unsigned char data[] = { // 0x04, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0x40, 0x05, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - // }; - - - uint32_t _ALIGN(64) endiandata[35]; + //}; + uint32_t _ALIGN(64) endiandata[35]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -176,16 +166,17 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, uint8_t gpuinit = 0; struct timeval tv_start, tv_end, diff; double secs, solps; - //__m128i *data_key; - - //posix_memalign((void**)&data_key,32,VERUS_KEY_SIZE); - - //__m128i *data_key_master = (__m128i *)malloc(VERUS_KEY_SIZE); - __m128i _ALIGN(32) register data_key[VERUS_KEY_SIZE128] = { 0 }; // 552 required - __m128i _ALIGN(32) data_key_master[VERUS_KEY_SIZE128] = { 0 }; + u128 *data_key = (u128*)malloc(VERUS_KEY_SIZE); + + //u128 *data_key_master = NULL; +// posix_memalign((void**)&data_key, sizeof(__m128i), VERUS_KEY_SIZE); + u128 data_key_prand[32]; + u128 data_key_prandex[32]; + //u128 data_key[VERUS_KEY_SIZE128] = { 0 }; // 552 required + //u128 data_key_master[VERUS_KEY_SIZE128] = { 0 }; uint32_t nonce_buf = 0; - uint16_t fixrand[32]; - uint16_t fixrandex[32]; + uint32_t fixrand[32]; + uint32_t fixrandex[32]; unsigned char block_41970[] = { 0xfd, 0x40, 0x05, 0x03 }; uint8_t _ALIGN(64) full_data[140 + 3 + 1344] = { 0 }; @@ -194,15 +185,12 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, memcpy(endiandata, pdata, 140); memcpy(sol_data, block_41970, 4); memcpy(full_data, endiandata, 140); - //memcpy(full_data, data, 1487); - - //for (int i = 0, j = 0; i < 1487; ++i, j += 2) - // sprintf(full_data + j, "%02x", testt[i] & 0xff); - + // memcpy(full_data, data, 1487); uint32_t _ALIGN(64) vhash[8] = { 0 }; VerusHashHalf(blockhash_half, (unsigned char*)full_data, 1487); + GenNewCLKey((unsigned char*)blockhash_half, data_key); //data_key a global static 2D array data_key[16][8832]; @@ -213,9 +201,10 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, do { *hashes_done = nonce_buf + throughput; - Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key, &gpuinit, fixrand, fixrandex, data_key_master); +Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key, + &gpuinit, fixrand, fixrandex , data_key_prand, data_key_prandex); - if (vhash[7] <= Htarg) + if (vhash[7] <= Htarg ) { *((uint32_t *)full_data + 368) = nonce_buf; work->valid_nonces++; @@ -241,13 +230,13 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce, out: - gettimeofday(&tv_end, NULL); - timeval_subtract(&diff, &tv_end, &tv_start); - secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec); - solps = (double)nonce_buf / secs; +// gettimeofday(&tv_end, NULL); +// timeval_subtract(&diff, &tv_end, &tv_start); +// secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec); +// solps = (double)nonce_buf / secs; pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; - //free(data_key); + free(data_key); //free(data_key_master); return work->valid_nonces; } @@ -261,4 +250,4 @@ void free_verushash(int thr_id) init[thr_id] = false; -} \ No newline at end of file +}