From 13e333d3e245d438337cb2a966c56a8e3924b113 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 2 Apr 2018 08:07:33 +0200 Subject: [PATCH] x16: optimised echo512-64 kernel for maxwell+ which keep the SM 3.0 compat with x11 kernel --- Makefile.am | 1 + ccminer.vcxproj | 3 + ccminer.vcxproj.filters | 3 + x16r/cuda_x16_echo512_64.cu | 248 ++++++++++++++++++++++++++++++++++++ x16r/cuda_x16r.h | 4 + x16r/x16r.cu | 12 +- x16r/x16s.cu | 12 +- 7 files changed, 279 insertions(+), 4 deletions(-) create mode 100644 x16r/cuda_x16_echo512_64.cu diff --git a/Makefile.am b/Makefile.am index 0d7ade93fa..901448b42e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -78,6 +78,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ x16r/x16r.cu x16r/x16s.cu x16r/cuda_x16_echo512.cu x16r/cuda_x16_fugue512.cu \ x16r/cuda_x16_shabal512.cu x16r/cuda_x16_simd512_80.cu \ + x16r/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x11/phi.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 01bc4e26fd..38fc198834 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -596,6 +596,9 @@ + + compute_50,sm_50;compute_52,sm_52 + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 0035ff52a6..66abda24ec 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -985,6 +985,9 @@ Source Files\CUDA\x16r + + Source Files\CUDA\x16r + Source Files\CUDA\x16r diff --git a/x16r/cuda_x16_echo512_64.cu b/x16r/cuda_x16_echo512_64.cu new file mode 100644 index 0000000000..ac18ff6885 --- /dev/null +++ b/x16r/cuda_x16_echo512_64.cu @@ -0,0 +1,248 @@ +/** + * Echo512-64 kernel for maxwell, based on alexis work + */ + +#include +#include +#include + +#define INTENSIVE_GMF +#include "tribus/cuda_echo512_aes.cuh" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define atomicExch(p,y) (*p) = y +#endif + +__device__ +static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) + AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0); + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++){ + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i+ 4]; + t[1] = W[i+ 8]; + t[2] = W[i+24]; + t[3] = W[i+60]; + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i +24] = W[i + 56]; + W[i +60] = W[i + 44]; + + W[i +20] = W[i +36]; + W[i +40] = t[1]; + W[i +56] = t[2]; + W[i +44] = W[i +28]; + + W[i +28] = W[i +12]; + W[i +12] = t[3]; + W[i +36] = W[i +52]; + W[i +52] = t[0]; + } + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++){ // Schleife über je 2*uint32_t + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16){ // Schleife über die elemnte + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i +12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__global__ __launch_bounds__(128, 5) /* will force 80 registers */ +static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) +{ + __shared__ uint32_t sharedMemory[4][256]; + + aes_gpu_init128(sharedMemory); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t k0; + uint32_t h[16]; + uint32_t hash[16]; + if (thread < threads) + { + uint32_t *Hash = &g_hash[thread<<4]; + + *(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]); + *(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]); + + *(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0]; + *(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8]; + + __syncthreads(); + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + + k0 = 520; + + #pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) { + AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + } + k0 += 4; + + uint32_t W[64]; + + #pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + + uint32_t t = (ab & 0x80808080); + uint32_t t2 = (bc & 0x80808080); + uint32_t t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[i] = abx ^ bc ^ d; + W[i + 4] = bcx ^ a ^ cd; + W[i + 8] = cdx ^ ab ^ d; + W[i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[i +12]; + b = h[i + 4]; + c = P[i +16]; + d = P[i +20]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = bc ^ d ^ abx; + W[16 + i + 4] = a ^ cd ^ bcx; + W[16 + i + 8] = d ^ ab ^ cdx; + W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = h[i]; + b = P[24 + i + 0]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = bc ^ d ^ abx; + W[32 + i + 4] = a ^ cd ^ bcx; + W[32 + i + 8] = d ^ ab ^ cdx; + W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = bc ^ d ^ abx; + W[48 + i + 4] = a ^ cd ^ bcx; + W[48 + i + 8] = d ^ ab ^ cdx; + W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + } + + for (int k = 1; k < 10; k++) + echo_round_alexis(sharedMemory,W,k0); + + #pragma unroll 4 + for (int i = 0; i < 16; i += 4) + { + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; + } + *(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0]; + *(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8]; + } +} + +__host__ +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ + + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64<<>>(threads, d_hash); +} diff --git a/x16r/cuda_x16r.h b/x16r/cuda_x16r.h index 1eecf38694..67b205a2a3 100644 --- a/x16r/cuda_x16r.h +++ b/x16r/cuda_x16r.h @@ -22,6 +22,10 @@ extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); +// ---- optimised but non compatible kernels + +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + // ---- 80 bytes kernels void quark_bmw512_cpu_setBlock_80(void *pdata); diff --git a/x16r/x16r.cu b/x16r/x16r.cu index 5dfd05c785..3ead3209ab 100644 --- a/x16r/x16r.cu +++ b/x16r/x16r.cu @@ -227,6 +227,7 @@ void whirlpool_midstate(void *state, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; //#define _DEBUG #define _DEBUG_PREFIX "x16r-" @@ -257,6 +258,11 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -267,7 +273,6 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, x11_luffa512_cpu_init(thr_id, throughput); // 64 x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); // 64 - x11_echo512_cpu_init(thr_id, throughput); x16_echo512_cuda_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput); @@ -484,7 +489,10 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, TRACE("simd :"); break; case ECHO: - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; TRACE("echo :"); break; case HAMSI: diff --git a/x16r/x16s.cu b/x16r/x16s.cu index 48e0698e0d..0d6c7b26c9 100644 --- a/x16r/x16s.cu +++ b/x16r/x16s.cu @@ -229,6 +229,7 @@ void whirlpool_midstate(void *state, const void *input) #endif static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; //#define _DEBUG #define _DEBUG_PREFIX "x16s-" @@ -255,6 +256,11 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -265,7 +271,6 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, x11_luffa512_cpu_init(thr_id, throughput); // 64 x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); // 64 - x11_echo512_cpu_init(thr_id, throughput); x16_echo512_cuda_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput); @@ -482,7 +487,10 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, TRACE("simd :"); break; case ECHO: - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; TRACE("echo :"); break; case HAMSI: