diff --git a/kernel.cu b/kernel.cu index 6af55b8..532bc24 100644 --- a/kernel.cu +++ b/kernel.cu @@ -172,17 +172,17 @@ void vblake512_compress(uint64_t *h, const uint64_t *block, const uint8_t((*sigm } h[0] ^= v[0] ^ v[8]; - h[1] ^= v[1] ^ v[9]; - h[2] ^= v[2] ^ v[10]; +// h[1] ^= v[1] ^ v[9]; +// h[2] ^= v[2] ^ v[10]; h[3] ^= v[3] ^ v[11]; - h[4] ^= v[4] ^ v[12]; - h[5] ^= v[5] ^ v[13]; +// h[4] ^= v[4] ^ v[12]; +// h[5] ^= v[5] ^ v[13]; h[6] ^= v[6] ^ v[14]; - h[7] ^= v[7] ^ v[15]; +// h[7] ^= v[7] ^ v[15]; h[0] ^= h[3] ^ h[6]; //copied from the java - h[1] ^= h[4] ^ h[7]; - h[2] ^= h[5]; + //h[1] ^= h[4] ^ h[7]; + //h[2] ^= h[5]; } __device__ __forceinline__ uint64_t vBlake2(const uint64_t h0, const uint64_t h1, const uint64_t h2, const uint64_t h3, const uint64_t h4, const uint64_t h5, const uint64_t h6, const uint64_t h7) @@ -235,6 +235,7 @@ bool verboseOutput = false; /* * Kernel function to search a range of nonces for a solution falling under the macro-configured difficulty (CPU=2^24, GPU=2^32). */ +__launch_bounds__(256, 2) __global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t *hashStartOut, uint64_t const *headerIn) { // Generate a unique starting nonce for each thread that doesn't overlap with the work of any other thread