diff --git a/CMakeLists.txt b/CMakeLists.txt index 0031296..173f599 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) GIT_TAG main CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release ) + set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE) FetchContent_MakeAvailable(google_benchmark) endif() diff --git a/examples/pi_cuda.cu b/examples/pi_cuda.cu index dc9e427..5936f0c 100644 --- a/examples/pi_cuda.cu +++ b/examples/pi_cuda.cu @@ -32,7 +32,10 @@ */ #include +#include #include +#include +#include #include #include @@ -42,7 +45,7 @@ const int SAMPLES_PER_THREAD = 1000; // Number of samples per thread const int NTHREADS = N / SAMPLES_PER_THREAD; // Number of threads const int THREADS_PER_BLOCK = 256; // Number of threads per block -typedef openrand::Tyche RNG; +typedef openrand::Phillox RNG; __global__ void monteCarloPi(int *d_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -77,9 +80,11 @@ int main() { int h_sum; cudaMemcpy(&h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost); - float pi = 4.0 * (float)h_sum / N; + double pi_estimate = 4.0 * (float)h_sum / N; - std::cout << "Approximated value of Pi: " << pi << std::endl; + constexpr double pi = 3.14159265358979323846; + std::cout << "pi_estimate: " << pi_estimate << std::endl; + std::cout << "log10(|pi - pi_estimate|): " << std::log10(std::abs(pi - pi_estimate)) << std::endl; cudaFree(d_sum); diff --git a/examples/pi_openmp.cpp b/examples/pi_openmp.cpp index 656213c..74eac6b 100644 --- a/examples/pi_openmp.cpp +++ b/examples/pi_openmp.cpp @@ -70,6 +70,8 @@ int main() { double pi_estimate = compute_pi(); constexpr double pi = 3.14159265358979323846; + + std::cout << "pi_estimate: " << pi_estimate << std::endl; std::cout << "log10(|pi - pi_estimate|): " << std::log10(std::abs(pi - pi_estimate)) << std::endl; return 0; diff --git a/include/openrand/phillox.h b/include/openrand/phillox.h index 8e4b114..cd33f8d 100644 --- a/include/openrand/phillox.h +++ b/include/openrand/phillox.h @@ -35,30 +35,10 @@ #include #include -namespace { - -constexpr uint32_t PHILOX_M4x32_0 = 0xD2511F53; -constexpr uint32_t PHILOX_M4x32_1 = 0xCD9E8D57; -constexpr uint32_t PHILOX_W32_0 = 0x9E3779B9; -constexpr uint32_t PHILOX_W32_1 = 0xBB67AE85; - -inline DEVICE uint32_t mulhilo(uint32_t L, uint32_t R, uint32_t *hip) { - uint64_t product = static_cast(L) * static_cast(R); - *hip = product >> 32; - return static_cast(product); -} - -inline DEVICE void round(const uint32_t (&key)[2], uint32_t (&ctr)[4]) { - uint32_t hi0; - uint32_t hi1; - uint32_t lo0 = mulhilo(PHILOX_M4x32_0, ctr[0], &hi0); - uint32_t lo1 = mulhilo(PHILOX_M4x32_1, ctr[2], &hi1); - ctr[0] = hi1 ^ ctr[1] ^ key[0]; - ctr[1] = lo1; - ctr[2] = hi0 ^ ctr[3] ^ key[1]; - ctr[3] = lo0; -} -} // namespace +#define PHILOX_W0 0x9E3779B9 +#define PHILOX_W1 0xBB67AE85 +#define PHILOX_M0 0xD2511F53 +#define PHILOX_M1 0xCD9E8D57 namespace openrand { @@ -66,6 +46,7 @@ namespace openrand { * @class Phillox * @brief Phillox generator * @note This is a modified version of Phillox generator from Random123 library. + * This uses 4x 32-bit counter, 2x 32-bit key along with 10 rounds. */ class Phillox : public BaseRNG { public: @@ -118,14 +99,31 @@ class Phillox : public BaseRNG { for (int r = 0; r < 10; r++) { if (r > 0) { - key[0] += PHILOX_W32_0; - key[1] += PHILOX_W32_1; + key[0] += PHILOX_W0; + key[1] += PHILOX_W1; } round(key, _out); } _ctr++; } + inline DEVICE uint32_t mulhilo(uint32_t L, uint32_t R, uint32_t *hip) { + uint64_t product = static_cast(L) * static_cast(R); + *hip = product >> 32; + return static_cast(product); + } + + inline DEVICE void round(const uint32_t (&key)[2], uint32_t (&ctr)[4]) { + uint32_t hi0; + uint32_t hi1; + uint32_t lo0 = mulhilo(PHILOX_M0, ctr[0], &hi0); + uint32_t lo1 = mulhilo(PHILOX_M1, ctr[2], &hi1); + ctr[0] = hi1 ^ ctr[1] ^ key[0]; + ctr[1] = lo1; + ctr[2] = hi0 ^ ctr[3] ^ key[1]; + ctr[3] = lo0; + } + // User provided seed and counter broken up, constant throughout // the lifetime of the object const uint32_t seed_hi, seed_lo; diff --git a/tests/run_stat_tests.py b/tests/run_stat_tests.py index 321cc82..8c2a0de 100644 --- a/tests/run_stat_tests.py +++ b/tests/run_stat_tests.py @@ -22,7 +22,7 @@ PRACT_RND_EXEC = os.path.join(BUILD_DIR, "Practrand", "RNG_test") for gen in ["philox", "tyche", "threefry", "squares"]: - command = f"{BUILD_DIR}/tests/pract_rand_multi {gen} | {PRACT_RND_EXEC} stdin32 -multithreaded -tlmax 4GB > {RES_DIR}/practrandm_{gen}.txt" + command = f"{BUILD_DIR}/tests/pract_rand_multi {gen} | {PRACT_RND_EXEC} stdin32 -multithreaded -tlmax 8GB > {RES_DIR}/practrandm_{gen}.txt" p = subprocess.Popen(command, shell=True) p.name = f"practrand_{gen}"