From a4ad680ff9d245843eb3b9c3c8aa1e0c28e43598 Mon Sep 17 00:00:00 2001 From: Stefan Hermann Date: Fri, 12 Apr 2024 10:19:30 +0200 Subject: [PATCH 1/5] aligned input strings to benchmark framework --- src/build.cpp | 7 ++---- src/util.hpp | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/build.cpp b/src/build.cpp index fc0e0ca..e65bd71 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -481,11 +481,8 @@ int main(int argc, char** argv) { input.close(); } build(parser, keys.begin(), keys.size()); - } else { // use num_keys random 64-bit keys - std::vector keys; - keys.reserve(num_keys); - for (size_t i = 0; i < num_keys; ++i) { keys.push_back(std::to_string(i)); } - build(parser, keys.begin(), keys.size()); + } else { // use num_keys random strings + build(parser, generateBenchmarkInput(num_keys).begin(), num_keys); } return 0; diff --git a/src/util.hpp b/src/util.hpp index 461dba8..12e2dea 100644 --- a/src/util.hpp +++ b/src/util.hpp @@ -11,6 +11,10 @@ #include "utils/util.hpp" #include "essentials.hpp" +#include +#include +#include +#include namespace pthash { @@ -162,6 +166,69 @@ std::vector distinct_keys(uint64_t num_keys, uint64_t seed = constants::in return keys; } + +class XorShift64 { +private: + uint64_t x64; +public: + explicit XorShift64(uint64_t seed = 88172645463325252ull) : x64(seed) { + } + + inline uint64_t operator()() { + x64 ^= x64 << 13; + x64 ^= x64 >> 7; + x64 ^= x64 << 17; + return x64; + } + + inline uint64_t operator()(uint64_t range) { +#ifdef __SIZEOF_INT128__ // then we know we have a 128-bit int + return (uint64_t)(((__uint128_t)operator()() * (__uint128_t)range) >> 64); +#elif defined(_MSC_VER) && defined(_WIN64) + // supported in Visual Studio 2005 and better + uint64_t highProduct; + _umul128(operator()(), range, &highProduct); // ignore output + return highProduct; + unsigned __int64 _umul128( + unsigned __int64 Multiplier, + unsigned __int64 Multiplicand, + unsigned __int64 *HighProduct + ); +#else + return word / (UINT64_MAX / p); // fallback +#endif // __SIZEOF_INT128__ + } +}; + +std::vector generateBenchmarkInput(size_t n) { + std::vector inputData; + inputData.reserve(n); + auto time = std::chrono::system_clock::now(); + long constructionTime = std::chrono::duration_cast(time.time_since_epoch()).count(); + XorShift64 prng(constructionTime); + std::cout<<"Generating input"< bool check(Iterator keys, Function const& f) { __uint128_t n = f.num_keys(); From b1a026953c365e74544514ad2211ab670c94357e Mon Sep 17 00:00:00 2001 From: Stefan Hermann Date: Fri, 12 Apr 2024 14:38:49 +0200 Subject: [PATCH 2/5] fixed several bugs related to table bucketer for large n when no partitioning is used --- include/utils/bucketers.hpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/utils/bucketers.hpp b/include/utils/bucketers.hpp index 9a959c0..aa0bfd3 100644 --- a/include/utils/bucketers.hpp +++ b/include/utils/bucketers.hpp @@ -15,22 +15,21 @@ struct table_bucketer { base.init(num_buckets, lambda, table_size, alpha); fulcrums.push_back(0); - for (size_t xi = 0; xi < FULCS - 1; xi++) { - double x = double(xi) / double(FULCS - 1); + for (size_t xi = 1; xi < FULCS; xi++) { + double x = double(xi) / double(FULCS); double y = base.bucketRelative(x); - auto fulcV = uint64_t(y * double(num_buckets << 16)); + double fulcV = y * num_buckets; fulcrums.push_back(fulcV); } - fulcrums.push_back(num_buckets << 16); + fulcrums.push_back(num_buckets); } inline uint64_t bucket(const uint64_t hash) const { - uint64_t z = (hash & 0xFFFFFFFF) * uint64_t(FULCS - 1); - uint64_t index = z >> 32; - uint64_t part = z & 0xFFFFFFFF; - uint64_t v1 = (fulcrums[index + 0] * part) >> 32; - uint64_t v2 = (fulcrums[index + 1] * (0xFFFFFFFF - part)) >> 32; - return (v1 + v2) >> 16; + uint64_t index = hash % FULCS; + auto hashD = double(hash) / double(~0ul); + uint64_t v1 = (fulcrums[index + 0] * hashD); + uint64_t v2 = (fulcrums[index + 1] * (1.0 - hashD)); + return v1 + v2; } uint64_t num_buckets() const { @@ -38,7 +37,7 @@ struct table_bucketer { } size_t num_bits() const { - return base.num_buckets() + fulcrums.size() * 64; + return base.num_buckets() + fulcrums.size() * sizeof(double) * 8; } template @@ -50,7 +49,7 @@ struct table_bucketer { private: Bucketer base; static const uint64_t FULCS = 2048; - std::vector fulcrums; + std::vector fulcrums; }; struct opt_bucketer { @@ -63,6 +62,7 @@ struct opt_bucketer { void init(const uint64_t num_buckets, const double lambda, const uint64_t table_size, const double alpha) { constexpr double local_collision_factor = 0.3; + constexpr double max_bucket_size_expected = 130.0; m_num_buckets = num_buckets; m_alpha = alpha; if (alpha > 0.9999) { @@ -70,8 +70,8 @@ struct opt_bucketer { } else { m_alpha_factor = 1.0 / baseFunc(alpha); } - slope = std::max( - 0.05, std::min(1.0, local_collision_factor * lambda / std::sqrt((double)table_size))); + slope = std::min(1.0, std::max(lambda / max_bucket_size_expected, local_collision_factor * lambda / std::sqrt((double)table_size))); + } inline double bucketRelative(const double normalized_hash) const { From bfd518af37cb33fa1322db58d0585d84d068b0cd Mon Sep 17 00:00:00 2001 From: Stefan Hermann Date: Fri, 12 Apr 2024 18:25:51 +0200 Subject: [PATCH 3/5] revert bucketer changed --- include/utils/bucketers.hpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/include/utils/bucketers.hpp b/include/utils/bucketers.hpp index aa0bfd3..d797dca 100644 --- a/include/utils/bucketers.hpp +++ b/include/utils/bucketers.hpp @@ -15,21 +15,22 @@ struct table_bucketer { base.init(num_buckets, lambda, table_size, alpha); fulcrums.push_back(0); - for (size_t xi = 1; xi < FULCS; xi++) { - double x = double(xi) / double(FULCS); + for (size_t xi = 0; xi < FULCS - 1; xi++) { + double x = double(xi) / double(FULCS - 1); double y = base.bucketRelative(x); - double fulcV = y * num_buckets; + auto fulcV = uint64_t(y * double(num_buckets << 16)); fulcrums.push_back(fulcV); } - fulcrums.push_back(num_buckets); + fulcrums.push_back(num_buckets << 16); } inline uint64_t bucket(const uint64_t hash) const { - uint64_t index = hash % FULCS; - auto hashD = double(hash) / double(~0ul); - uint64_t v1 = (fulcrums[index + 0] * hashD); - uint64_t v2 = (fulcrums[index + 1] * (1.0 - hashD)); - return v1 + v2; + uint64_t z = (hash & 0xFFFFFFFF) * uint64_t(FULCS - 1); + uint64_t index = z >> 32; + uint64_t part = z & 0xFFFFFFFF; + uint64_t v1 = (fulcrums[index + 0] * part) >> 32; + uint64_t v2 = (fulcrums[index + 1] * (0xFFFFFFFF - part)) >> 32; + return (v1 + v2) >> 16; } uint64_t num_buckets() const { @@ -37,7 +38,7 @@ struct table_bucketer { } size_t num_bits() const { - return base.num_buckets() + fulcrums.size() * sizeof(double) * 8; + return base.num_buckets() + fulcrums.size() * 64; } template @@ -49,7 +50,7 @@ struct table_bucketer { private: Bucketer base; static const uint64_t FULCS = 2048; - std::vector fulcrums; + std::vector fulcrums; }; struct opt_bucketer { @@ -70,7 +71,7 @@ struct opt_bucketer { } else { m_alpha_factor = 1.0 / baseFunc(alpha); } - slope = std::min(1.0, std::max(lambda / max_bucket_size_expected, local_collision_factor * lambda / std::sqrt((double)table_size))); + slope = std::min(1.0, std::max(0.0, local_collision_factor * lambda / std::sqrt((double)table_size))); } From 75caf0e94082a023763c4771f2903acd26612bc8 Mon Sep 17 00:00:00 2001 From: Stefan Hermann Date: Sun, 14 Apr 2024 20:04:25 +0200 Subject: [PATCH 4/5] revert bucketer changed --- include/utils/bucketers.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/utils/bucketers.hpp b/include/utils/bucketers.hpp index d797dca..9a959c0 100644 --- a/include/utils/bucketers.hpp +++ b/include/utils/bucketers.hpp @@ -63,7 +63,6 @@ struct opt_bucketer { void init(const uint64_t num_buckets, const double lambda, const uint64_t table_size, const double alpha) { constexpr double local_collision_factor = 0.3; - constexpr double max_bucket_size_expected = 130.0; m_num_buckets = num_buckets; m_alpha = alpha; if (alpha > 0.9999) { @@ -71,8 +70,8 @@ struct opt_bucketer { } else { m_alpha_factor = 1.0 / baseFunc(alpha); } - slope = std::min(1.0, std::max(0.0, local_collision_factor * lambda / std::sqrt((double)table_size))); - + slope = std::max( + 0.05, std::min(1.0, local_collision_factor * lambda / std::sqrt((double)table_size))); } inline double bucketRelative(const double normalized_hash) const { From cc8572b9f5a4a52e2e99d8d076c2a293479b6aa1 Mon Sep 17 00:00:00 2001 From: Stefan Hermann Date: Tue, 16 Apr 2024 20:52:40 +0200 Subject: [PATCH 5/5] refactor multi--> inter --- include/encoders/dense_encoders.hpp | 32 +++++++++++++-------------- src/build.cpp | 34 ++++++++++++++--------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/include/encoders/dense_encoders.hpp b/include/encoders/dense_encoders.hpp index 3fbf510..8825e84 100644 --- a/include/encoders/dense_encoders.hpp +++ b/include/encoders/dense_encoders.hpp @@ -50,7 +50,7 @@ struct diff { }; template -struct mono_interleaved { +struct dense_mono { template void encode(Iterator begin, // const uint64_t num_partitions, // @@ -89,7 +89,7 @@ struct mono_interleaved { }; template -struct multi_interleaved { +struct dense_interleaved { template void encode(Iterator begin, // const uint64_t num_partitions, // @@ -127,7 +127,7 @@ struct multi_interleaved { } static std::string name() { - return "multi-" + Encoder::name(); + return "inter-" + Encoder::name(); } inline uint64_t access(const uint64_t partition, const uint64_t bucket) const { @@ -151,7 +151,7 @@ struct multi_interleaved { }; template -struct dual_interleaved { +struct dense_dual { template void encode(Iterator begin, // const uint64_t num_partitions, // @@ -207,19 +207,19 @@ struct dual_interleaved { Back m_back; }; -typedef mono_interleaved mono_R; -typedef multi_interleaved multi_R; -typedef mono_interleaved mono_C; -typedef multi_interleaved multi_C; -typedef mono_interleaved mono_D; -typedef multi_interleaved multi_D; -typedef mono_interleaved mono_EF; -typedef multi_interleaved multi_EF; +typedef dense_mono mono_R; +typedef dense_interleaved inter_R; +typedef dense_mono mono_C; +typedef dense_interleaved inter_C; +typedef dense_mono mono_D; +typedef dense_interleaved inter_D; +typedef dense_mono mono_EF; +typedef dense_interleaved inter_EF; /* dual_interleaved encoders */ -typedef dual_interleaved mono_C_mono_R; -typedef dual_interleaved multi_C_multi_R; -typedef dual_interleaved mono_D_mono_R; -typedef dual_interleaved multi_D_multi_R; +typedef dense_dual mono_C_mono_R; +typedef dense_dual inter_C_inter_R; +typedef dense_dual mono_D_mono_R; +typedef dense_dual inter_D_inter_R; } // namespace pthash \ No newline at end of file diff --git a/src/build.cpp b/src/build.cpp index e65bd71..8408c45 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -152,7 +152,7 @@ void choose_dual_encoder_tradeoff(build_parameters const& params, if (tradeoff == uint64_t(std::round(params.dual_encoder_tradeoff * granularity))) { choose_needs_free_array< Builder, Iterator, search_type, - dual_interleaved>(builder, timings, + dense_dual>(builder, timings, params, config); } if constexpr (tradeoff > 0) { @@ -232,48 +232,48 @@ void choose_encoder(build_parameters const& params, build_configuratio choose_needs_free_array(builder, timings, params, config); } - if (encode_all or params.encoder_type == "multi-R") { - choose_needs_free_array(builder, timings, + if (encode_all or params.encoder_type == "inter-R") { + choose_needs_free_array(builder, timings, params, config); } if (encode_all or params.encoder_type == "mono-C") { choose_needs_free_array(builder, timings, params, config); } - if (encode_all or params.encoder_type == "multi-C") { - choose_needs_free_array(builder, timings, + if (encode_all or params.encoder_type == "inter-C") { + choose_needs_free_array(builder, timings, params, config); } if (encode_all or params.encoder_type == "mono-D") { choose_needs_free_array(builder, timings, params, config); } - if (encode_all or params.encoder_type == "multi-D") { - choose_needs_free_array(builder, timings, + if (encode_all or params.encoder_type == "inter-D") { + choose_needs_free_array(builder, timings, params, config); } if (encode_all or params.encoder_type == "mono-EF") { choose_needs_free_array(builder, timings, params, config); } - if (encode_all or params.encoder_type == "multi-EF") { - choose_needs_free_array(builder, timings, + if (encode_all or params.encoder_type == "inter-EF") { + choose_needs_free_array(builder, timings, params, config); } if (encode_all or params.encoder_type == "mono-C-mono-R") { choose_dual_encoder_tradeoff(params, config, builder, timings); } - if (encode_all or params.encoder_type == "multi-C-multi-R") { - choose_dual_encoder_tradeoff(params, config, + if (encode_all or params.encoder_type == "inter-C-inter-R") { + choose_dual_encoder_tradeoff(params, config, builder, timings); } if (encode_all or params.encoder_type == "mono-D-mono-R") { choose_dual_encoder_tradeoff(params, config, builder, timings); } - if (encode_all or params.encoder_type == "multi-D-multi-R") { - choose_dual_encoder_tradeoff(params, config, + if (encode_all or params.encoder_type == "inter-D-inter-R") { + choose_dual_encoder_tradeoff(params, config, builder, timings); } @@ -356,8 +356,8 @@ void build(cmd_line_parser::parser const& parser, Iterator keys, uint64_t num_ke /* only for dense partitioning */ "mono-R", "mono-C", "mono-D", "mono-EF", // mono - "multi-R", "multi-C", "multi-D", "multi-EF", // multi - "mono-C-mono-R", "multi-C-multi-R", "mono-D-mono-R", "multi-D-multi-R", // dual + "inter-R", "inter-C", "inter-D", "inter-EF", // inter + "mono-C-mono-R", "inter-C-inter-R", "mono-D-mono-R", "inter-D-inter-R", // dual /**/ "all" // @@ -436,8 +436,8 @@ int main(int argc, char** argv) { "The encoder type. Possibile values are: " "'R-R', 'PC', 'D-D', 'EF', " "'mono-R', 'mono-C', 'mono-D', 'mono-EF', " - "'multi-R', 'multi-C', 'multi-D', 'multi-EF', " - "'mono-C-mono-R', 'multi-C-multi-R', 'mono-D-mono-R', 'multi-D-multi-R', " + "'inter-R', 'inter-C', 'inter-D', 'inter-EF', " + "'mono-C-mono-R', 'inter-C-inter-R', 'mono-D-mono-R', 'inter-D-inter-R', " "'all'.\n\t" "The 'all' type will just benchmark all encoders. (Useful for benchmarking " "purposes.)",