diff --git a/src/jogasaki/dist/uniform_key_distribution.cpp b/src/jogasaki/dist/uniform_key_distribution.cpp index 40342542..e9dcf823 100644 --- a/src/jogasaki/dist/uniform_key_distribution.cpp +++ b/src/jogasaki/dist/uniform_key_distribution.cpp @@ -88,20 +88,6 @@ status uniform_key_distribution::highkey(pivot_type& out) { return scan_one(true, out); } -double string_distance(std::string_view lo, std::string_view hi) { - std::size_t len = std::max(lo.size(), hi.size()); - double dist{}; - - double div = 1.0; - for(std::size_t i=0; i < len; ++i) { - int l = i < lo.size() ? lo[i] : 0; - int h = i < hi.size() ? hi[i] : 0; - dist += (h - l) * div; - div /= 256.0; - } - return dist; -} - std::size_t common_prefix_len(std::string_view lo, std::string_view hi) { std::size_t len = std::min(lo.size(), hi.size()); std::size_t i = 0; @@ -115,21 +101,35 @@ std::size_t common_prefix_len(std::string_view lo, std::string_view hi) { } std::vector generate_strings(std::string_view lo, std::string_view hi, std::size_t chars) { + // simple implementation to generate a set of strings contained in the range from lo to hi (exclusive) + // steps are as follows: + // 1. let l and h be the character after the common prefix in lo and hi + // 2. generate strings with prefix + l, prefix + (l + 1), ..., prefix + (h - 1) + // 3. append one of `chars` characters to each string generated in 2 + // 4. Within the `chars` * (h - l) strings generated above, adopt only ones in the range from lo to hi + if(hi < lo) { + // invalid arguments + return {}; + } auto cpl = common_prefix_len(lo, hi); + // characters after the common prefix in lo and hi auto h = static_cast(cpl < hi.size() ? hi[cpl] : 0); auto l = static_cast(cpl < lo.size() ? 
lo[cpl] : 0); std::size_t cnt = h - l; std::vector pivots{}; - pivots.reserve(cnt * chars); + pivots.reserve(cnt * chars); // at maximum - for (std::size_t j = 0; j < cnt; ++j) { + for (std::size_t i = 0; i < cnt; ++i) { std::string prefix{lo.data(), cpl}; - prefix += static_cast(l + j); - - for (std::size_t i = 0; i < chars; ++i) { - pivots.emplace_back(prefix + static_cast(i)); + prefix += static_cast(l + i); + for (std::size_t j = 0; j < chars; ++j) { + std::string p = prefix + static_cast(j); + if(p <= lo || hi <= p) { + continue; + } + pivots.emplace_back(std::move(p)); } } return pivots; diff --git a/src/jogasaki/dist/uniform_key_distribution.h b/src/jogasaki/dist/uniform_key_distribution.h index ddf4f1bb..52131331 100644 --- a/src/jogasaki/dist/uniform_key_distribution.h +++ b/src/jogasaki/dist/uniform_key_distribution.h @@ -26,7 +26,8 @@ namespace jogasaki::dist { /** * @brief key_distribution subclass assuming keys are distributed uniformly - * @details This class assumes that keys are distributed uniformly and calculate the pivots based on this assumption. + * @details This class assumes that keys are distributed uniformly and calculates the pivots + * between the smallest and largest keys. 
*/ class uniform_key_distribution : public key_distribution { public: @@ -87,10 +88,22 @@ class uniform_key_distribution : public key_distribution { status scan_one(bool reverse, uniform_key_distribution::pivot_type& out); }; -double string_distance(std::string_view lo, std::string_view hi); - +/** + * @brief calculate the common prefix length of two strings + * @note the function is public for testing + */ std::size_t common_prefix_len(std::string_view lo, std::string_view hi); +/** + * @brief generate strings between two strings + * @param lo the smaller string + * @param hi the larger string + * @param chars the number of distinct character values per octet (normally 256, but customizable for testing) + * @details the function generates strings between lo and hi (exclusively) + * If the given range is too narrow or invalid (i.e. hi < lo), the function returns an empty vector. + * @return the generated strings + * @note the function is public for testing + */ std::vector generate_strings(std::string_view lo, std::string_view hi, std::size_t chars = 256); diff --git a/test/jogasaki/dist/uniform_distribution_test.cpp b/test/jogasaki/dist/uniform_distribution_test.cpp index 00e4bfc1..c357812a 100644 --- a/test/jogasaki/dist/uniform_distribution_test.cpp +++ b/test/jogasaki/dist/uniform_distribution_test.cpp @@ -54,6 +54,9 @@ class uniform_distribution_test : }; TEST_F(uniform_distribution_test, basic) { + if (jogasaki::kvs::implementation_id() == "memory") { + GTEST_SKIP() << "jogasaki-memory doesn't support uniform key distribution yet"; + } execute_statement("create table t (c0 int primary key)"); execute_statement("insert into t values (1),(2),(3)"); @@ -69,12 +72,17 @@ TEST_F(uniform_distribution_test, basic) { std::string hi{}; std::string lo{}; EXPECT_EQ(status::ok, dist.highkey(hi)); + EXPECT_EQ("\x80\x00\x00\x03"sv, hi); std::cerr << "highkey: " << utils::binary_printer{hi.data(), hi.size()} << std::endl; EXPECT_EQ(status::ok, dist.lowkey(lo)); + 
EXPECT_EQ("\x80\x00\x00\x01"sv, lo); std::cerr << "lowkey: " << utils::binary_printer{lo.data(), lo.size()} << std::endl; } TEST_F(uniform_distribution_test, complex_primary_key) { + if (jogasaki::kvs::implementation_id() == "memory") { + GTEST_SKIP() << "jogasaki-memory doesn't support uniform key distribution yet"; + } execute_statement("create table t (c0 int, c1 int, primary key(c0, c1))"); execute_statement("insert into t values (1,10),(2,20),(3,30)"); @@ -91,8 +99,10 @@ TEST_F(uniform_distribution_test, complex_primary_key) { std::string lo{}; EXPECT_EQ(status::ok, dist.highkey(hi)); std::cerr << "highkey: " << utils::binary_printer{hi.data(), hi.size()} << std::endl; + EXPECT_EQ("\x80\x00\x00\x03\x80\x00\x00\x1e"sv, hi); EXPECT_EQ(status::ok, dist.lowkey(lo)); std::cerr << "lowkey: " << utils::binary_printer{lo.data(), lo.size()} << std::endl; + EXPECT_EQ("\x80\x00\x00\x01\x80\x00\x00\x0a"sv, lo); } TEST_F(uniform_distribution_test, common_prefix_len) { @@ -110,9 +120,9 @@ TEST_F(uniform_distribution_test, common_prefix_len) { EXPECT_EQ(3, common_prefix_len("abcd", "abc")); } -TEST_F(uniform_distribution_test, gen_strings) { +TEST_F(uniform_distribution_test, gen_strings_basic) { auto res = generate_strings("a1", "a3", 3); - EXPECT_EQ(6, res.size()); + ASSERT_EQ(6, res.size()); EXPECT_EQ("a1\x00"sv, res[0]); EXPECT_EQ("a1\x01"sv, res[1]); EXPECT_EQ("a1\x02"sv, res[2]); @@ -121,6 +131,52 @@ TEST_F(uniform_distribution_test, gen_strings) { EXPECT_EQ("a2\x02"sv, res[5]); } +TEST_F(uniform_distribution_test, gen_strings_removing_ones_outside_range) { + // same as gen_strings_basic but removing strings outside the range + auto res = generate_strings("a1\x01"sv, "a3"sv, 3); + ASSERT_EQ(4, res.size()); + EXPECT_EQ("a1\x02"sv, res[0]); + EXPECT_EQ("a2\x00"sv, res[1]); + EXPECT_EQ("a2\x01"sv, res[2]); + EXPECT_EQ("a2\x02"sv, res[3]); +} + +TEST_F(uniform_distribution_test, gen_strings_with_different_length) { + auto res = generate_strings("a"sv, "a\x02"sv, 3); + 
ASSERT_EQ(6, res.size()); + EXPECT_EQ("a\x00\x00"sv, res[0]); + EXPECT_EQ("a\x00\x01"sv, res[1]); + EXPECT_EQ("a\x00\x02"sv, res[2]); + EXPECT_EQ("a\x01\x00"sv, res[3]); + EXPECT_EQ("a\x01\x01"sv, res[4]); + EXPECT_EQ("a\x01\x02"sv, res[5]); +} + +TEST_F(uniform_distribution_test, gen_strings_with_different_length_longer_lo) { + auto res = generate_strings("a\x01"sv, "b"sv, 3); + ASSERT_EQ(1, res.size()); + EXPECT_EQ("a\x02"sv, res[0]); +} + +TEST_F(uniform_distribution_test, gen_strings_same_hi_lo) { + auto res = generate_strings("abc"sv, "abc"sv, 3); + ASSERT_EQ(0, res.size()); +} + +TEST_F(uniform_distribution_test, gen_strings_narrow_range) { + { + // verify that the range is too narrow to generate any strings + auto res = generate_strings("a\x01\xFF"sv, "a\x02"sv, 256); + ASSERT_EQ(0, res.size()); + } + { + // verify that the range is narrow and only one string can be generated + auto res = generate_strings("a\x01\xFE"sv, "a\x02"sv, 256); + ASSERT_EQ(1, res.size()); + EXPECT_EQ("a\x01\xFF"sv, res[0]); + } +} + TEST_F(uniform_distribution_test, compute_pivots) { if (jogasaki::kvs::implementation_id() == "memory") { GTEST_SKIP() << "jogasaki-memory doesn't support uniform key distribution yet";