From 6e4e769d91b862449f7d8439daf9340b999ea917 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Wed, 18 Sep 2013 11:25:34 +0200 Subject: [PATCH 01/15] Fixed and extended lex_smaller_count --- include/sdsl/wt_pc.hpp | 44 ++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/include/sdsl/wt_pc.hpp b/include/sdsl/wt_pc.hpp index f567d3b0d..b6c37394c 100644 --- a/include/sdsl/wt_pc.hpp +++ b/include/sdsl/wt_pc.hpp @@ -488,6 +488,8 @@ class wt_pc * * \par Precondition * \f$ i \leq j \leq n \f$ + * \note + * This method is only available if lex_ordered = true */ template> typename std::enable_if::type @@ -509,7 +511,7 @@ class wt_pc uint64_t p = m_tree.bit_path(c); uint32_t path_len = p>>56; if (path_len == 0) { // path_len=0: => c is not present - value_type _c = (value_type)(p&0x00FFFFFFFFFFFFFFULL); + value_type _c = (value_type)p; if (c == _c) { // c is smaller than any symbol in wt return t_ret_type {0, 0, j-i}; } @@ -552,22 +554,40 @@ class wt_pc /*! * \param i Exclusive right bound of the range (\f$i\in[0..size()]\f$). * \param c Symbol c. - * \return Number of characters in [0..i-1], which are smaller than - * c. If c does not occur in the sequence 0 is returned. + * \return A tuple containing: + * * #symbols smaller than c in [0..i-1] + * * rank(c,i) + * \par Precondition + * \f$ i \leq n \f$ * \note * This method is only available if lex_ordered = true */ - template - typename std::enable_if::type + template> + typename std::enable_if::type lex_smaller_count(size_type i, value_type c)const { assert(i <= size()); - // if c does not occur in the sequence - if (!m_tree.is_valid(m_tree.c_to_leaf(c))) - return 0; + if (1==m_sigma) { + value_type _c = m_tree.bv_pos_rank(m_tree.root()); + if (c == _c) { // c is the only symbol in the wt + return t_ret_type {0,i}; + } else if (c < _c) { + return t_ret_type {0,0}; + } else { + return t_ret_type {i,0}; + } + } + uint64_t p = m_tree.bit_path(c); + uint32_t path_len = p>>56; + if (path_len == 0) { // path_len=0: => c is not present + value_type _c = (value_type)p; + if (c == _c) { // c is smaller than any symbol in wt + return t_ret_type {0, 0}; + } + auto res = lex_smaller_count(i, _c); + return t_ret_type {std::get<0>(res)+std::get<1>(res),0}; + } size_type result = 0; size_type all = i; // possible occurrences of c - uint64_t p = m_tree.bit_path(c); - uint32_t path_len = (p>>56); node_type v = m_tree.root(); for (uint32_t l=0; l>= 1) { size_type ones = (m_bv_rank(m_tree.bv_pos(v)+all) @@ -578,9 +598,9 @@ class wt_pc } else { all -= ones; } - v = m_tree.child(v, p&2); + v = m_tree.child(v, p&1); } - return result; + return t_ret_type {result,all}; } //! How many symbols are lexicographic smaller than c in [i..j-1]. From 986ef8641eb3cf4ea3eb36fbd3af7bd7010750c6 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Wed, 18 Sep 2013 11:26:24 +0200 Subject: [PATCH 02/15] Added test for lex_smaller_count --- test/WtByteTest.cpp | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/test/WtByteTest.cpp b/test/WtByteTest.cpp index a206cf0b2..d4e71110d 100644 --- a/test/WtByteTest.cpp +++ b/test/WtByteTest.cpp @@ -255,31 +255,41 @@ void test_lex_count(t_T& wt) std::mt19937_64 rng; std::uniform_int_distribution distribution(0, wt.size()); auto dice = bind(distribution, rng); - for (size_type t=0; t<10000; ++t) { - size_type l = dice(); - size_type r = dice(); - if (r rank_c_i_n(256,0); std::vector rank_c_j_n(256,0); for (size_type c=0; c<256; ++c) { - size_type tmp_j = wt.rank(r,(value_type)c); - size_type tmp_i = wt.rank(l,(value_type)c); - rank_c_j_n[c] = tmp_j; - rank_c_i_n[c] = tmp_i; + rank_c_i_n[c] = wt.rank(i,(value_type)c); + rank_c_j_n[c] = wt.rank(j,(value_type)c); } + size_type num_i_s = 0; + size_type num_j_s = 0; size_type num_c = 0; size_type num_s = 0; - size_type num_g = r-l; + size_type num_g = j-i; for (size_type c=0; c<256; ++c) { + // Test lex_count num_s += num_c; num_c = rank_c_j_n[c]-rank_c_i_n[c]; num_g -= num_c; - auto res = wt.lex_count(l, r, (value_type)c); + auto res = wt.lex_count(i, j, (value_type)c); ASSERT_EQ(rank_c_i_n[c], std::get<0>(res)); ASSERT_EQ(num_s, std::get<1>(res)); ASSERT_EQ(num_g, std::get<2>(res)); + // Test lex_smaller_count + auto res2 = wt.lex_smaller_count(i, (value_type)c); + ASSERT_EQ(num_i_s, std::get<0>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(rank_c_i_n[c], std::get<1>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; + num_i_s += rank_c_i_n[c]; + auto res3 = wt.lex_smaller_count(j, (value_type)c); + ASSERT_EQ(num_j_s, std::get<0>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(rank_c_j_n[c], std::get<1>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; + num_j_s += rank_c_j_n[c]; } } } From d8081bb91b2481bdac6db50b799d0c4023dd7feb Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Wed, 18 Sep 2013 13:51:02 +0200 Subject: [PATCH 03/15] Improved rank and lex_smaller_count --- include/sdsl/wt_pc.hpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/include/sdsl/wt_pc.hpp b/include/sdsl/wt_pc.hpp index b6c37394c..82e1ef4b4 100644 --- a/include/sdsl/wt_pc.hpp +++ b/include/sdsl/wt_pc.hpp @@ -34,8 +34,6 @@ namespace sdsl { -const int_vector<>::size_type ZoO[2] = {0, (int_vector<>::size_type)-1}; - //! A prefix code-shaped wavelet. /*! * \tparam t_shape Shape of the tree (). @@ -326,13 +324,13 @@ class wt_pc uint64_t p = m_tree.bit_path(c); // path_len == 0, if `c` was not in the text or m_sigma=1 uint32_t path_len = (p>>56); - if (!path_len and 1 == m_sigma) { - if (!m_tree.is_valid(m_tree.c_to_leaf(c))) { // if `c` was not in the text + if (!path_len) { + if (m_sigma > 1 or !m_tree.is_valid(m_tree.c_to_leaf(c))) { // if `c` was not in the text return 0; } - return std::min(i, m_size); // if m_sigma == 1 answer is trivial + return i; // if m_sigma == 1 answer is trivial } - size_type result = i & ZoO[path_len>0]; + size_type result = i; node_type v = m_tree.root(); for (uint32_t l=0; l>= 1) { if (p&1) { @@ -524,24 +522,17 @@ class wt_pc size_type res2 = j; node_type v = m_tree.root(); for (uint32_t l=0; l>= 1) { - if (p&1) { - size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+res1) - - m_tree.bv_pos_rank(v)); - size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+res2) - - m_tree.bv_pos_rank(v)); + size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+res1) + - m_tree.bv_pos_rank(v)); + size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+res2) + - m_tree.bv_pos_rank(v)); + if (p&1) { smaller += res2 - r1_2 - res1 + r1_1; - res1 = r1_1; res2 = r1_2; } else { - size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+res1) - - m_tree.bv_pos_rank(v)); - size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+res2) - - m_tree.bv_pos_rank(v)); - greater += r1_2 - r1_1; - res1 -= r1_1; res2 -= r1_2; } @@ -589,7 +580,7 @@ class wt_pc size_type result = 0; size_type all = i; // possible occurrences of c node_type v = m_tree.root(); - for (uint32_t l=0; l>= 1) { + for (uint32_t l=0; l>= 1) { size_type ones = (m_bv_rank(m_tree.bv_pos(v)+all) - m_tree.bv_pos_rank(v)); if (p&1) { From 40da8dc9b9f38a91f04bc997f21e2d6e58ac0230 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Fri, 20 Sep 2013 11:20:28 +0200 Subject: [PATCH 04/15] Fixed bit_path --- include/sdsl/wt_helper.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/sdsl/wt_helper.hpp b/include/sdsl/wt_helper.hpp index 493e4bbd9..9ef2cd16a 100644 --- a/include/sdsl/wt_helper.hpp +++ b/include/sdsl/wt_helper.hpp @@ -526,6 +526,9 @@ struct _int_tree { //! Return the path as left/right bit sequence in a uint64_t inline uint64_t bit_path(value_type c)const { + if (c >= m_path.size()) { + return m_path.size()-1; + } return m_path[c]; } From 4bcb7d9ca88314e153777cafc7e126e2a615a906 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Fri, 20 Sep 2013 11:26:12 +0200 Subject: [PATCH 05/15] Removed lex_smaller_count(i,j,c) use lex_count instead --- include/sdsl/wt_pc.hpp | 76 +++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 49 deletions(-) diff --git a/include/sdsl/wt_pc.hpp b/include/sdsl/wt_pc.hpp index 82e1ef4b4..86b9080dc 100644 --- a/include/sdsl/wt_pc.hpp +++ b/include/sdsl/wt_pc.hpp @@ -310,26 +310,27 @@ class wt_pc return m_tree.bv_pos_rank(v); }; - //! Calculates how many symbols c are in the prefix [0..i-1]. + //! Calculates how many symbols c are in the prefix [0..min(i,size())-1]. /*! - * \param i Exclusive right bound of the range (\f$i\in[0..size()]\f$). + * \param i Exclusive right bound of the range. * \param c Symbol c. - * \return Number of occurrences of symbol c in the prefix [0..i-1]. + * \return Number of occurrences of symbol c in the prefix [0..min(i,size())-1]. * \par Time complexity * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the * zero order entropy of the sequence */ size_type rank(size_type i, value_type c)const { - assert(i <= size()); - uint64_t p = m_tree.bit_path(c); - // path_len == 0, if `c` was not in the text or m_sigma=1 - uint32_t path_len = (p>>56); - if (!path_len) { - if (m_sigma > 1 or !m_tree.is_valid(m_tree.c_to_leaf(c))) { // if `c` was not in the text - return 0; - } + if (i>size()) { + i = size(); + } + if (!m_tree.is_valid(m_tree.c_to_leaf(c))) { + return 0; // if `c` was not in the text + } + if (m_sigma == 1) { return i; // if m_sigma == 1 answer is trivial } + uint64_t p = m_tree.bit_path(c); + uint32_t path_len = (p>>56); size_type result = i; node_type v = m_tree.root(); for (uint32_t l=0; l>= 1) { @@ -516,29 +517,26 @@ class wt_pc auto res = lex_count(i, j, _c); return t_ret_type {0, j-i-std::get<2>(res),std::get<2>(res)}; } - size_type smaller = 0; - size_type greater = 0; - size_type res1 = i; - size_type res2 = j; + size_type smaller = 0, greater = 0; node_type v = m_tree.root(); for (uint32_t l=0; l>= 1) { - size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+res1) + size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+i) - m_tree.bv_pos_rank(v)); - size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+res2) + size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+j) - m_tree.bv_pos_rank(v)); if (p&1) { - smaller += res2 - r1_2 - res1 + r1_1; - res1 = r1_1; - res2 = r1_2; + smaller += j - r1_2 - i + r1_1; + i = r1_1; + j = r1_2; } else { greater += r1_2 - r1_1; - res1 -= r1_1; - res2 -= r1_2; + i -= r1_1; + j -= r1_2; } v = m_tree.child(v, p&1); } - return t_ret_type {res1,smaller, greater}; + return t_ret_type {i, smaller, greater}; }; //! How many symbols are lexicographic smaller than c in [0..i-1]. @@ -546,8 +544,8 @@ class wt_pc * \param i Exclusive right bound of the range (\f$i\in[0..size()]\f$). * \param c Symbol c. * \return A tuple containing: - * * #symbols smaller than c in [0..i-1] * * rank(c,i) + * * #symbols smaller than c in [0..i-1] * \par Precondition * \f$ i \leq n \f$ * \note @@ -560,13 +558,14 @@ class wt_pc if (1==m_sigma) { value_type _c = m_tree.bv_pos_rank(m_tree.root()); if (c == _c) { // c is the only symbol in the wt - return t_ret_type {0,i}; + return t_ret_type {i,0}; } else if (c < _c) { return t_ret_type {0,0}; } else { - return t_ret_type {i,0}; + return t_ret_type {0,i}; } } + uint64_t p = m_tree.bit_path(c); uint32_t path_len = p>>56; if (path_len == 0) { // path_len=0: => c is not present @@ -575,7 +574,7 @@ class wt_pc return t_ret_type {0, 0}; } auto res = lex_smaller_count(i, _c); - return t_ret_type {std::get<0>(res)+std::get<1>(res),0}; + return t_ret_type {0, std::get<0>(res)+std::get<1>(res)}; } size_type result = 0; size_type all = i; // possible occurrences of c @@ -591,28 +590,7 @@ class wt_pc } v = m_tree.child(v, p&1); } - return t_ret_type {result,all}; - } - - //! How many symbols are lexicographic smaller than c in [i..j-1]. - /*! - * \param i Start index (inclusive) of the interval. - * \param j End index (exclusive) of the interval. - * \return Number of characters in [i..j-1], which are smaller than - * c. If c does not occur in the sequence 0 is returned. - * \note - * This method is only available if lex_ordered = true - */ - template - typename std::enable_if::type - lex_smaller_count(size_type i, size_type j, value_type c)const { - if (i==j) - return 0; - if (i+1 == j) { - return (*this)[i] < c; - } else { - return count_lex_smaller(j, c) - count_lex_smaller(i, c); - } + return t_ret_type {all, result}; } //! Returns a const_iterator to the first element. From c4d30b3c296f99a4fe86f91b21ee8aac679b8897 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Fri, 20 Sep 2013 12:11:29 +0200 Subject: [PATCH 06/15] Adjusted to return value order of lex_smaller_count --- test/WtByteTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/WtByteTest.cpp b/test/WtByteTest.cpp index d4e71110d..d723de880 100644 --- a/test/WtByteTest.cpp +++ b/test/WtByteTest.cpp @@ -283,12 +283,12 @@ void test_lex_count(t_T& wt) ASSERT_EQ(num_g, std::get<2>(res)); // Test lex_smaller_count auto res2 = wt.lex_smaller_count(i, (value_type)c); - ASSERT_EQ(num_i_s, std::get<0>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; - ASSERT_EQ(rank_c_i_n[c], std::get<1>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(rank_c_i_n[c], std::get<0>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(num_i_s, std::get<1>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; num_i_s += rank_c_i_n[c]; auto res3 = wt.lex_smaller_count(j, (value_type)c); - ASSERT_EQ(num_j_s, std::get<0>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; - ASSERT_EQ(rank_c_j_n[c], std::get<1>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(rank_c_j_n[c], std::get<0>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(num_j_s, std::get<1>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; num_j_s += rank_c_j_n[c]; } } From 82c72aef2343c389b83149a37aa28e96e61f9f5e Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Fri, 20 Sep 2013 12:54:50 +0200 Subject: [PATCH 07/15] Added lex_count method in wt_int.hpp --- include/sdsl/wt_int.hpp | 45 ++++++++++++++++++++++++ test/WtIntTest.cpp | 77 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 121 insertions(+), 1 deletion(-) diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp index 34f1131bd..cf0dddedb 100644 --- a/include/sdsl/wt_int.hpp +++ b/include/sdsl/wt_int.hpp @@ -383,6 +383,51 @@ class wt_int return i-1; }; + + //! How many symbols are lexicographic smaller/greater than c in [i..j-1]. + /*! + * \param i Start index (inclusive) of the interval. + * \param j End index (exclusive) of the interval. + * \param c Symbol c. + * \return A triple containing: + * * rank(c,i) + * * #symbols smaller than c in [i..j-1] + * * #symbols greater than c in [i..j-1] + * + * \par Precondition + * \f$ i \leq j \leq n \f$ + */ + template> + t_ret_type lex_count(size_type i, size_type j, value_type c)const { + assert(i <= j and j <= size()); + size_type offset = 0; + size_type smaller = 0; + size_type greater = 0; + uint64_t mask = (1ULL) << (m_max_depth-1); + size_type node_size = m_size; + for (uint32_t k=0; k < m_max_depth; ++k) { + size_type ones_before_o = m_tree_rank(offset); + size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o; + size_type ones_before_j = m_tree_rank(offset + j) - ones_before_o; + size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; + if (c & mask) { // search for a one at this level + offset += (node_size - ones_before_end); + node_size = ones_before_end; + smaller += j-i-ones_before_j+ones_before_i; + i = ones_before_i; + j = ones_before_j; + } else { // search for a zero at this level + node_size -= ones_before_end; + greater += ones_before_j-ones_before_i; + i -= ones_before_i; + j -= ones_before_j; + } + offset += m_size; + mask >>= 1; + } + return std::tuple {i, smaller, greater}; + }; + //! range_search_2d searches points in the index interval [lb..rb] and value interval [vlb..vrb]. /*! \param lb Left bound of index interval (inclusive) * \param rb Right bound of index interval (inclusive) diff --git a/test/WtIntTest.cpp b/test/WtIntTest.cpp index 4c31144fd..cd8abc3fa 100644 --- a/test/WtIntTest.cpp +++ b/test/WtIntTest.cpp @@ -23,7 +23,7 @@ class WtIntTest : public ::testing::Test { }; using testing::Types; // TODO: * add test cases for range_search_2d -// * add test for lex_count,... +// * add test for lex_smaller_count,... typedef Types< wt_blcd, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> @@ -138,11 +138,86 @@ TYPED_TEST(WtIntTest, LoadAndInverseSelect) } } + TYPED_TEST(WtIntTest, DeleteTest) { sdsl::remove(temp_file); } +template +class WtIntLexOrdered : public ::testing::Test { }; +typedef Types< +wt_blcd, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> + ,wt_hutu, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> + ,wt_int<> + ,wt_int> + ,wt_int> + > Implementations_lex_ordered; + +TYPED_TEST_CASE(WtIntLexOrdered, Implementations_lex_ordered); + +//! Test the parametrized constructor +TYPED_TEST(WtIntLexOrdered, Constructor) +{ + TypeParam wt; + sdsl::construct(wt, test_file); + ASSERT_TRUE(store_to_file(wt, temp_file)); +} + +//! Test the load method and lex_count method +TYPED_TEST(WtIntLexOrdered, LoadAndLexCount) +{ + int_vector<> iv; + load_from_file(iv, test_file); + TypeParam wt; + ASSERT_TRUE(load_from_file(wt, temp_file)); + ASSERT_EQ(iv.size(), wt.size()); + std::mt19937_64 rng; + uint64_t min = UINT64_MAX, max = 0; + for (size_type j=0; j < iv.size(); ++j) { + if (min>iv[j]) min = iv[j]; + if (max symbol_distribution(min, max); + auto dice_symbol = bind(symbol_distribution, rng); + for (size_type k=1; k<4; ++k) { + std::uniform_int_distribution distribution(0, k*k*k*10); + auto dice = bind(distribution, rng); + for (size_type idx=0; idx < iv.size();) { + size_type i = idx, j = std::min(wt.size(),i+dice()); + size_type smaller_c1=0,greater_c1=0,smaller_c2=0,greater_c2=0; + int_vector<>::value_type c1=iv[i],c2=dice_symbol(); + for (; idxc1) ++greater_c1; + if (iv[idx]c2) ++greater_c2; + + } + auto res1 = wt.lex_count(i,j,c1); + ASSERT_EQ(wt.rank(i,c1),std::get<0>(res1)); + ASSERT_EQ(smaller_c1,std::get<1>(res1)); + ASSERT_EQ(greater_c1,std::get<2>(res1)); + + auto res2 = wt.lex_count(i,j,c2); + ASSERT_EQ(wt.rank(i,c2),std::get<0>(res2)); + ASSERT_EQ(smaller_c2,std::get<1>(res2)); + ASSERT_EQ(greater_c2,std::get<2>(res2)); + + auto res3 = wt.lex_count(i,j,max+1); + ASSERT_EQ(0,std::get<0>(res3)); + ASSERT_EQ(j-i,std::get<1>(res3)); + ASSERT_EQ(0,std::get<2>(res3)); + } + } +} + +TYPED_TEST(WtIntLexOrdered, DeleteTest) +{ + sdsl::remove(temp_file); +} + + template class WtIntTopK : public ::testing::Test { }; typedef Types< wt_int> , wt_int<> , wt_int> > Implementations_ordered; From 9e15fd84a8dd54b7c7fecae42af5064aa5225a9e Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Mon, 23 Sep 2013 14:13:50 +0200 Subject: [PATCH 08/15] Added lex_smaller_count to wt_int.hpp and fixed bug in rank and lex_count --- include/sdsl/wt_int.hpp | 47 +++++++++++++++++++++++++++++++++++++++++ test/WtIntTest.cpp | 34 ++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp index cf0dddedb..8ad644d39 100644 --- a/include/sdsl/wt_int.hpp +++ b/include/sdsl/wt_int.hpp @@ -295,11 +295,16 @@ class wt_int * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in[0..size()]\f$. * \param c The symbol to count the occurrences in the prefix. * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the supported vector. + * \par Precondition + * \f$ i \leq n \f$ * \par Time complexity * \f$ \Order{\log |\Sigma|} \f$ */ size_type rank(size_type i, value_type c)const { assert(i <= size()); + if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt + return 0; + } size_type offset = 0; uint64_t mask = (1ULL) << (m_max_depth-1); size_type node_size = m_size; @@ -400,6 +405,9 @@ class wt_int template> t_ret_type lex_count(size_type i, size_type j, value_type c)const { assert(i <= j and j <= size()); + if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt + return std::tuple {0,j-i,0}; + } size_type offset = 0; size_type smaller = 0; size_type greater = 0; @@ -428,6 +436,45 @@ class wt_int return std::tuple {i, smaller, greater}; }; + //! How many symbols are lexicographic smaller than c in [0..i-1]. + /*! + * \param i Exclusive right bound of the range (\f$i\in[0..size()]\f$). + * \param c Symbol c. + * \return A tuple containing: + * * rank(c,i) + * * #symbols smaller than c in [0..i-1] + * \par Precondition + * \f$ i \leq n \f$ + */ + template> + t_ret_type lex_smaller_count(size_type i, value_type c) const { + assert(i <= size()); + if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt + return std::tuple {0,i}; + } + size_type offset = 0; + size_type result = 0; + uint64_t mask = (1ULL) << (m_max_depth-1); + size_type node_size = m_size; + for (uint32_t k=0; k < m_max_depth and i; ++k) { + size_type ones_before_o = m_tree_rank(offset); + size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o; + size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; + if (c & mask) { // search for a one at this level + offset += (node_size - ones_before_end); + node_size = ones_before_end; + result += i - ones_before_i; + i = ones_before_i; + } else { // search for a zero at this level + node_size = (node_size - ones_before_end); + i -= ones_before_i; + } + offset += m_size; + mask >>= 1; + } + return std::tuple {i, result}; + } + //! range_search_2d searches points in the index interval [lb..rb] and value interval [vlb..vrb]. /*! \param lb Left bound of index interval (inclusive) * \param rb Right bound of index interval (inclusive) diff --git a/test/WtIntTest.cpp b/test/WtIntTest.cpp index cd8abc3fa..c9a2ef7f2 100644 --- a/test/WtIntTest.cpp +++ b/test/WtIntTest.cpp @@ -204,7 +204,7 @@ TYPED_TEST(WtIntLexOrdered, LoadAndLexCount) ASSERT_EQ(smaller_c2,std::get<1>(res2)); ASSERT_EQ(greater_c2,std::get<2>(res2)); - auto res3 = wt.lex_count(i,j,max+1); + auto res3 = wt.lex_count(i,j,max+1+dice_symbol()); ASSERT_EQ(0,std::get<0>(res3)); ASSERT_EQ(j-i,std::get<1>(res3)); ASSERT_EQ(0,std::get<2>(res3)); @@ -212,6 +212,38 @@ TYPED_TEST(WtIntLexOrdered, LoadAndLexCount) } } +//! Test the load method and lex_smaller_count method +TYPED_TEST(WtIntLexOrdered, LoadAndLexSmallerCount) +{ + int_vector<> iv; + load_from_file(iv, test_file); + TypeParam wt; + ASSERT_TRUE(load_from_file(wt, temp_file)); + ASSERT_EQ(iv.size(), wt.size()); + std::mt19937_64 rng; + uint64_t min = UINT64_MAX, max = 0; + for (size_type j=0; j < iv.size(); ++j) { + if (min>iv[j]) min = iv[j]; + if (max symbol_distribution(min, max); + auto dice_symbol = bind(symbol_distribution, rng); + int_vector<> chars(3); + for (size_type idx=0; idx < iv.size(); ++idx) { + chars[0] = iv[idx]; + chars[1] = dice_symbol(); + chars[2] = max+1+dice_symbol(); + + for (uint64_t i = 0; i(exp)-std::get<1>(exp),std::get<0>(res)); + ASSERT_EQ(std::get<1>(exp),std::get<1>(res)); + } + } +} + + TYPED_TEST(WtIntLexOrdered, DeleteTest) { sdsl::remove(temp_file); From 5c71e6e0610573cbe24426bd132bb9af84849890 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Wed, 25 Sep 2013 13:56:12 +0200 Subject: [PATCH 09/15] Added interval_symbols to wt_int.hpp --- include/sdsl/wt_int.hpp | 78 +++++++++++++++++++++++++++++++++++++++++ include/sdsl/wt_pc.hpp | 5 ++- test/WtIntTest.cpp | 78 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 156 insertions(+), 5 deletions(-) diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp index 8ad644d39..18b9de358 100644 --- a/include/sdsl/wt_int.hpp +++ b/include/sdsl/wt_int.hpp @@ -72,6 +72,7 @@ class wt_int typedef t_select_zero select_0_type; typedef wt_tag index_category; typedef int_alphabet_tag alphabet_category; + enum {lex_ordered=1}; typedef std::pair point_type; typedef std::vector point_vec_type; @@ -111,6 +112,48 @@ class wt_int m_path_rank_off = int_vector<64>(max_depth+1); } + // recursive internal version of the method interval_symbols + void _interval_symbols(size_type i, size_type j, size_type& k, + std::vector& cs, + std::vector& rank_c_i, + std::vector& rank_c_j, + size_type depth, + size_type path, + size_type node_size, + size_type offset) const { + // invariant: j>i + + if (depth >= m_max_depth) { + rank_c_i[k]= i; + rank_c_j[k]= j; + cs[k++]= path; + return; + } + + size_type ones_before_o = m_tree_rank(offset); + size_type ones_before_i = m_tree_rank(offset+i) -ones_before_o; + size_type ones_before_j = m_tree_rank(offset+j) -ones_before_o; + size_type ones_before_end = m_tree_rank(offset+ node_size)-ones_before_o; + + // goto left child + if ((j-i)-(ones_before_j-ones_before_i)>0) { + size_type new_offset = offset + m_size; + size_type new_node_size = node_size- ones_before_end; + size_type new_i = i - ones_before_i; + size_type new_j = j - ones_before_j; + _interval_symbols(new_i, new_j, k, cs,rank_c_i, rank_c_j, depth+1, path<<1, new_node_size,new_offset); + } + + // goto right child + if ((ones_before_j-ones_before_i)> 0) { + size_type new_offset = offset+(node_size - ones_before_end)+m_size; + size_type new_node_size = ones_before_end; + size_type new_i = ones_before_i; + size_type new_j = ones_before_j; + _interval_symbols(new_i, new_j, k, cs,rank_c_i, rank_c_j, depth+1,(path<<1)|1, new_node_size, new_offset); + } + } + public: const size_type& sigma = m_sigma; //!< Effective alphabet size of the wavelet tree. @@ -389,6 +432,41 @@ class wt_int }; + //! For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c). + /*! + * \param i The start index (inclusive) of the interval. + * \param j The end index (exclusive) of the interval. + * \param k Reference for number of different symbols in [i..j-1]. + * \param cs Reference to a vector that will contain in + * cs[0..k-1] all symbols that occur in [i..j-1] in + * ascending order. + * \param rank_c_i Reference to a vector which equals + * rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$. + * \param rank_c_j Reference to a vector which equals + * rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$. + * \par Time complexity + * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ + * + * \par Precondition + * \f$ i \leq j \leq n \f$ + * \f$ cs.size() \geq \sigma \f$ + * \f$ rank_{c_i}.size() \geq \sigma \f$ + * \f$ rank_{c_j}.size() \geq \sigma \f$ + */ + void interval_symbols(size_type i, size_type j, size_type& k, + std::vector& cs, + std::vector& rank_c_i, + std::vector& rank_c_j) const { + assert(i <= j and j <= size()); + k=0; + if (i==j) { + return; + } + + _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, 0, m_size, 0); + + } + //! How many symbols are lexicographic smaller/greater than c in [i..j-1]. /*! * \param i Start index (inclusive) of the interval. diff --git a/include/sdsl/wt_pc.hpp b/include/sdsl/wt_pc.hpp index 86b9080dc..46e13e6c2 100644 --- a/include/sdsl/wt_pc.hpp +++ b/include/sdsl/wt_pc.hpp @@ -148,7 +148,6 @@ class wt_pc std::vector& rank_c_i, std::vector& rank_c_j, node_type v) const { // invariant: j>i - // goto right child size_type i_new = (m_bv_rank(m_tree.bv_pos(v) + i) - m_tree.bv_pos_rank(v)); size_type j_new = (m_bv_rank(m_tree.bv_pos(v) + j) @@ -418,8 +417,8 @@ class wt_pc * \param k Reference for number of different symbols in [i..j-1]. * \param cs Reference to a vector that will contain in * cs[0..k-1] all symbols that occur in [i..j-1] in - * arbitrary order (for Huffman shape) and ascending - * order (for Hu-Tucker shape). + * arbitrary order (if lex_ordered = false) and ascending + * order (if lex_ordered = true). * \param rank_c_i Reference to a vector which equals * rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$. * \param rank_c_j Reference to a vector which equals diff --git a/test/WtIntTest.cpp b/test/WtIntTest.cpp index c9a2ef7f2..fed97336d 100644 --- a/test/WtIntTest.cpp +++ b/test/WtIntTest.cpp @@ -23,7 +23,6 @@ class WtIntTest : public ::testing::Test { }; using testing::Types; // TODO: * add test cases for range_search_2d -// * add test for lex_smaller_count,... typedef Types< wt_blcd, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> @@ -139,11 +138,86 @@ TYPED_TEST(WtIntTest, LoadAndInverseSelect) } + TYPED_TEST(WtIntTest, DeleteTest) { sdsl::remove(temp_file); } +template +class WtIntervalTest : public ::testing::Test { }; + +typedef Types< +wt_blcd, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> + ,wt_huff, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> + ,wt_huff, rrr_vector<63>::rank_1_type, rrr_vector<63>::select_1_type, rrr_vector<63>::select_0_type, int_tree<>> + ,wt_hutu, select_support_mcl<1>, select_support_mcl<0>, int_tree<>> + ,wt_int> + ,wt_int<> + ,wt_int> + > Implementations_interval; + +TYPED_TEST_CASE(WtIntervalTest, Implementations_interval); + +//! Test the parametrized constructor +TYPED_TEST(WtIntervalTest, Constructor) +{ + TypeParam wt; + sdsl::construct(wt, test_file); + ASSERT_TRUE(store_to_file(wt, temp_file)); +} + +//! Test the load method and interval_symbols method +TYPED_TEST(WtIntervalTest, LoadAndIntervalSymbols) +{ + int_vector<> iv; + load_from_file(iv, test_file); + TypeParam wt; + ASSERT_TRUE(load_from_file(wt, temp_file)); + ASSERT_EQ(iv.size(), wt.size()); + + size_type k = 0; + std::vector rank_c_i(wt.sigma); + std::vector rank_c_j(wt.sigma); + std::vector::value_type> cs(wt.sigma); + + std::mt19937_64 rng; + for (size_type n=1; n<4; ++n) { + std::uniform_int_distribution distribution(0, n*n*n*10); + auto dice = bind(distribution, rng); + for (size_type i=0,j=0; i < iv.size(); i=j) { + j = std::min(wt.size(),i+dice()); + + wt.interval_symbols(i, j, k, cs, rank_c_i, rank_c_j); + + size_type symbols = (j-i); + for (size_type m = 0; m0 and TypeParam::lex_ordered) { + ASSERT_LT(cs[m-1],cs[m]); + } + } + + ASSERT_EQ(0,symbols); + if (!TypeParam::lex_ordered) { + sort(cs.begin(),cs.begin()+k); + for (size_type m=1; m class WtIntLexOrdered : public ::testing::Test { }; typedef Types< @@ -252,7 +326,7 @@ TYPED_TEST(WtIntLexOrdered, DeleteTest) template class WtIntTopK : public ::testing::Test { }; -typedef Types< wt_int> , wt_int<> , wt_int> > Implementations_ordered; +typedef Types> , wt_int<> , wt_int> > Implementations_ordered; TYPED_TEST_CASE(WtIntTopK, Implementations_ordered); From 828921a4a9a7a656032b1254fc5d97c44a95b12a Mon Sep 17 00:00:00 2001 From: Timo Beller Date: Wed, 25 Sep 2013 15:11:10 +0200 Subject: [PATCH 10/15] Code formatting and preconditions --- include/sdsl/wt_int.hpp | 73 ++++++++++++++++++++++------------------- include/sdsl/wt_pc.hpp | 47 +++++++++++++++----------- 2 files changed, 67 insertions(+), 53 deletions(-) diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp index 18b9de358..e093bf2ab 100644 --- a/include/sdsl/wt_int.hpp +++ b/include/sdsl/wt_int.hpp @@ -131,26 +131,26 @@ class wt_int } size_type ones_before_o = m_tree_rank(offset); - size_type ones_before_i = m_tree_rank(offset+i) -ones_before_o; - size_type ones_before_j = m_tree_rank(offset+j) -ones_before_o; - size_type ones_before_end = m_tree_rank(offset+ node_size)-ones_before_o; + size_type ones_before_i = m_tree_rank(offset+i) - ones_before_o; + size_type ones_before_j = m_tree_rank(offset+j) - ones_before_o; + size_type ones_before_end = m_tree_rank(offset+ node_size) - ones_before_o; // goto left child if ((j-i)-(ones_before_j-ones_before_i)>0) { size_type new_offset = offset + m_size; - size_type new_node_size = node_size- ones_before_end; + size_type new_node_size = node_size - ones_before_end; size_type new_i = i - ones_before_i; size_type new_j = j - ones_before_j; - _interval_symbols(new_i, new_j, k, cs,rank_c_i, rank_c_j, depth+1, path<<1, new_node_size,new_offset); + _interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, depth+1, path<<1, new_node_size, new_offset); } // goto right child - if ((ones_before_j-ones_before_i)> 0) { - size_type new_offset = offset+(node_size - ones_before_end)+m_size; + if ((ones_before_j-ones_before_i)>0) { + size_type new_offset = offset+(node_size - ones_before_end) + m_size; size_type new_node_size = ones_before_end; size_type new_i = ones_before_i; size_type new_j = ones_before_j; - _interval_symbols(new_i, new_j, k, cs,rank_c_i, rank_c_j, depth+1,(path<<1)|1, new_node_size, new_offset); + _interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, depth+1, (path<<1)|1, new_node_size, new_offset); } } @@ -306,8 +306,10 @@ class wt_int } //! Recovers the i-th symbol of the original vector. - /*! \param i The index of the symbol in the original vector. \f$i \in [0..size()-1]\f$ + /*! \param i The index of the symbol in the original vector. * \returns The i-th symbol of the original vector. + * \par Precondition + * \f$ i < size() \f$ */ value_type operator[](size_type i)const { assert(i < size()); @@ -338,10 +340,10 @@ class wt_int * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in[0..size()]\f$. * \param c The symbol to count the occurrences in the prefix. * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the supported vector. - * \par Precondition - * \f$ i \leq n \f$ * \par Time complexity - * \f$ \Order{\log |\Sigma|} \f$ + * \f$ \Order{\log |\Sigma|} \f$ + * \par Precondition + * \f$ i \leq size() \f$ */ size_type rank(size_type i, value_type c)const { assert(i <= size()); @@ -349,7 +351,7 @@ class wt_int return 0; } size_type offset = 0; - uint64_t mask = (1ULL) << (m_max_depth-1); + uint64_t mask = (1ULL) << (m_max_depth-1); size_type node_size = m_size; for (uint32_t k=0; k < m_max_depth and i; ++k) { size_type ones_before_o = m_tree_rank(offset); @@ -375,6 +377,8 @@ class wt_int /*! * \param i The index of the symbol. * \return Pair (rank(wt[i],i),wt[i]) + * \par Precondition + * \f$ i < size() \f$ */ std::pair inverse_select(size_type i)const { @@ -385,17 +389,18 @@ class wt_int //! Calculates the i-th occurrence of the symbol c in the supported vector. /*! - * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$. + * \param i The i-th occurrence. * \param c The symbol c. * \par Time complexity - * \f$ \Order{\log |\Sigma|} \f$ + * \f$ \Order{\log |\Sigma|} \f$ + * \par Precondition + * \f$ 1 \leq i \leq rank(size(), c) \f$ */ size_type select(size_type i, value_type c)const { - assert(i > 0); - assert(i <= rank(size(), c)); + assert(1 <= i and i <= rank(size(), c)); // possible optimization: if the array is a permutation we can start at the bottom of the tree size_type offset = 0; - uint64_t mask = (1ULL) << (m_max_depth-1); + uint64_t mask = (1ULL) << (m_max_depth-1); size_type node_size = m_size; m_path_off[0] = m_path_rank_off[0] = 0; @@ -444,11 +449,11 @@ class wt_int * rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$. * \param rank_c_j Reference to a vector which equals * rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$. - * \par Time complexity - * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ + * \par Time complexity + * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ * * \par Precondition - * \f$ i \leq j \leq n \f$ + * \f$ i \leq j \leq size() \f$ * \f$ cs.size() \geq \sigma \f$ * \f$ rank_{c_i}.size() \geq \sigma \f$ * \f$ rank_{c_j}.size() \geq \sigma \f$ @@ -478,15 +483,15 @@ class wt_int * * #symbols greater than c in [i..j-1] * * \par Precondition - * \f$ i \leq j \leq n \f$ + * \f$ i \leq j \leq size() \f$ */ template> t_ret_type lex_count(size_type i, size_type j, value_type c)const { assert(i <= j and j <= size()); if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt - return std::tuple {0,j-i,0}; + return t_ret_type {0, j-i, 0}; } - size_type offset = 0; + size_type offset = 0; size_type smaller = 0; size_type greater = 0; uint64_t mask = (1ULL) << (m_max_depth-1); @@ -511,46 +516,46 @@ class wt_int offset += m_size; mask >>= 1; } - return std::tuple {i, smaller, greater}; + return t_ret_type {i, smaller, greater}; }; //! How many symbols are lexicographic smaller than c in [0..i-1]. /*! - * \param i Exclusive right bound of the range (\f$i\in[0..size()]\f$). + * \param i Exclusive right bound of the range. * \param c Symbol c. * \return A tuple containing: * * rank(c,i) * * #symbols smaller than c in [0..i-1] * \par Precondition - * \f$ i \leq n \f$ + * \f$ i \leq size() \f$ */ template> t_ret_type lex_smaller_count(size_type i, value_type c) const { assert(i <= size()); if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt - return std::tuple {0,i}; + return t_ret_type {0, i}; } size_type offset = 0; size_type result = 0; - uint64_t mask = (1ULL) << (m_max_depth-1); + uint64_t mask = (1ULL) << (m_max_depth-1); size_type node_size = m_size; for (uint32_t k=0; k < m_max_depth and i; ++k) { size_type ones_before_o = m_tree_rank(offset); size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o; size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; if (c & mask) { // search for a one at this level - offset += (node_size - ones_before_end); + offset += (node_size - ones_before_end); node_size = ones_before_end; - result += i - ones_before_i; - i = ones_before_i; + result += i - ones_before_i; + i = ones_before_i; } else { // search for a zero at this level node_size = (node_size - ones_before_end); - i -= ones_before_i; + i -= ones_before_i; } offset += m_size; mask >>= 1; } - return std::tuple {i, result}; + return t_ret_type {i, result}; } //! range_search_2d searches points in the index interval [lb..rb] and value interval [vlb..vrb]. diff --git a/include/sdsl/wt_pc.hpp b/include/sdsl/wt_pc.hpp index 46e13e6c2..951f9c8cc 100644 --- a/include/sdsl/wt_pc.hpp +++ b/include/sdsl/wt_pc.hpp @@ -283,11 +283,14 @@ class wt_pc //! Recovers the i-th symbol of the original vector. /*! - * \param i Index in the original vector. \f$i \in [0..size()-1]\f$. + * \param i Index in the original vector. * \return The i-th symbol of the original vector. * \par Time complexity * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the * zero order entropy of the sequence + * + * \par Precondition + * \f$ i < size() \f$ */ value_type operator[](size_type i)const { assert(i < size()); @@ -309,19 +312,20 @@ class wt_pc return m_tree.bv_pos_rank(v); }; - //! Calculates how many symbols c are in the prefix [0..min(i,size())-1]. + //! Calculates how many symbols c are in the prefix [0..i-1]. /*! * \param i Exclusive right bound of the range. * \param c Symbol c. - * \return Number of occurrences of symbol c in the prefix [0..min(i,size())-1]. + * \return Number of occurrences of symbol c in the prefix [0..i-1]. * \par Time complexity - * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the - * zero order entropy of the sequence + * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the + * zero order entropy of the sequence + * + * \par Precondition + * \f$ i \leq size() \f$ */ size_type rank(size_type i, value_type c)const { - if (i>size()) { - i = size(); - } + assert(i <= size()); if (!m_tree.is_valid(m_tree.c_to_leaf(c))) { return 0; // if `c` was not in the text } @@ -349,8 +353,11 @@ class wt_pc /*! * \param i The index of the symbol. * \return Pair (rank(wt[i],i),wt[i]) - * \par Time complexity - * \f$ \Order{H_0} \f$ + * \par Time complexity + * \f$ \Order{H_0} \f$ + * + * \par Precondition + * \f$ i < size() \f$ */ std::pair inverse_select(size_type i)const { @@ -373,15 +380,17 @@ class wt_pc //! Calculates the ith occurrence of the symbol c in the supported vector. /*! - * \param i The ith occurrence. \f$i\in [1..rank(size(),c)]\f$. + * \param i The ith occurrence. * \param c The symbol c. * \par Time complexity * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the zero order * entropy of the sequence + * + * \par Precondition + * \f$ 1 \leq i \leq rank(size(), c) \f$ */ size_type select(size_type i, value_type c)const { - assert(i > 0); - assert(i <= rank(size(), c)); + assert(1 <= i and i <= rank(size(), c)); node_type v = m_tree.c_to_leaf(c); if (!m_tree.is_valid(v)) { // if c was not in the text return m_size; // -> return a position right to the end @@ -423,11 +432,11 @@ class wt_pc * rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$. * \param rank_c_j Reference to a vector which equals * rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$. - * \par Time complexity - * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ + * \par Time complexity + * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ * * \par Precondition - * \f$ i \leq j \leq n \f$ + * \f$ i \leq j \leq size() \f$ * \f$ cs.size() \geq \sigma \f$ * \f$ rank_{c_i}.size() \geq \sigma \f$ * \f$ rank_{c_j}.size() \geq \sigma \f$ @@ -485,7 +494,7 @@ class wt_pc * * #symbols greater than c in [i..j-1] * * \par Precondition - * \f$ i \leq j \leq n \f$ + * \f$ i \leq j \leq size() \f$ * \note * This method is only available if lex_ordered = true */ @@ -540,13 +549,13 @@ class wt_pc //! How many symbols are lexicographic smaller than c in [0..i-1]. /*! - * \param i Exclusive right bound of the range (\f$i\in[0..size()]\f$). + * \param i Exclusive right bound of the range. * \param c Symbol c. * \return A tuple containing: * * rank(c,i) * * #symbols smaller than c in [0..i-1] * \par Precondition - * \f$ i \leq n \f$ + * \f$ i \leq size() \f$ * \note * This method is only available if lex_ordered = true */ From b6519a9ede953883e1cd00878fe66ac0449b0a47 Mon Sep 17 00:00:00 2001 From: Timo Beller Date: Thu, 26 Sep 2013 08:47:58 +0200 Subject: [PATCH 11/15] Fixed compile error --- test/WtIntTest.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/WtIntTest.cpp b/test/WtIntTest.cpp index fed97336d..03c90b419 100644 --- a/test/WtIntTest.cpp +++ b/test/WtIntTest.cpp @@ -185,27 +185,27 @@ TYPED_TEST(WtIntervalTest, LoadAndIntervalSymbols) for (size_type n=1; n<4; ++n) { std::uniform_int_distribution distribution(0, n*n*n*10); auto dice = bind(distribution, rng); - for (size_type i=0,j=0; i < iv.size(); i=j) { + for (size_type i=0, j=0; i < iv.size(); i=j) { j = std::min(wt.size(),i+dice()); wt.interval_symbols(i, j, k, cs, rank_c_i, rank_c_j); size_type symbols = (j-i); for (size_type m = 0; m0 and TypeParam::lex_ordered) { ASSERT_LT(cs[m-1],cs[m]); } } - ASSERT_EQ(0,symbols); + ASSERT_EQ(0ULL, symbols); if (!TypeParam::lex_ordered) { - sort(cs.begin(),cs.begin()+k); + sort(cs.begin(), cs.begin()+k); for (size_type m=1; m::max(), max = 0; for (size_type j=0; j < iv.size(); ++j) { if (min>iv[j]) min = iv[j]; if (max::max(), max = 0; for (size_type j=0; j < iv.size(); ++j) { if (min>iv[j]) min = iv[j]; if (max Date: Thu, 26 Sep 2013 09:04:51 +0200 Subject: [PATCH 12/15] Fixed compile error --- test/WtIntTest.cpp | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/test/WtIntTest.cpp b/test/WtIntTest.cpp index 03c90b419..50bbd82ee 100644 --- a/test/WtIntTest.cpp +++ b/test/WtIntTest.cpp @@ -259,8 +259,8 @@ TYPED_TEST(WtIntLexOrdered, LoadAndLexCount) auto dice = bind(distribution, rng); for (size_type idx=0; idx < iv.size();) { size_type i = idx, j = std::min(wt.size(),i+dice()); - size_type smaller_c1=0,greater_c1=0,smaller_c2=0,greater_c2=0; - int_vector<>::value_type c1=iv[i],c2=dice_symbol(); + size_type smaller_c1=0, greater_c1=0, smaller_c2=0, greater_c2=0; + int_vector<>::value_type c1=iv[i], c2=dice_symbol(); for (; idxc1) ++greater_c1; @@ -268,20 +268,20 @@ TYPED_TEST(WtIntLexOrdered, LoadAndLexCount) if (iv[idx]>c2) ++greater_c2; } - auto res1 = wt.lex_count(i,j,c1); - ASSERT_EQ(wt.rank(i,c1),std::get<0>(res1)); - ASSERT_EQ(smaller_c1,std::get<1>(res1)); - ASSERT_EQ(greater_c1,std::get<2>(res1)); - - auto res2 = wt.lex_count(i,j,c2); - ASSERT_EQ(wt.rank(i,c2),std::get<0>(res2)); - ASSERT_EQ(smaller_c2,std::get<1>(res2)); - ASSERT_EQ(greater_c2,std::get<2>(res2)); - - auto res3 = wt.lex_count(i,j,max+1+dice_symbol()); - ASSERT_EQ(0,std::get<0>(res3)); - ASSERT_EQ(j-i,std::get<1>(res3)); - ASSERT_EQ(0,std::get<2>(res3)); + auto res1 = wt.lex_count(i, j, c1); + ASSERT_EQ(wt.rank(i, c1), std::get<0>(res1)); + ASSERT_EQ(smaller_c1, std::get<1>(res1)); + ASSERT_EQ(greater_c1, std::get<2>(res1)); + + auto res2 = wt.lex_count(i, j, c2); + ASSERT_EQ(wt.rank(i, c2), std::get<0>(res2)); + ASSERT_EQ(smaller_c2, std::get<1>(res2)); + ASSERT_EQ(greater_c2, std::get<2>(res2)); + + auto res3 = wt.lex_count(i, j, max+1+dice_symbol()); + ASSERT_EQ(0ULL, std::get<0>(res3)); + ASSERT_EQ(j-i, std::get<1>(res3)); + ASSERT_EQ(0ULL, std::get<2>(res3)); } } } @@ -309,10 +309,10 @@ TYPED_TEST(WtIntLexOrdered, LoadAndLexSmallerCount) chars[2] = max+1+dice_symbol(); for (uint64_t i = 0; i(exp)-std::get<1>(exp),std::get<0>(res)); - ASSERT_EQ(std::get<1>(exp),std::get<1>(res)); + auto exp = wt.lex_count(0, idx, chars[i]); + auto res = wt.lex_smaller_count(idx, chars[i]); + ASSERT_EQ(idx-std::get<2>(exp)-std::get<1>(exp), std::get<0>(res)); + ASSERT_EQ(std::get<1>(exp), std::get<1>(res)); } } } From 2ddc8ba23e707bcfbde16ad1a190b875b96b3cb7 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Thu, 26 Sep 2013 10:28:14 +0200 Subject: [PATCH 13/15] Improved inverse_select and interval_symbols --- include/sdsl/wt_int.hpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp index e093bf2ab..a3f85be74 100644 --- a/include/sdsl/wt_int.hpp +++ b/include/sdsl/wt_int.hpp @@ -383,8 +383,26 @@ class wt_int std::pair inverse_select(size_type i)const { assert(i < size()); - value_type c = (*this)[i]; - return std::make_pair(rank(i, c),c); + + value_type c = 0; + size_type node_size = m_size, offset = 0; + for (uint32_t k=0; k < m_max_depth; ++k) { + size_type ones_before_o = m_tree_rank(offset); + size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o; + size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o; + c<<=1; + if (m_tree[offset+i]) { // go to the right child + offset += (node_size - ones_before_end); + node_size = ones_before_end; + i = ones_before_i; + c|=1; + } else { // go to the left child + node_size = (node_size - ones_before_end); + i = (i-ones_before_i); + } + offset += m_size; + } + return std::make_pair(i,c); } //! Calculates the i-th occurrence of the symbol c in the supported vector. @@ -467,6 +485,14 @@ class wt_int if (i==j) { return; } + if ((i+1)==j) { + auto res = inverse_select(i); + cs[0]=res.second; + rank_c_i[0]=res.first; + rank_c_j[0]=res.first+1; + k=1; + return; + } _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, 0, m_size, 0); From 448e7c6595d836b45b0958186d5ed466d74213d6 Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Thu, 26 Sep 2013 11:19:29 +0200 Subject: [PATCH 14/15] Simplified interval_symbols_test --- test/WtByteTest.cpp | 75 +++++++++++++++++---------------------------- test/WtIntTest.cpp | 5 +-- 2 files changed, 29 insertions(+), 51 deletions(-) diff --git a/test/WtByteTest.cpp b/test/WtByteTest.cpp index d723de880..154cb5a69 100644 --- a/test/WtByteTest.cpp +++ b/test/WtByteTest.cpp @@ -183,58 +183,39 @@ void test_interval_symbols(t_T& wt) ASSERT_EQ(true, load_from_file(wt, temp_file)); int_vector<8> text; ASSERT_EQ(true, load_vector_from_file(text, test_file, 1)); - if (wt.size()) { - std::mt19937_64 rng; - std::uniform_int_distribution distribution(0, wt.size()); - auto dice = bind(distribution, rng); - for (size_type t=0; t<10000; ++t) { - size_type l = dice(); - size_type r = dice(); - if (r distribution(0, wt.size()); + auto dice = bind(distribution, rng); + size_type k; + std::vector cs(wt.sigma); + std::vector rank_c_i(wt.sigma); + std::vector rank_c_j(wt.sigma); + for (size_type t=0; t<(wt.size()/100+100); ++t) { + size_type i = dice(), j = dice(); + if (i0 and t_T::lex_ordered) { + ASSERT_LT(cs[m-1],cs[m]); } - size_type k; - std::vector cs(wt.sigma); - std::vector rank_c_i(wt.sigma); - std::vector rank_c_j(wt.sigma); - wt.interval_symbols(l, r, k, cs, rank_c_i, rank_c_j); - - size_type k_n = 0; - std::vector rank_c_i_n(256,0); - std::vector rank_c_j_n(256,0); - - std::vector cs_n(wt.sigma); - size_type cnt = 0; + } - for (size_type j=0; j<256; ++j) { - size_type tmp_j = wt.rank(r,(value_type)j); - size_type tmp_i = wt.rank(l,(value_type)j); - if (tmp_j-tmp_i>0) { - rank_c_j_n[j] = tmp_j; - rank_c_i_n[j] = tmp_i; - ++k_n; - if (t_T::lex_ordered) { - cs_n[cnt++] = j; - } - } - } - ASSERT_EQ(k_n, k); - std::vector rank_c_i_wt(256,0); - std::vector rank_c_j_wt(256,0); - for (size_type j=0; j iv; - load_from_file(iv, test_file); TypeParam wt; ASSERT_TRUE(load_from_file(wt, temp_file)); - ASSERT_EQ(iv.size(), wt.size()); size_type k = 0; std::vector rank_c_i(wt.sigma); @@ -185,7 +182,7 @@ TYPED_TEST(WtIntervalTest, LoadAndIntervalSymbols) for (size_type n=1; n<4; ++n) { std::uniform_int_distribution distribution(0, n*n*n*10); auto dice = bind(distribution, rng); - for (size_type i=0, j=0; i < iv.size(); i=j) { + for (size_type i=0, j=0; i < wt.size(); i=j) { j = std::min(wt.size(),i+dice()); wt.interval_symbols(i, j, k, cs, rank_c_i, rank_c_j); From f8db199a39fe4a37317674d25c078e1c20d299cc Mon Sep 17 00:00:00 2001 From: Alexander Diehm Date: Thu, 26 Sep 2013 12:26:24 +0200 Subject: [PATCH 15/15] Fixed casts --- test/WtByteTest.cpp | 4 ++-- test/WtIntTest.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/WtByteTest.cpp b/test/WtByteTest.cpp index 154cb5a69..9613cc26f 100644 --- a/test/WtByteTest.cpp +++ b/test/WtByteTest.cpp @@ -201,14 +201,14 @@ void test_interval_symbols(t_T& wt) for (size_type m = 0; m0 and t_T::lex_ordered) { ASSERT_LT(cs[m-1],cs[m]); } } - ASSERT_EQ(0ULL, symbols); + ASSERT_EQ((size_type)0, symbols); if (!t_T::lex_ordered) { sort(cs.begin(), cs.begin()+k); for (size_type m=1; m0 and TypeParam::lex_ordered) { ASSERT_LT(cs[m-1],cs[m]); } } - ASSERT_EQ(0ULL, symbols); + ASSERT_EQ((size_type)0, symbols); if (!TypeParam::lex_ordered) { sort(cs.begin(), cs.begin()+k); for (size_type m=1; m(res2)); auto res3 = wt.lex_count(i, j, max+1+dice_symbol()); - ASSERT_EQ(0ULL, std::get<0>(res3)); + ASSERT_EQ((size_type)0, std::get<0>(res3)); ASSERT_EQ(j-i, std::get<1>(res3)); - ASSERT_EQ(0ULL, std::get<2>(res3)); + ASSERT_EQ((size_type)0, std::get<2>(res3)); } } }