diff --git a/include/seqan3/search/fm_index/bi_fm_index_iterator.hpp b/include/seqan3/search/fm_index/bi_fm_index_iterator.hpp index 3e0ff60c164..149e3ecb2a1 100644 --- a/include/seqan3/search/fm_index/bi_fm_index_iterator.hpp +++ b/include/seqan3/search/fm_index/bi_fm_index_iterator.hpp @@ -162,26 +162,24 @@ class bi_fm_index_iterator //!\brief Optimized bidirectional search without alphabet mapping template - bool bidirectional_search(csa_t const & csa, - size_type const l_fwd, size_type const r_fwd, - size_type const l_bwd, size_type const r_bwd, - sdsl_char_type const c, - size_type & l_fwd_res, size_type & r_fwd_res, - size_type & l_bwd_res, size_type & r_bwd_res) const noexcept + bool bidirectional_search(csa_t const & csa, sdsl_char_type const c, + size_type & l_fwd, size_type & r_fwd, + size_type & l_bwd, size_type & r_bwd) const noexcept { assert(l_fwd <= r_fwd && r_fwd < csa.size()); assert(r_fwd + 1 >= l_fwd && r_bwd + 1 - l_bwd == r_fwd + 1 - l_fwd); + size_type _l_fwd, _r_fwd, _l_bwd, _r_bwd; if constexpr(std::Same) { size_type const c_begin = csa.C[c]; if (r_fwd + 1 - l_fwd == csa.size()) // [[unlikely]] { - l_fwd_res = c_begin; - l_bwd_res = c_begin; - r_fwd_res = csa.C[c + 1] - 1; - r_bwd_res = r_fwd_res; + _l_fwd = c_begin; + _l_bwd = c_begin; + _r_fwd = csa.C[c + 1] - 1; + _r_bwd = _r_fwd; } else { @@ -189,27 +187,25 @@ class bi_fm_index_iterator size_type const rank_l = std::get<0>(r_s_b); size_type const s = std::get<1>(r_s_b), b = std::get<2>(r_s_b); size_type const rank_r = r_fwd - l_fwd - s - b + rank_l; - l_fwd_res = c_begin + rank_l; - r_fwd_res = c_begin + rank_r; - l_bwd_res = l_bwd + s; - r_bwd_res = r_bwd - b; + _l_fwd = c_begin + rank_l; + _r_fwd = c_begin + rank_r; + _l_bwd = l_bwd + s; + _r_bwd = r_bwd - b; } } else { size_type const cc = csa.char2comp[c]; if (cc == 0 && c > 0) // [[unlikely]] - { return false; - } size_type const c_begin = csa.C[cc]; if (r_fwd + 1 - l_fwd == csa.size()) // [[unlikely]] { - l_fwd_res = c_begin; - l_bwd_res = c_begin; - r_fwd_res = csa.C[cc + 1] - 1; - r_bwd_res = r_fwd_res; + l_fwd = c_begin; + l_bwd = c_begin; + r_fwd = csa.C[cc + 1] - 1; + r_bwd = r_fwd; return true; } else @@ -218,57 +214,60 @@ class bi_fm_index_iterator size_type const rank_l = std::get<0>(r_s_b); size_type const s = std::get<1>(r_s_b), b = std::get<2>(r_s_b); size_type const rank_r = r_fwd - l_fwd - s - b + rank_l; - l_fwd_res = c_begin + rank_l; - r_fwd_res = c_begin + rank_r; - l_bwd_res = l_bwd + s; - r_bwd_res = r_bwd - b; + _l_fwd = c_begin + rank_l; + _r_fwd = c_begin + rank_r; + _l_bwd = l_bwd + s; + _r_bwd = r_bwd - b; } } - assert(r_fwd_res + 1 >= l_fwd_res && r_bwd_res + 1 - l_bwd_res == r_fwd_res + 1 - l_fwd_res); - return r_fwd_res >= l_fwd_res; + assert(_r_fwd + 1 >= _l_fwd && _r_bwd + 1 - _l_bwd == _r_fwd + 1 - _l_fwd); + if (_r_fwd >= _l_fwd) + { + l_fwd = _l_fwd; + r_fwd = _r_fwd; + l_bwd = _l_bwd; + r_bwd = _r_bwd; + return true; + } + return false; } //!\brief Optimized bidirectional search for cycle_back() and cycle_front() without alphabet mapping template - bool bidirectional_search_cycle(csa_t const & csa, - size_type const l_fwd, size_type const r_fwd, - size_type const l_bwd, size_type const r_bwd, - sdsl_char_type const c, - size_type & l_fwd_res, size_type & r_fwd_res, - size_type & l_bwd_res, size_type & r_bwd_res) const noexcept + bool bidirectional_search_cycle(csa_t const & csa, sdsl_char_type const c, + size_type const l_parent, size_type const r_parent, + size_type & l_fwd, size_type & r_fwd, + size_type & l_bwd, size_type & r_bwd) const noexcept { - assert(l_fwd <= r_fwd && r_fwd < csa.size()); + assert(l_parent <= r_parent && r_parent < csa.size()); + size_type c_begin; if constexpr(std::Same) - { - size_type const c_begin = csa.C[c]; - auto const r_s_b = csa.wavelet_tree.lex_count(l_fwd, r_fwd + 1, c); - size_type const rank_l = std::get<0>(r_s_b); - size_type const s = std::get<1>(r_s_b), b = std::get<2>(r_s_b); - size_type const rank_r = r_fwd - l_fwd - s - b + rank_l; - l_fwd_res = c_begin + rank_l; - r_fwd_res = c_begin + rank_r; - l_bwd_res = r_bwd + 1; - r_bwd_res = r_bwd + 1 + rank_r - rank_l; // TODO: check maybe +1/-1 missing - - assert(r_fwd_res + 1 >= l_fwd_res && r_bwd_res + 1 - l_bwd_res == r_fwd_res + 1 - l_fwd_res); - return r_fwd_res >= l_fwd_res; // r_fwd_res + 1 - l_fwd_res - } + c_begin = csa.C[c]; // TODO: check whether this can be removed else + c_begin = csa.C[csa.char2comp[c]]; + + auto const r_s_b = csa.wavelet_tree.lex_count(l_parent, r_parent + 1, c); + size_type const s = std::get<1>(r_s_b), + b = std::get<2>(r_s_b), + rank_l = std::get<0>(r_s_b), + rank_r = r_parent - l_parent - s - b + rank_l; + + size_type const _l_fwd = c_begin + rank_l; + size_type const _r_fwd = c_begin + rank_r; + size_type const _l_bwd = r_bwd + 1; + size_type const _r_bwd = r_bwd + 1 + rank_r - rank_l; + + assert(_r_fwd + 1 >= _l_fwd && _r_bwd + 1 - _l_bwd == _r_fwd + 1 - _l_fwd); + if (_r_fwd >= _l_fwd) { - size_type const c_begin = csa.C[csa.char2comp[c]]; - auto const r_s_b = csa.wavelet_tree.lex_count(l_fwd, r_fwd + 1, c); - size_type const rank_l = std::get<0>(r_s_b); - size_type const s = std::get<1>(r_s_b), b = std::get<2>(r_s_b); - size_type const rank_r = r_fwd - l_fwd - s - b + rank_l; - l_fwd_res = c_begin + rank_l; - r_fwd_res = c_begin + rank_r; - l_bwd_res = r_bwd + 1; - r_bwd_res = r_bwd + 1 + rank_r - rank_l; // TODO: check maybe +1/-1 missing - - assert(r_fwd_res + 1 >= l_fwd_res && r_bwd_res + 1 - l_bwd_res == r_fwd_res + 1 - l_fwd_res); - return r_fwd_res >= l_fwd_res; // r_fwd_res + 1 - l_fwd_res + l_fwd = _l_fwd; + r_fwd = _r_fwd; + l_bwd = _l_bwd; + r_bwd = _r_bwd; + return true; } + return false; } public: @@ -358,24 +357,20 @@ class bi_fm_index_iterator assert(index != nullptr); + size_type new_parent_lb = fwd_lb, new_parent_rb = fwd_rb; + sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel - size_type _fwd_lb, _fwd_rb, _rev_lb, _rev_rb; while (c < index->fwd_fm.m_index.sigma && - !bidirectional_search(index->fwd_fm.m_index, fwd_lb, fwd_rb, rev_lb, rev_rb, - index->fwd_fm.m_index.comp2char[c], _fwd_lb, _fwd_rb, _rev_lb, _rev_rb)) + !bidirectional_search(index->fwd_fm.m_index, index->fwd_fm.m_index.comp2char[c], + fwd_lb, fwd_rb, rev_lb, rev_rb)) { ++c; } if (c != index->fwd_fm.m_index.sigma) { - parent_lb = fwd_lb; - parent_rb = fwd_rb; - - fwd_lb = _fwd_lb; - fwd_rb = _fwd_rb; - rev_lb = _rev_lb; - rev_rb = _rev_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; _last_char = c; ++depth; @@ -410,24 +405,20 @@ class bi_fm_index_iterator assert(index != nullptr); + size_type new_parent_lb = rev_lb, new_parent_rb = rev_rb; + sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel - size_type _fwd_lb, _fwd_rb, _rev_lb, _rev_rb; while (c < index->rev_fm.m_index.sigma && - !bidirectional_search(index->rev_fm.m_index, rev_lb, rev_rb, fwd_lb, fwd_rb, - index->rev_fm.m_index.comp2char[c], _rev_lb, _rev_rb, _fwd_lb, _fwd_rb)) + !bidirectional_search(index->rev_fm.m_index, index->rev_fm.m_index.comp2char[c], + rev_lb, rev_rb, fwd_lb, fwd_rb)) { ++c; } if (c != index->rev_fm.m_index.sigma) { - parent_lb = rev_lb; - parent_rb = rev_rb; - - fwd_lb = _fwd_lb; - fwd_rb = _fwd_rb; - rev_lb = _rev_lb; - rev_rb = _rev_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; _last_char = c; ++depth; @@ -463,20 +454,13 @@ class bi_fm_index_iterator assert(index != nullptr); - size_type _fwd_lb, _fwd_rb, _rev_lb, _rev_rb; + size_type new_parent_lb = fwd_lb, new_parent_rb = fwd_rb; auto c_char = to_rank(c) + 1; - - if (bidirectional_search(index->fwd_fm.m_index, fwd_lb, fwd_rb, rev_lb, rev_rb, - c_char, _fwd_lb, _fwd_rb, _rev_lb, _rev_rb)) + if (bidirectional_search(index->fwd_fm.m_index, c_char, fwd_lb, fwd_rb, rev_lb, rev_rb)) { - parent_lb = fwd_lb; - parent_rb = fwd_rb; - - fwd_lb = _fwd_lb; - fwd_rb = _fwd_rb; - rev_lb = _rev_lb; - rev_rb = _rev_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; _last_char = c_char; ++depth; @@ -512,20 +496,13 @@ class bi_fm_index_iterator assert(index != nullptr); - size_type _fwd_lb, _fwd_rb, _rev_lb, _rev_rb; + size_type new_parent_lb = rev_lb, new_parent_rb = rev_rb; auto c_char = to_rank(c) + 1; - - if (bidirectional_search(index->rev_fm.m_index, rev_lb, rev_rb, fwd_lb, fwd_rb, - c_char, _rev_lb, _rev_rb, _fwd_lb, _fwd_rb)) + if (bidirectional_search(index->rev_fm.m_index, c_char, rev_lb, rev_rb, fwd_lb, fwd_rb)) { - parent_lb = rev_lb; - parent_rb = rev_rb; - - fwd_lb = _fwd_lb; - fwd_rb = _fwd_rb; - rev_lb = _rev_lb; - rev_rb = _rev_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; _last_char = c_char; ++depth; @@ -565,10 +542,10 @@ class bi_fm_index_iterator auto first = seq.begin(); auto last = seq.end(); - assert(index != nullptr && first != last); // range must not be empty! + assert(index != nullptr); size_type _fwd_lb = fwd_lb, _fwd_rb = fwd_rb, _rev_lb = rev_lb, _rev_rb = rev_rb; - size_type _parent_lb, _parent_rb; + size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb; sdsl_char_type c; @@ -576,10 +553,9 @@ class bi_fm_index_iterator { c = to_rank(*it) + 1; - _parent_lb = _fwd_lb; - _parent_rb = _fwd_rb; - if (!bidirectional_search(index->fwd_fm.m_index, _fwd_lb, _fwd_rb, _rev_lb, _rev_rb, - c, _fwd_lb, _fwd_rb, _rev_lb, _rev_rb)) + new_parent_lb = _fwd_lb; + new_parent_rb = _fwd_rb; + if (!bidirectional_search(index->fwd_fm.m_index, c, _fwd_lb, _fwd_rb, _rev_lb, _rev_rb)) return false; } @@ -588,8 +564,9 @@ class bi_fm_index_iterator rev_lb = _rev_lb; rev_rb = _rev_rb; - parent_lb = _parent_lb; - parent_rb = _parent_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; + _last_char = c; depth += last - first; @@ -627,23 +604,22 @@ class bi_fm_index_iterator auto first = rev_seq.begin(); auto last = rev_seq.end(); - assert(index != nullptr && first != last); // range must not be empty! + assert(index != nullptr); - size_type _fwd_lb = fwd_lb, _fwd_rb = fwd_rb, _rev_lb = rev_lb, _rev_rb = rev_rb; - size_type _parent_lb, _parent_rb; + size_type _fwd_lb = fwd_lb, _fwd_rb = fwd_rb, + _rev_lb = rev_lb, _rev_rb = rev_rb; + size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb; sdsl_char_type c; - // TODO(h-2, eseiler and cpockrandt): should we iterate from left to right or right to left? for (auto it = first; it != last; ++it) { c = to_rank(*it) + 1; - _parent_lb = _rev_lb; - _parent_rb = _rev_rb; - if (!bidirectional_search(index->rev_fm.m_index, _rev_lb, _rev_rb, _fwd_lb, _fwd_rb, - c, _rev_lb, _rev_rb, _fwd_lb, _fwd_rb)) - return false; + new_parent_lb = _rev_lb; + new_parent_rb = _rev_rb; + if (!bidirectional_search(index->rev_fm.m_index, c, _rev_lb, _rev_rb, _fwd_lb, _fwd_rb)) + return false; } fwd_lb = _fwd_lb; @@ -651,8 +627,8 @@ class bi_fm_index_iterator rev_lb = _rev_lb; rev_rb = _rev_rb; - parent_lb = _parent_lb; - parent_rb = _parent_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; _last_char = c; depth += last - first; @@ -695,22 +671,16 @@ class bi_fm_index_iterator assert(index != nullptr && query_length() > 0); sdsl_char_type c = _last_char + 1; - size_type _fwd_lb, _fwd_rb, _rev_lb, _rev_rb; - while (c < index->fwd_fm.m_index.sigma && !bidirectional_search_cycle(index->fwd_fm.m_index, - parent_lb, parent_rb, rev_lb, rev_rb, index->fwd_fm.m_index.comp2char[c], - _fwd_lb, _fwd_rb, _rev_lb, _rev_rb)) + while (c < index->fwd_fm.m_index.sigma && + !bidirectional_search_cycle(index->fwd_fm.m_index, index->fwd_fm.m_index.comp2char[c], + parent_lb, parent_rb, fwd_lb, fwd_rb, rev_lb, rev_rb)) { ++c; } if (c != index->fwd_fm.m_index.sigma) { - fwd_lb = _fwd_lb; - fwd_rb = _fwd_rb; - rev_lb = _rev_lb; - rev_rb = _rev_rb; - _last_char = c; return true; @@ -754,22 +724,15 @@ class bi_fm_index_iterator assert(index != nullptr && query_length() > 0); sdsl_char_type c = _last_char + 1; - size_type _fwd_lb, _fwd_rb, _rev_lb, _rev_rb; - - while (c < index->rev_fm.m_index.sigma && !bidirectional_search_cycle(index->rev_fm.m_index, - parent_lb, parent_rb, fwd_lb, fwd_rb, index->rev_fm.m_index.comp2char[c], - _rev_lb, _rev_rb, _fwd_lb, _fwd_rb)) + while (c < index->rev_fm.m_index.sigma && + !bidirectional_search_cycle(index->rev_fm.m_index, index->rev_fm.m_index.comp2char[c], + parent_lb, parent_rb, rev_lb, rev_rb, fwd_lb, fwd_rb)) { ++c; } if (c != index->rev_fm.m_index.sigma) { - fwd_lb = _fwd_lb; - fwd_rb = _fwd_rb; - rev_lb = _rev_lb; - rev_rb = _rev_rb; - _last_char = c; return true; diff --git a/include/seqan3/search/fm_index/fm_index_iterator.hpp b/include/seqan3/search/fm_index/fm_index_iterator.hpp index ba6baef1581..a8a5268960a 100644 --- a/include/seqan3/search/fm_index/fm_index_iterator.hpp +++ b/include/seqan3/search/fm_index/fm_index_iterator.hpp @@ -128,26 +128,24 @@ class fm_index_iterator //!\brief Optimized backward search without alphabet mapping template - bool backward_search(csa_t const & csa, size_type const l, size_type const r, sdsl_char_type const c, - size_type & l_res, size_type & r_res) const noexcept + bool backward_search(csa_t const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept { assert(l <= r && r < csa.size()); + size_type _l, _r; if constexpr(std::Same) { size_type const c_begin = csa.C[c]; if (r + 1 - l == csa.size()) // [[unlikely]] { - l_res = c_begin; - r_res = csa.C[c + 1] - 1; + _l = c_begin; + _r = csa.C[c + 1] - 1; } else { - l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] - r_res = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r] + _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] + _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r] } - assert(r_res + 1 - l_res >= 0); - return r_res >= l_res; } else { @@ -158,18 +156,24 @@ class fm_index_iterator size_type const c_begin = csa.C[cc]; if (l == 0 && r + 1 == csa.size()) // [[unlikely]] { - l_res = c_begin; - r_res = csa.C[cc + 1] - 1; + l = c_begin; + r = csa.C[cc + 1] - 1; return true; } else { - l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] - r_res = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r] - assert(r_res + 1 - l_res >= 0); - return r_res >= l_res; + _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1] + _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r] } } + assert(_r + 1 - _l >= 0); + if (_r >= _l) + { + r = _r; + l = _l; + return true; + } + return false; } public: @@ -205,8 +209,7 @@ class fm_index_iterator bool operator==(fm_index_iterator const & rhs) const noexcept { assert(index != nullptr); - assert(node != rhs.node || - (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb))); + assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb))); // position in the implicit suffix tree is defined by the SA interval and depth. // No need to compare parent intervals @@ -256,9 +259,8 @@ class fm_index_iterator assert(index != nullptr); sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel - size_type _lb, _rb; - while (c < index->m_index.sigma && - !backward_search(index->m_index, node.lb, node.rb, index->m_index.comp2char[c], _lb, _rb)) + size_type _lb = node.lb, _rb = node.rb; + while (c < index->m_index.sigma && !backward_search(index->m_index, index->m_index.comp2char[c], _lb, _rb)) { ++c; } @@ -295,11 +297,11 @@ class fm_index_iterator { assert(index != nullptr); - size_type _lb, _rb; + size_type _lb = node.lb, _rb = node.rb; sdsl_char_type c_char = to_rank(c) + 1; - if (backward_search(index->m_index, node.lb, node.rb, c_char, _lb, _rb)) + if (backward_search(index->m_index, c_char, _lb, _rb)) { parent_lb = node.lb; parent_rb = node.rb; @@ -335,10 +337,10 @@ class fm_index_iterator auto first = seq.begin(); auto last = seq.end(); - assert(index != nullptr && first != last); // range must not be empty! + assert(index != nullptr); // range must not be empty! size_type _lb = node.lb, _rb = node.rb; - size_type _parent_lb = node.lb, _parent_rb = node.rb; + size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb; sdsl_char_type c; @@ -346,14 +348,14 @@ class fm_index_iterator { c = to_rank(*it) + 1; - _parent_lb = _lb; - _parent_rb = _rb; - if (!backward_search(index->m_index, _parent_lb, _parent_rb, c, _lb, _rb)) + new_parent_lb = _lb; + new_parent_rb = _rb; + if (!backward_search(index->m_index, c, _lb, _rb)) return false; } - parent_lb = _parent_lb; - parent_rb = _parent_rb; + parent_lb = new_parent_lb; + parent_rb = new_parent_rb; node = {_lb, _rb, last - first + node.depth, c}; return true; } @@ -390,10 +392,9 @@ class fm_index_iterator assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb); sdsl_char_type c = node.last_char + 1; - size_type _lb, _rb; + size_type _lb = parent_lb, _rb = parent_rb; - while (c < index->m_index.sigma && - !backward_search(index->m_index, parent_lb, parent_rb, index->m_index.comp2char[c], _lb, _rb)) + while (c < index->m_index.sigma && !backward_search(index->m_index, index->m_index.comp2char[c], _lb, _rb)) { ++c; } diff --git a/test/unit/index/fm_index_iterator_test.cpp b/test/unit/index/fm_index_iterator_test.cpp index 89f4f539b25..ca82eb02f68 100644 --- a/test/unit/index/fm_index_iterator_test.cpp +++ b/test/unit/index/fm_index_iterator_test.cpp @@ -146,11 +146,9 @@ TYPED_TEST(fm_index_iterator_test, extend_right_range) EXPECT_FALSE(it.extend_right("A"_dna4)); EXPECT_EQ(it, it_cpy); - // extend_right(range) does not take an empty range + // extend_right(empty range) it_cpy = it; -#ifndef NDEBUG - EXPECT_DEATH(it.extend_right(""_dna4), ""); -#endif + EXPECT_TRUE(it.extend_right(""_dna4)); EXPECT_EQ(it, it_cpy); }