From 8b33a212bd7b776f622f5854012c0d72ce84b883 Mon Sep 17 00:00:00 2001 From: cpockrandt Date: Sun, 5 Aug 2018 01:14:48 +0200 Subject: [PATCH] fixed a lot of stuff and tests (reference of index in iterator converted to a ptr, moved depth out of node for history iterator) --- .../seqan3/index/detail/fm_index_iterator.hpp | 3 +- include/seqan3/index/fm_index.hpp | 18 +- include/seqan3/index/fm_index_iterator.hpp | 88 ++++++--- include/seqan3/index/some_stuff.hpp | 15 ++ test/unit/index/fm_index_iterator_test.cpp | 167 +++++++++++++----- test/unit/index/fm_index_test.cpp | 57 +++--- 6 files changed, 227 insertions(+), 121 deletions(-) diff --git a/include/seqan3/index/detail/fm_index_iterator.hpp b/include/seqan3/index/detail/fm_index_iterator.hpp index 34dfa2a4ef7..ee9ece53046 100644 --- a/include/seqan3/index/detail/fm_index_iterator.hpp +++ b/include/seqan3/index/detail/fm_index_iterator.hpp @@ -59,12 +59,11 @@ struct fm_index_iterator_node size_type lb; size_type rb; - size_type depth; comp_char_type last_char; bool operator==(fm_index_iterator_node const & rhs) const { - return lb == rhs.lb && rb == rhs.rb && depth == rhs.depth && last_char == rhs.last_char; + return lb == rhs.lb && rb == rhs.rb && last_char == rhs.last_char; } bool operator!=(fm_index_iterator_node const & rhs) const diff --git a/include/seqan3/index/fm_index.hpp b/include/seqan3/index/fm_index.hpp index 7ad1750b66d..93ed728cd53 100644 --- a/include/seqan3/index/fm_index.hpp +++ b/include/seqan3/index/fm_index.hpp @@ -46,14 +46,6 @@ namespace seqan3 { -// namespace detail { -// template // TODO: fm_index_concept instead of typename -// class fm_index_iterator_node; -// } -// -// template // TODO: fm_index_concept instead of typename -// class fm_index_iterator; - struct fm_index_default_traits { using sdsl_index_type = sdsl::csa_wt< @@ -71,7 +63,9 @@ struct fm_index_default_traits >; }; -// TODO: what are the possible input types of text? only our own alphabet types? i.e. T, T where T could be any input_range? +// TODO(h-2): what is missing? noexcept, etc. + +// TODO(h-2): what are the possible input types of text? only our own alphabet types? i.e. T, T where T could be any input_range? // TODO: check whether input_range_concept is the correct one! depends on open decisions in sdsl (im-construction, writing in-memory data to tmpfs and on the construction algorithms) template @@ -102,13 +96,13 @@ class fm_index fm_index(fm_index &&) = default; fm_index & operator=(fm_index &&) = default; - // requires common_reference_concept, char_t> && detail::dimension_v == dimensions + // TODO(h-2): requires common_reference_concept, char_t> && detail::dimension_v == dimensions fm_index(text_t const & text) { construct(text); } - // can you allow for rvalues and make the index store them? + // TODO(h-2): should we remove this function to avoid passing rvalue references? void construct(text_t const & text) { this->text = &text; @@ -156,7 +150,7 @@ class fm_index }; -// TODO: where to put this code? concept.hpp doesn't seem to work :/ +// TODO(h-2): where to put this code? concept.hpp doesn't seem to work :/ #ifndef NDEBUG // static_assert(fm_index_concept>>); diff --git a/include/seqan3/index/fm_index_iterator.hpp b/include/seqan3/index/fm_index_iterator.hpp index c0c3d988d39..d53e3e6b253 100644 --- a/include/seqan3/index/fm_index_iterator.hpp +++ b/include/seqan3/index/fm_index_iterator.hpp @@ -54,12 +54,14 @@ namespace seqan3 // template // class fm_index_iterator; +// TODO: remove mapping by overwriting backward_search. one only has to deal with incomplete alphabets then (maybe add own alphabet type to sdsl?) + // TODO: to_rank() + 1 consistent with comp_char, mapping and implicit_sentinel? // NOTE: for convenience the fm_index behaves like a suffix tree, not a prefix tree // NOTE: bidirectional_fm_index_concept must fulfill fm_index_concept (subset) such that // a bidirectional index can be plugged into a unidirectional iterator -template // TODO: fm_index_concept instead of typename +template // TODO(h-2): fm_index_concept instead of typename class fm_index_iterator { @@ -71,13 +73,14 @@ class fm_index_iterator protected: using node_type = detail::fm_index_iterator_node; - index_type const & index; + const index_type * index; // TODO(h-2): reference don't work if we wan't an assignment operator. Maybe use weak_ptr? index_type const & index; size_type parent_lb, parent_rb; + size_type _depth; // TODO(h-2): naming? Conflict because of depth() member function node_type node; size_type offset() const { - return index.m_index.size() - depth() - 1; // since the string is reversed during construction + return index->m_index.size() - depth() - 1; // since the string is reversed during construction } public: @@ -88,13 +91,14 @@ class fm_index_iterator fm_index_iterator(fm_index_iterator &&) = default; fm_index_iterator & operator=(fm_index_iterator &&) = default; - fm_index_iterator(index_t const & _index) : index(_index), node({0, _index.m_index.size() - 1, 0, 0}) + fm_index_iterator(index_t const & _index) : index(&_index), _depth(0), node({0, _index.m_index.size() - 1, 0}) {} // TODO: cannot compare indices yet (not supported by sdsl) bool operator==(fm_index_iterator const & rhs) const { - return node == rhs.node && parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb; + // parent_lb/parent_rb might be uninitialized in a root node + return node == rhs.node && depth() == rhs.depth() && ((parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb) || depth() == 0); } bool operator!=(fm_index_iterator const & rhs) const @@ -108,12 +112,13 @@ class fm_index_iterator { typename index_type::comp_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel typename index_type::size_type _lb, _rb; - for (; c < index.m_index.sigma && !sdsl::backward_search(index.m_index, node.lb, node.rb, index.m_index.comp2char[c], _lb, _rb); ++c) {} - if (c != index.m_index.sigma) + for (; c < index->m_index.sigma && !sdsl::backward_search(index->m_index, node.lb, node.rb, index->m_index.comp2char[c], _lb, _rb); ++c) {} + if (c != index->m_index.sigma) { + ++_depth; parent_lb = node.lb; parent_rb = node.rb; - node = {_lb, _rb, node.depth + 1, c}; + node = {_lb, _rb, c}; return true; } return false; @@ -122,11 +127,20 @@ class fm_index_iterator bool down(typename index_type::char_type const & c) { typename index_type::size_type _lb, _rb; - if (sdsl::backward_search(index.m_index, node.lb, node.lb, index.m_index.comp2char[c.to_rank() + 1], _lb, _rb)) + + auto c_char = c.to_rank() + 1; + auto c_comp = index->m_index.char2comp[c_char]; + + // character does not occur in text / index + if (!c_comp) // TODO: [[unlikely]] + return false; + + if (sdsl::backward_search(index->m_index, node.lb, node.rb, c_char, _lb, _rb)) { + ++_depth; parent_lb = node.lb; parent_rb = node.rb; - node = {_lb, _rb, node.depth + 1, c.to_rank() + 1}; + node = {_lb, _rb, c_comp}; return true; } return false; @@ -139,40 +153,55 @@ class fm_index_iterator // requires (innermost_value_type_t == typename index_t::char_type) bool down(pattern_t && pattern) { + // TODO(h-2): empty patterns will lead a segmentation fault. + // checking for this would lead to another branching (otherwise c_comp would be unitialized and overwrite it). + assert(pattern.size() > 0); + typename index_type::size_type _lb = node.lb, _rb = node.rb; typename index_type::size_type _parent_lb = node.lb, _parent_rb = node.rb; auto first = pattern.cbegin(); auto last = pattern.cend(); + typename index_type::comp_char_type c_char; + typename index_type::comp_char_type c_comp; + for (auto it = first; it != last; ++it) { + c_char = (*it).to_rank() + 1; + c_comp = index->m_index.char2comp[c_char]; + + // character does not occur in text / index + if (!c_comp) // TODO: [[unlikely]] + return false; + _parent_lb = _lb; _parent_rb = _rb; - if (!sdsl::backward_search(index.m_index, _parent_lb, _parent_rb, (*it).to_rank() + 1, _lb, _rb)) + if (!sdsl::backward_search(index->m_index, _parent_lb, _parent_rb, c_char, _lb, _rb)) return false; } - node = {_lb, _rb, node.depth + (last - first), (*(last - 1)).to_rank() + 1}; + _depth += last - first; parent_lb = _parent_lb; parent_rb = _parent_rb; + node = {_lb, _rb, c_comp}; return true; } bool right() { - assert(node.depth > 0); + assert(depth() > 0); typename index_type::comp_char_type c = node.last_char + 1; typename index_type::size_type _lb, _rb; - while (c < index.m_index.sigma && !sdsl::backward_search(index.m_index, parent_lb, parent_rb, index.m_index.comp2char[c], _lb, _rb)) - { + + while (c < index->m_index.sigma && !sdsl::backward_search(index->m_index, parent_lb, parent_rb, index->m_index.comp2char[c], _lb, _rb)) ++c; - } - if (c != index.m_index.sigma) + + if (c != index->m_index.sigma) { - parent_lb = node.lb; - parent_rb = node.rb; - node = {_lb, _rb, node.depth, c}; + // parent_lb = node.lb; + // parent_rb = node.rb; + node = {_lb, _rb, c}; return true; } return false; @@ -180,23 +209,24 @@ class fm_index_iterator size_type depth() const { - return node.depth; + assert(_depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); + return _depth; } // TODO: what is the most suitable return type? outermost container of text_type with char_type in it? auto path_label() const { - assert(index.text != nullptr); + assert(index->text != nullptr); using char_type = typename index_type::char_type; - if (node.depth == 0) // TODO: [[unlikely]] + if (depth() == 0) // TODO: [[unlikely]] return std::vector{}; - const typename index_t::size_type pattern_begin = offset() - index.m_index[node.lb]; - return std::vector(index.text->cbegin() + pattern_begin, - index.text->cbegin() + pattern_begin + node.depth); - // return sdsl::extract(index.m_index, pattern_begin, pattern_begin + node.depth - 1); + const typename index_t::size_type pattern_begin = offset() - index->m_index[node.lb]; + return std::vector(index->text->cbegin() + pattern_begin, + index->text->cbegin() + pattern_begin + depth()); + // return sdsl::extract(index.m_index, pattern_begin, pattern_begin + depth - 1); } size_type count() const @@ -209,7 +239,7 @@ class fm_index_iterator { std::vector occ(count()); for (typename index_t::size_type i = 0; i < occ.size(); ++i) { - occ[i] = offset() - index.m_index[node.lb + i]; + occ[i] = offset() - index->m_index[node.lb + i]; } return occ; } @@ -218,7 +248,7 @@ class fm_index_iterator { const size_type _offset = offset(); return ranges::view::iota(node.lb, node.lb + count()) - | ranges::view::transform([*this, _offset] (auto sa_pos) { return _offset - index.m_index[sa_pos]; }); + | ranges::view::transform([*this, _offset] (auto sa_pos) { return _offset - index->m_index[sa_pos]; }); } }; diff --git a/include/seqan3/index/some_stuff.hpp b/include/seqan3/index/some_stuff.hpp index 4787b1fa188..72c5bf59cf7 100644 --- a/include/seqan3/index/some_stuff.hpp +++ b/include/seqan3/index/some_stuff.hpp @@ -6,3 +6,18 @@ // template // class bidirectional_suffix_array_history_iterator; + + + + + +// std::cout << "rank: " << (unsigned)c.to_rank() << '\n'; +// std::cout << "sigma: " << index->m_index.sigma << '\n'; +// std::cout << "comp2char: "; +// for (unsigned i = 0; i < index->m_index.sigma; ++i) +// std::cout << (unsigned)index->m_index.comp2char[i] << ' '; +// std::cout << '\n'; +// std::cout << "char2comp: "; +// for (unsigned i = 0; i < 255; ++i) +// std::cout << (unsigned)index->m_index.char2comp[i] << ' '; +// std::cout << '\n'; diff --git a/test/unit/index/fm_index_iterator_test.cpp b/test/unit/index/fm_index_iterator_test.cpp index d689fc01e4a..d63bd526d05 100644 --- a/test/unit/index/fm_index_iterator_test.cpp +++ b/test/unit/index/fm_index_iterator_test.cpp @@ -45,17 +45,45 @@ using namespace seqan3::literal; TEST(fm_index_iterator, ctr) { - // TODO + using index_type = fm_index>; + using iterator_type = index_type::iterator_type; + + std::vector text {"ACGACG"_dna4}; + fm_index> sa{text}; + + // custom constructor + iterator_type it0{sa}; + + // copy construction + iterator_type it1{it0}; + EXPECT_EQ(it0, it1); + + // copy assignment + iterator_type it2 = it0; + EXPECT_EQ(it0, it2); + + // move construction + iterator_type it3{std::move(it0)}; + EXPECT_EQ(it0, it3); + + // move assigment + iterator_type it4 = std::move(it0); + EXPECT_EQ(it0, it4); } TEST(fm_index_iterator, basic_search) { - using text_type = std::vector; - text_type text {"ACGACG"_dna4}; - fm_index sa{text}; + std::vector text {"ACGACG"_dna4}; + fm_index> sa{text}; - // successful down(range) + // root auto it = sa.root(); + EXPECT_EQ(it.locate(), (std::vector{0, 1, 4, 2, 5, 3, 6})); // sentinel position included + EXPECT_EQ(it.depth(), 0); + EXPECT_EQ(it.count(), 7); + + // successful down(range) + it = sa.root(); EXPECT_TRUE(it.down("CG"_dna4)); EXPECT_EQ(it.locate(), (std::vector{1, 4})); EXPECT_EQ(it.depth(), 2); @@ -66,51 +94,101 @@ TEST(fm_index_iterator, basic_search) EXPECT_EQ(it.depth(), 3); EXPECT_EQ(it.count(), 1); - // unsuccessful down(range), it remains untouched auto it_cpy = it; - EXPECT_FALSE(it.down("T"_dna4)); + // EXPECT_TRUE(it.down(""_dna4)); // TODO(h-2): should this be allowed? + // EXPECT_EQ(it, it_cpy); + + // unsuccessful down(range), it remains untouched + it_cpy = it; + EXPECT_FALSE(it.down("A"_dna4)); EXPECT_EQ(it, it_cpy); // successful down(char) - auto it2 = sa.root(); // TODO: copy assignment is removed because it would be ill-formed ??? - EXPECT_TRUE(it2.down(dna4::A)); - EXPECT_EQ(it2.locate(), (std::vector{0, 3})); // TODO: returns false??? - EXPECT_EQ(it2.depth(), 1); + it = sa.root(); + EXPECT_TRUE(it.down(dna4::A)); + EXPECT_EQ(it.locate(), (std::vector{0, 3})); + EXPECT_EQ(it.depth(), 1); - EXPECT_TRUE(it2.down(dna4::C)); - EXPECT_EQ(it2.locate(), (std::vector{0, 3})); // TODO: returns false??? - EXPECT_EQ(it2.depth(), 2); + EXPECT_TRUE(it.down(dna4::C)); + EXPECT_EQ(it.locate(), (std::vector{0, 3})); + EXPECT_EQ(it.depth(), 2); // unsuccessful down(char), it remains untouched - auto it_cpy2 = it2; // TODO: same as above - EXPECT_FALSE(it2.down(dna4::C)); - EXPECT_EQ(it2, it_cpy2); - EXPECT_FALSE(it2.down(dna4::T)); - EXPECT_EQ(it2, it_cpy2); - - // TODO: down() - auto it3 = sa.root(); // TODO: copy assignment is removed because it would be ill-formed ??? - EXPECT_TRUE(it3.down()); - EXPECT_EQ(it3.locate(), (std::vector{0, 3})); // TODO: returns false??? - EXPECT_EQ(it3.depth(), 1); - - EXPECT_TRUE(it3.right()); - EXPECT_EQ(it3.locate(), (std::vector{1, 4})); // TODO: returns false??? - EXPECT_EQ(it3.depth(), 1); - - EXPECT_TRUE(it3.down()); - EXPECT_EQ(it3.locate(), (std::vector{0, 3})); // TODO: returns false??? - EXPECT_EQ(it3.depth(), 2); - - auto it_cpy3 = it3; // TODO: same as above - EXPECT_FALSE(it3.right()); - EXPECT_EQ(it3, it_cpy3); - - // // TODO: right() with down(), down(c), down(pattern) before - // auto it3 = sa.root(); // TODO: same as above - // ASSERT_DEATH(it3.right(), ""); - - // TODO: path_label() + it_cpy = it; + EXPECT_FALSE(it.down(dna4::C)); + EXPECT_EQ(it, it_cpy); + + // successful down() and right() + it = sa.root(); + EXPECT_TRUE(it.down()); + EXPECT_EQ(it.locate(), (std::vector{0, 3})); + EXPECT_EQ(it.depth(), 1); + + EXPECT_TRUE(it.right()); + EXPECT_EQ(it.locate(), (std::vector{1, 4})); + EXPECT_EQ(it.depth(), 1); + + EXPECT_TRUE(it.down()); + EXPECT_EQ(it.locate(), (std::vector{1, 4})); + EXPECT_EQ(it.depth(), 2); + + // unsuccessful right(), it remains untouched + it_cpy = it; + EXPECT_FALSE(it.right()); + EXPECT_EQ(it, it_cpy); + + // unsuccessful down(), it remains untouched + it = sa.root(); + EXPECT_TRUE(it.down("GACG"_dna4)); + it_cpy = it; + EXPECT_FALSE(it.down()); + EXPECT_EQ(it, it_cpy); + + // right() cannot be called on the root node + it = sa.root(); + ASSERT_DEATH(it.right(), ""); + + // path_label() + it = sa.root(); + EXPECT_TRUE(it.down("ACG"_dna4)); + EXPECT_EQ(it.path_label(), "ACG"_dna4); +} + +TEST(fm_index_iterator, incomplete_alphabet) +{ + // search a char that does not occur in the text (higher rank than largest char occurring in text) + { + std::vector text {"ACGACG"_dna4}; + fm_index> sa{text}; + auto it = sa.root(); + EXPECT_FALSE(it.down(dna4::T)); + EXPECT_EQ(it, sa.root()); + } + + // search a char that does not occur in the text (smaller rank than smallest char occurring in text) + { + std::vector text {"CGTCGT"_dna4}; + fm_index> sa{text}; + auto it = sa.root(); + EXPECT_FALSE(it.down(dna4::A)); + EXPECT_EQ(it, sa.root()); + } + + // search a char that does not occur in the text (some rank that is neither the smallest nor the highest smallest occurring in text) + { + std::vector text {"ATATAT"_dna4}; + fm_index> sa{text}; + auto it = sa.root(); + EXPECT_FALSE(it.down(dna4::C)); + EXPECT_FALSE(it.down(dna4::G)); + EXPECT_FALSE(it.down("ACGT"_dna4)); + EXPECT_FALSE(it.down("G"_dna4)); + EXPECT_EQ(it, sa.root()); + + EXPECT_TRUE(it.down(dna4::A)); + EXPECT_TRUE(it.right()); + EXPECT_EQ(it.path_label(), "T"_dna4); + } } TEST(fm_index_iterator, lazy_locate) @@ -129,6 +207,7 @@ TEST(fm_index_iterator, lazy_locate) // std::cout << occ << ' '; // std::cout << '\n'; - // TODO: doesn't work! + // TODO(h-2): doesn't work! + // EXPECT_EQ(it.locate(), it.lazy_locate()); // EXPECT_TRUE(it.locate() == it.lazy_locate()); } diff --git a/test/unit/index/fm_index_test.cpp b/test/unit/index/fm_index_test.cpp index 192da616fb6..d25b9f8f715 100644 --- a/test/unit/index/fm_index_test.cpp +++ b/test/unit/index/fm_index_test.cpp @@ -59,38 +59,49 @@ TEST(fm_index, ctr) fm_index> dna_sa0; dna_sa0.construct("ACGT"_dna4); - // TODO: copy construction and assignment are not working // copy construction - // [[maybe_unused]] fm_index dna_sa1{dna_sa0}; + fm_index> dna_sa1{dna_sa0}; + EXPECT_EQ(dna_sa0.size(), dna_sa1.size()); // EXPECT_EQ(dna_sa0, dna_sa1); // copy assignment - // [[maybe_unused]] fm_index dna_sa2 = dna_sa0; + fm_index> dna_sa2 = dna_sa0; + EXPECT_EQ(dna_sa0.size(), dna_sa2.size()); // EXPECT_EQ(dna_sa0, dna_sa2); // move construction - [[maybe_unused]] fm_index> dna_sa3{std::move(dna_sa0)}; + fm_index> dna_sa3{std::move(dna_sa0)}; + EXPECT_EQ(dna_sa0.size(), dna_sa3.size()); // EXPECT_EQ(dna_sa0, dna_sa3); // move assigment - [[maybe_unused]] fm_index> dna_sa4 = std::move(dna_sa0); + fm_index> dna_sa4 = std::move(dna_sa0); + EXPECT_EQ(dna_sa0.size(), dna_sa4.size()); // EXPECT_EQ(dna_sa0, dna_sa4); // container contructor - [[maybe_unused]] fm_index> dna_sa5{"ACGT"_dna4}; + fm_index> dna_sa5{"ACGT"_dna4}; + EXPECT_EQ(dna_sa0.size(), dna_sa5.size()); // EXPECT_EQ(dna_sa0, dna_sa5); } TEST(fm_index, swap) { - fm_index> sa0{"ACGT"_dna4}; + fm_index> sa0{"ACGTACGT"_dna4}; + fm_index> sa2{sa0}; fm_index> sa1{"ACGT"_dna4}; - fm_index> sa2{}; - fm_index> sa3{}; + fm_index> sa3{sa1}; - std::swap(sa1, sa2); + EXPECT_EQ(sa0.size(), sa2.size()); + EXPECT_EQ(sa1.size(), sa3.size()); + EXPECT_NE(sa0.size(), sa1.size()); // EXPECT_EQ(sa0, sa2); - // EXPECT_EQ(sa1, sa3); + + std::swap(sa1, sa2); + + EXPECT_EQ(sa0.size(), sa1.size()); + EXPECT_EQ(sa2.size(), sa3.size()); + EXPECT_NE(sa0.size(), sa2.size()); } TEST(fm_index, size) @@ -99,32 +110,10 @@ TEST(fm_index, size) EXPECT_TRUE(sa.empty()); std::vector test {"ACGTACGT"_dna4}; - sa.construct(test/*"ACGT"_dna4*/); + sa.construct(test); EXPECT_EQ(sa.size(), 9); // including a sentinel character } -// TEST(fm_index, constructing) -// { -// fm_index> sa0, sa1, sa2, sa3; -// -// // TODO: test with 0s in text (currently not supported since 0 is reserved for sentinel) -// sdsl::int_vector<8> text0 {1, 2, 3, 1}; -// sdsl::int_vector<3> text1 {1, 2, 3, 1}; -// std::vector text2 {1, 2, 3, 1}; -// std::vector text3 {1, 2, 3, 1}; -// -// sa0.construct(text0); -// sa1.construct(text1); -// sa2.construct(text2); -// sa3.construct(text3); -// -// // TODO: replace by EXPECT_EQ -// EXPECT_EQ(sa0.size(), 5); -// EXPECT_EQ(sa1.size(), 5); -// EXPECT_EQ(sa2.size(), 5); -// EXPECT_EQ(sa3.size(), 5); -// } - TEST(fm_index, serialization) { std::vector text {"ACGTACGT"_dna4};