From 01f5ec4b360f07436443deac097691940c67c7da Mon Sep 17 00:00:00 2001 From: cpockrandt Date: Sun, 5 Aug 2018 05:06:53 +0200 Subject: [PATCH] started with history_iterator --- include/seqan3/index/concept.hpp | 20 +- include/seqan3/index/fm_index.hpp | 9 + .../index/fm_index_history_iterator.hpp | 277 ++++++++++++++++++ include/seqan3/index/fm_index_iterator.hpp | 3 +- test/unit/index/fm_index_iterator_test.cpp | 140 ++++++--- 5 files changed, 410 insertions(+), 39 deletions(-) create mode 100644 include/seqan3/index/fm_index_history_iterator.hpp diff --git a/include/seqan3/index/concept.hpp b/include/seqan3/index/concept.hpp index 5270d86edcf..c70d35f7ba7 100644 --- a/include/seqan3/index/concept.hpp +++ b/include/seqan3/index/concept.hpp @@ -50,6 +50,8 @@ namespace seqan3 // fm_index_traits // ================================================================== +// TODO: add history iterator concept, extend fm_index concept + template concept bool fm_index_traits_concept = requires (t v, typename t::sdsl_index_type::size_type lb, @@ -64,7 +66,7 @@ concept bool fm_index_traits_concept = requires (t v, (typename t::sdsl_index_type{})[0]; // suffix array access (typename t::sdsl_index_type{}).comp2char[0]; (typename t::sdsl_index_type{}).sigma; - { sdsl::construct_im(m_index, sdsl::int_vector<8> {}, 0) } -> void; // TODO! should allow any input_range + { sdsl::construct_im(m_index, sdsl::int_vector<8> {}, 0) } -> void; // TODO should allow any input_range { sdsl::backward_search(typename t::sdsl_index_type{}, lb, rb, typename t::sdsl_index_type::char_type{}, lb, rb) } -> typename t::sdsl_index_type::size_type; }; @@ -76,18 +78,22 @@ concept bool fm_index_concept = requires (t v) typename t::char_type; typename t::size_type; typename t::iterator_type; // TODO: requires iterator_concept? + typename t::history_iterator_type; // TODO: requires history_iterator_type? // (uint8_t)t::dimensions; // number of nested containers // (bool)t::is_bidirectional; // TODO: constructors - // TODO: construction_policy requires requires (t index, std::vector text) { { t(text) } }; requires requires (t index, std::vector text) { { index.construct(text) } -> void; }; + { v.root() } -> typename t::iterator_type; + { v.root_history() } -> typename t::history_iterator_type; + { v.size() } -> typename t::size_type; { v.empty() } -> bool; + { v.load(std::string{}) } -> bool; { v.store(std::string{}) } -> bool; }; @@ -98,8 +104,6 @@ concept bool fm_index_iterator_concept = requires (t it) typename t::index_type; typename t::size_type; - //requires index_tree_concept; - // TODO: constructors // requires requires (t it, typename t::index_type const & index) { it(index) }; @@ -115,4 +119,12 @@ concept bool fm_index_iterator_concept = requires (t it) // { it.lazy_locate() } -> TODO; }; +template +concept bool fm_index_history_iterator_concept = requires (t it) +{ + requires fm_index_iterator_concept; + + { it.up() } -> bool; +}; + } // namespace seqan3 diff --git a/include/seqan3/index/fm_index.hpp b/include/seqan3/index/fm_index.hpp index 93ed728cd53..7069007418d 100644 --- a/include/seqan3/index/fm_index.hpp +++ b/include/seqan3/index/fm_index.hpp @@ -36,6 +36,7 @@ // #include #include +#include #include #include @@ -84,7 +85,9 @@ class fm_index using size_type = typename sdsl_index_type::size_type; using iterator_type = fm_index_iterator>; + using history_iterator_type = fm_index_history_iterator>; friend class fm_index_iterator>; + friend class fm_index_history_iterator>; friend class detail::fm_index_iterator_node>; // static const bool is_bidirectional = false; @@ -137,6 +140,12 @@ class fm_index return iterator_type(*this); } + // TODO(h-2): naming? + history_iterator_type root_history() const + { + return history_iterator_type(*this); + } + // TODO: replace with cereal once sdsl supports it bool load(std::string const & path) { diff --git a/include/seqan3/index/fm_index_history_iterator.hpp b/include/seqan3/index/fm_index_history_iterator.hpp new file mode 100644 index 00000000000..e3d042716ac --- /dev/null +++ b/include/seqan3/index/fm_index_history_iterator.hpp @@ -0,0 +1,277 @@ +// ============================================================================ +// SeqAn - The Library for Sequence Analysis +// ============================================================================ +// +// Copyright (c) 2006-2017, Knut Reinert & Freie Universitaet Berlin +// Copyright (c) 2016-2017, Knut Reinert & MPI Molekulare Genetik +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of Knut Reinert or the FU Berlin nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +// DAMAGE. +// +// ============================================================================ +// Author: Christopher Pockrandt +// ============================================================================ + +#pragma once + +#include +#include +// #include +#include +#include +#include + +#include + +#include +#include + +namespace seqan3 +{ + +// TODO: remove mapping by overwriting backward_search. one only has to deal with incomplete alphabets then (maybe add own alphabet type to sdsl?) + +// TODO: to_rank() + 1 consistent with comp_char, mapping and implicit_sentinel? + +// NOTE: for convenience the fm_index behaves like a suffix tree, not a prefix tree +// NOTE: bidirectional_fm_index_concept must fulfill fm_index_concept (subset) such that +// a bidirectional index can be plugged into a unidirectional iterator +template // TODO(h-2): fm_index_concept instead of typename +class fm_index_history_iterator +{ + +public: + using index_type = index_t; + using size_type = typename index_type::size_type; + // using text_pos_t = typename index_t::size_type; // TODO: depends on dimensions of the text + +protected: + using node_type = detail::fm_index_iterator_node; + + const index_type * index; // TODO(h-2): reference don't work if we wan't an assignment operator. Maybe use weak_ptr? index_type const & index; + size_type _depth; // TODO(h-2): naming? Conflict because of depth() member function + std::forward_list node_stack; // TODO(h-2): better a forward_list instead of vector? but probably not cache local? + + size_type offset() const + { + return index->m_index.size() - depth() - 1; // since the string is reversed during construction + } + +public: + + fm_index_history_iterator() = delete; + fm_index_history_iterator(fm_index_history_iterator const &) = default; + fm_index_history_iterator & operator=(fm_index_history_iterator const &) = default; + fm_index_history_iterator(fm_index_history_iterator &&) = default; + fm_index_history_iterator & operator=(fm_index_history_iterator &&) = default; + + fm_index_history_iterator(index_t const & _index) : index(&_index), _depth(0), node_stack({{0, _index.m_index.size() - 1, 0}}) + {} + + // TODO: cannot compare indices yet (not supported by sdsl) + bool operator==(fm_index_history_iterator const & rhs) const + { + return node_stack == rhs.node_stack && depth() == rhs.depth(); + } + + bool operator!=(fm_index_history_iterator const & rhs) const + { + return !(*this == rhs); + } + + // TODO: implement first backward search more efficiently if iterator is at root node! + // NOTE: whenever false is returned, it is guaranteed that the iterator remained untouched + bool down() + { + auto const & node = node_stack.front(); + + typename index_type::comp_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel + typename index_type::size_type _lb, _rb; + while (c < index->m_index.sigma && + !sdsl::backward_search(index->m_index, node.lb, node.rb, index->m_index.comp2char[c], _lb, _rb)) + { + ++c; + } + if (c != index->m_index.sigma) + { + ++_depth; + node_stack.push_front({_lb, _rb, c}); + return true; + } + return false; + } + + bool down(typename index_type::char_type const & c) + { + typename index_type::size_type _lb, _rb; + + auto c_char = c.to_rank() + 1; + auto c_comp = index->m_index.char2comp[c_char]; + + // character does not occur in text / index + if (!c_comp) // TODO: [[unlikely]] + return false; + + auto const & node = node_stack.front(); + + if (sdsl::backward_search(index->m_index, node.lb, node.rb, c_char, _lb, _rb)) + { + ++_depth; + node_stack.push_front({_lb, _rb, c_comp}); + return true; + } + return false; + } + + // TODO: requires clause does not work with == ... syntax error? + // TODO: enforce the innermost type of char_type to be convertable to char_type + template + requires std::is_same_v, typename index_t::char_type> + // requires (innermost_value_type_t == typename index_t::char_type) + bool down(pattern_t && pattern) + { + // TODO(h-2): empty patterns will lead a segmentation fault. + // checking for this would lead to another branching (otherwise c_comp would be unitialized and overwrite it). + assert(pattern.size() > 0); + + auto const & node = node_stack.front(); + typename index_type::size_type _lb = node.lb, _rb = node.rb; + + auto first = pattern.cbegin(); + auto last = pattern.cend(); + + typename index_type::comp_char_type c_char; + typename index_type::comp_char_type c_comp; + + for (auto it = first; it != last; ++it) + { + c_char = (*it).to_rank() + 1; + c_comp = index->m_index.char2comp[c_char]; + + // character does not occur in text / index + if (!c_comp) // TODO: [[unlikely]] + return false; + + if (!sdsl::backward_search(index->m_index, _lb, _rb, c_char, _lb, _rb)) + return false; + } + + _depth += last - first; + node_stack.push_front({_lb, _rb, c_comp}); + return true; + } + + bool right() + { + assert(depth() > 0/* && node_stack.size() > 1*/); + + auto const & node = node_stack.front(); + + typename index_type::comp_char_type c = node.last_char + 1; + typename index_type::size_type _lb, _rb; + + auto const parent_lb = (*(++node_stack.cbegin())).lb; + auto const parent_rb = (*(++node_stack.cbegin())).rb; + while (c < index->m_index.sigma && !sdsl::backward_search(index->m_index, parent_lb, parent_rb, index->m_index.comp2char[c], _lb, _rb)) + ++c; + + if (c != index->m_index.sigma) + { + // parent_lb = node.lb; + // parent_rb = node.rb; + node_stack.front() = {_lb, _rb, c}; + return true; + } + return false; + } + + // TODO(h-2): what is the expected behavior after down(range)? we might have to include "depth" into "node" again + bool up() + { + // TODO(h-2): are there class variants to avoid redundant assertions? + assert(_depth != 0 || (node_stack.front().lb == 0 && node_stack.front().rb == index->size() - 1)); + // assert(node_stack.size() - 1 <= _depth); + + if (depth() == 0) // TODO: [[unklikely]] + return false; + --_depth; + node_stack.pop_front(); + return true; + // TODO(h-2): same assertion here again + } + + size_type depth() const + { + assert(_depth != 0 || (node_stack.front().lb == 0 && node_stack.front().rb == index->size() - 1)); + // assert(node_stack.size() - 1 <= _depth); + + return _depth; + } + + // TODO: what is the most suitable return type? outermost container of text_type with char_type in it? + auto path_label() const + { + assert(index->text != nullptr); + + using char_type = typename index_type::char_type; + + if (depth() == 0) // TODO: [[unlikely]] + return std::vector{}; + + auto const & node = node_stack.front(); + const typename index_t::size_type pattern_begin = offset() - index->m_index[node.lb]; + return std::vector(index->text->cbegin() + pattern_begin, + index->text->cbegin() + pattern_begin + depth()); + // return sdsl::extract(index.m_index, pattern_begin, pattern_begin + depth - 1); + } + + size_type count() const + { + auto const & node = node_stack.front(); + return 1 + node.rb - node.lb; + } + + // TODO: best return type? std::vector? + auto locate() const + { + std::vector occ(count()); + auto const & node = node_stack.front(); + for (typename index_t::size_type i = 0; i < occ.size(); ++i) { + occ[i] = offset() - index->m_index[node.lb + i]; + } + return occ; + } + + auto lazy_locate() const + { + const size_type _offset = offset(); + auto const & node = node_stack.front(); + return ranges::view::iota(node.lb, node.lb + count()) + | ranges::view::transform([*this, _offset] (auto sa_pos) { return _offset - index->m_index[sa_pos]; }); + } + +}; + +} // namespace seqan3 diff --git a/include/seqan3/index/fm_index_iterator.hpp b/include/seqan3/index/fm_index_iterator.hpp index d53e3e6b253..19635c52e50 100644 --- a/include/seqan3/index/fm_index_iterator.hpp +++ b/include/seqan3/index/fm_index_iterator.hpp @@ -51,8 +51,7 @@ namespace seqan3 { -// template -// class fm_index_iterator; +// TODO(h-2): char alphabet. to_rank()? // TODO: remove mapping by overwriting backward_search. one only has to deal with incomplete alphabets then (maybe add own alphabet type to sdsl?) diff --git a/test/unit/index/fm_index_iterator_test.cpp b/test/unit/index/fm_index_iterator_test.cpp index d63bd526d05..606b895ecff 100644 --- a/test/unit/index/fm_index_iterator_test.cpp +++ b/test/unit/index/fm_index_iterator_test.cpp @@ -43,16 +43,29 @@ using namespace seqan3; using namespace seqan3::literal; -TEST(fm_index_iterator, ctr) +template +class fm_index_iterator_test : public ::testing::Test +{}; + +// TODO(h-2): remove inner type of iterators? +using fm_index_iterator_types = ::testing::Types>>, + fm_index_history_iterator>>>; + +TYPED_TEST_CASE(fm_index_iterator_test, fm_index_iterator_types); + +TYPED_TEST(fm_index_iterator_test, ctr) { + using text_type = std::vector; using index_type = fm_index>; - using iterator_type = index_type::iterator_type; + using iterator_type = TypeParam; - std::vector text {"ACGACG"_dna4}; - fm_index> sa{text}; + text_type text {"ACGACG"_dna4}; + index_type sa{text}; // custom constructor iterator_type it0{sa}; + EXPECT_EQ(it0.depth(), 0); + EXPECT_EQ(it0.locate().size(), sa.size()); // copy construction iterator_type it1{it0}; @@ -62,28 +75,33 @@ TEST(fm_index_iterator, ctr) iterator_type it2 = it0; EXPECT_EQ(it0, it2); + // TODO(h-2): move construction does not work for history iterator // move construction - iterator_type it3{std::move(it0)}; - EXPECT_EQ(it0, it3); + // iterator_type it3{std::move(it0)}; + // EXPECT_EQ(it0, it3); // move assigment - iterator_type it4 = std::move(it0); - EXPECT_EQ(it0, it4); + // iterator_type it4 = std::move(it0); + // EXPECT_EQ(it0, it4); } -TEST(fm_index_iterator, basic_search) +TYPED_TEST(fm_index_iterator_test, basic_search) { - std::vector text {"ACGACG"_dna4}; - fm_index> sa{text}; + using text_type = std::vector; + using index_type = fm_index>; + using iterator_type = TypeParam; + + text_type text {"ACGACG"_dna4}; + index_type sa{text}; // root - auto it = sa.root(); + iterator_type it(sa); EXPECT_EQ(it.locate(), (std::vector{0, 1, 4, 2, 5, 3, 6})); // sentinel position included EXPECT_EQ(it.depth(), 0); EXPECT_EQ(it.count(), 7); // successful down(range) - it = sa.root(); + it = iterator_type(sa); EXPECT_TRUE(it.down("CG"_dna4)); EXPECT_EQ(it.locate(), (std::vector{1, 4})); EXPECT_EQ(it.depth(), 2); @@ -94,9 +112,9 @@ TEST(fm_index_iterator, basic_search) EXPECT_EQ(it.depth(), 3); EXPECT_EQ(it.count(), 1); - auto it_cpy = it; + iterator_type it_cpy = it; // EXPECT_TRUE(it.down(""_dna4)); // TODO(h-2): should this be allowed? - // EXPECT_EQ(it, it_cpy); + EXPECT_EQ(it, it_cpy); // unsuccessful down(range), it remains untouched it_cpy = it; @@ -104,7 +122,7 @@ TEST(fm_index_iterator, basic_search) EXPECT_EQ(it, it_cpy); // successful down(char) - it = sa.root(); + it = iterator_type(sa); EXPECT_TRUE(it.down(dna4::A)); EXPECT_EQ(it.locate(), (std::vector{0, 3})); EXPECT_EQ(it.depth(), 1); @@ -119,7 +137,7 @@ TEST(fm_index_iterator, basic_search) EXPECT_EQ(it, it_cpy); // successful down() and right() - it = sa.root(); + it = iterator_type(sa); EXPECT_TRUE(it.down()); EXPECT_EQ(it.locate(), (std::vector{0, 3})); EXPECT_EQ(it.depth(), 1); @@ -138,52 +156,56 @@ TEST(fm_index_iterator, basic_search) EXPECT_EQ(it, it_cpy); // unsuccessful down(), it remains untouched - it = sa.root(); + it = iterator_type(sa); EXPECT_TRUE(it.down("GACG"_dna4)); it_cpy = it; EXPECT_FALSE(it.down()); EXPECT_EQ(it, it_cpy); // right() cannot be called on the root node - it = sa.root(); + it = iterator_type(sa); ASSERT_DEATH(it.right(), ""); // path_label() - it = sa.root(); + it = iterator_type(sa); EXPECT_TRUE(it.down("ACG"_dna4)); EXPECT_EQ(it.path_label(), "ACG"_dna4); } -TEST(fm_index_iterator, incomplete_alphabet) +TYPED_TEST(fm_index_iterator_test, incomplete_alphabet) { + using text_type = std::vector; + using index_type = fm_index>; + using iterator_type = TypeParam; + // search a char that does not occur in the text (higher rank than largest char occurring in text) { - std::vector text {"ACGACG"_dna4}; - fm_index> sa{text}; - auto it = sa.root(); + text_type text {"ACGACG"_dna4}; + index_type sa{text}; + iterator_type it = iterator_type(sa); EXPECT_FALSE(it.down(dna4::T)); - EXPECT_EQ(it, sa.root()); + EXPECT_EQ(it, iterator_type(sa)); } // search a char that does not occur in the text (smaller rank than smallest char occurring in text) { - std::vector text {"CGTCGT"_dna4}; - fm_index> sa{text}; - auto it = sa.root(); + text_type text {"CGTCGT"_dna4}; + index_type sa{text}; + iterator_type it = iterator_type(sa); EXPECT_FALSE(it.down(dna4::A)); - EXPECT_EQ(it, sa.root()); + EXPECT_EQ(it, iterator_type(sa)); } // search a char that does not occur in the text (some rank that is neither the smallest nor the highest smallest occurring in text) { - std::vector text {"ATATAT"_dna4}; - fm_index> sa{text}; - auto it = sa.root(); + text_type text {"ATATAT"_dna4}; + index_type sa{text}; + iterator_type it = iterator_type(sa); EXPECT_FALSE(it.down(dna4::C)); EXPECT_FALSE(it.down(dna4::G)); EXPECT_FALSE(it.down("ACGT"_dna4)); EXPECT_FALSE(it.down("G"_dna4)); - EXPECT_EQ(it, sa.root()); + EXPECT_EQ(it, iterator_type(sa)); EXPECT_TRUE(it.down(dna4::A)); EXPECT_TRUE(it.right()); @@ -211,3 +233,55 @@ TEST(fm_index_iterator, lazy_locate) // EXPECT_EQ(it.locate(), it.lazy_locate()); // EXPECT_TRUE(it.locate() == it.lazy_locate()); } + +// // "AACGTACT" +// // std::vector> dfs_result {}; +// /* +// do +// { +// std::cout << representative(it) << std::endl; +// if (!goDown(it) && !goRight(it)) +// while (goUp(it) && !goRight(it)) +// ; +// } +// while (!isRoot(it)); +// */ + +// TEST(fm_index_iterator, history_test) +// { +// std::vector text {"ACGTACCT"_dna4}; +// fm_index> sa{text}; +// +// // root node +// auto it = sa.root_history(); +// EXPECT_FALSE(it.up()); +// EXPECT_EQ(it, sa.root_history()); +// +// // down(char) starting at the root node +// EXPECT_TRUE(it.down(dna4::A)); // "A" +// EXPECT_TRUE(it.up()); // "" +// EXPECT_EQ(it, sa.root_history()); +// +// // down(char) not starting at the root node +// EXPECT_TRUE(it.down(dna4::A)); // "A" +// auto it_cpy = it; +// EXPECT_TRUE(it.down(dna4::C)); // "AC" +// EXPECT_TRUE(it.up()); // "A" +// EXPECT_EQ(it, it_cpy); +// +// // right() +// EXPECT_TRUE(it.down(dna4::C)); // "AC" +// it_cpy = it; +// EXPECT_TRUE(it.down(dna4::C)); // "ACC" +// EXPECT_TRUE(it.right()); // "ACG" +// EXPECT_TRUE(it.up()); // "AC" +// EXPECT_EQ(it, it_cpy); +// +// // down(range) +// it = sa.root_history(); +// EXPECT_TRUE(it.down("AC"_dna4)); +// it_cpy = it; +// EXPECT_TRUE(it.down("GTA"_dna4)); +// EXPECT_TRUE(it.up()); +// EXPECT_EQ(it, it_cpy); +// }