Skip to content

Commit

Permalink
fixed a lot of stuff and tests (reference of index in iterator conver…
Browse files Browse the repository at this point in the history
…ted to a ptr, moved depth out of node for history iterator)
  • Loading branch information
cpockrandt committed Aug 5, 2018
1 parent 82b6789 commit 8b33a21
Show file tree
Hide file tree
Showing 6 changed files with 227 additions and 121 deletions.
3 changes: 1 addition & 2 deletions include/seqan3/index/detail/fm_index_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,11 @@ struct fm_index_iterator_node

size_type lb;
size_type rb;
size_type depth;
comp_char_type last_char;

bool operator==(fm_index_iterator_node const & rhs) const
{
return lb == rhs.lb && rb == rhs.rb && depth == rhs.depth && last_char == rhs.last_char;
return lb == rhs.lb && rb == rhs.rb && last_char == rhs.last_char;
}

bool operator!=(fm_index_iterator_node const & rhs) const
Expand Down
18 changes: 6 additions & 12 deletions include/seqan3/index/fm_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,6 @@
namespace seqan3
{

// namespace detail {
// template <typename index_t> // TODO: fm_index_concept instead of typename
// class fm_index_iterator_node;
// }
//
// template <typename index_t> // TODO: fm_index_concept instead of typename
// class fm_index_iterator;

struct fm_index_default_traits
{
using sdsl_index_type = sdsl::csa_wt<
Expand All @@ -71,7 +63,9 @@ struct fm_index_default_traits
>;
};

// TODO: what are the possible input types of text? only our own alphabet types? i.e. T<dna>, T<aa27> where T could be any input_range?
// TODO(h-2): what is missing? noexcept, etc.

// TODO(h-2): what are the possible input types of text? only our own alphabet types? i.e. T<dna>, T<aa27> where T could be any input_range?

// TODO: check whether input_range_concept is the correct one! depends on open decisions in sdsl (im-construction, writing in-memory data to tmpfs and on the construction algorithms)
template <input_range_concept text_t/*, uint8_t dimensions*/, fm_index_traits_concept fm_index_traits = fm_index_default_traits>
Expand Down Expand Up @@ -102,13 +96,13 @@ class fm_index
fm_index(fm_index &&) = default;
fm_index & operator=(fm_index &&) = default;

// requires common_reference_concept<detail::innermost_value_type_t<container_t>, char_t> && detail::dimension_v<container_T> == dimensions
// TODO(h-2): requires common_reference_concept<detail::innermost_value_type_t<container_t>, char_t> && detail::dimension_v<container_T> == dimensions
fm_index(text_t const & text)
{
construct(text);
}

// can you allow for rvalues and make the index store them?
// TODO(h-2): should we remove this function to avoid passing rvalue references?
void construct(text_t const & text)
{
this->text = &text;
Expand Down Expand Up @@ -156,7 +150,7 @@ class fm_index

};

// TODO: where to put this code? concept.hpp doesn't seem to work :/
// TODO(h-2): where to put this code? concept.hpp doesn't seem to work :/
#ifndef NDEBUG

// static_assert(fm_index_concept<fm_index<std::vector<dna4>>>);
Expand Down
88 changes: 59 additions & 29 deletions include/seqan3/index/fm_index_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,14 @@ namespace seqan3
// template <typename index_t>
// class fm_index_iterator;

// TODO: remove mapping by overwriting backward_search. one only has to deal with incomplete alphabets then (maybe add own alphabet type to sdsl?)

// TODO: to_rank() + 1 consistent with comp_char, mapping and implicit_sentinel?

// NOTE: for convenience the fm_index behaves like a suffix tree, not a prefix tree
// NOTE: bidirectional_fm_index_concept must fulfill fm_index_concept (subset) such that
// a bidirectional index can be plugged into a unidirectional iterator
template <typename index_t> // TODO: fm_index_concept instead of typename
template <typename index_t> // TODO(h-2): fm_index_concept instead of typename
class fm_index_iterator
{

Expand All @@ -71,13 +73,14 @@ class fm_index_iterator
protected:
using node_type = detail::fm_index_iterator_node<index_t>;

index_type const & index;
const index_type * index; // TODO(h-2): references don't work if we want an assignment operator. Maybe use weak_ptr? index_type const & index;
size_type parent_lb, parent_rb;
size_type _depth; // TODO(h-2): naming? Conflict because of depth() member function
node_type node;

size_type offset() const
{
return index.m_index.size() - depth() - 1; // since the string is reversed during construction
return index->m_index.size() - depth() - 1; // since the string is reversed during construction
}

public:
Expand All @@ -88,13 +91,14 @@ class fm_index_iterator
fm_index_iterator(fm_index_iterator &&) = default;
fm_index_iterator & operator=(fm_index_iterator &&) = default;

fm_index_iterator(index_t const & _index) : index(_index), node({0, _index.m_index.size() - 1, 0, 0})
fm_index_iterator(index_t const & _index) : index(&_index), _depth(0), node({0, _index.m_index.size() - 1, 0})
{}

// TODO: cannot compare indices yet (not supported by sdsl)
bool operator==(fm_index_iterator const & rhs) const
{
return node == rhs.node && parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb;
// parent_lb/parent_rb might be uninitialized in a root node
return node == rhs.node && depth() == rhs.depth() && ((parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb) || depth() == 0);
}

bool operator!=(fm_index_iterator const & rhs) const
Expand All @@ -108,12 +112,13 @@ class fm_index_iterator
{
typename index_type::comp_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentinel
typename index_type::size_type _lb, _rb;
for (; c < index.m_index.sigma && !sdsl::backward_search(index.m_index, node.lb, node.rb, index.m_index.comp2char[c], _lb, _rb); ++c) {}
if (c != index.m_index.sigma)
for (; c < index->m_index.sigma && !sdsl::backward_search(index->m_index, node.lb, node.rb, index->m_index.comp2char[c], _lb, _rb); ++c) {}
if (c != index->m_index.sigma)
{
++_depth;
parent_lb = node.lb;
parent_rb = node.rb;
node = {_lb, _rb, node.depth + 1, c};
node = {_lb, _rb, c};
return true;
}
return false;
Expand All @@ -122,11 +127,20 @@ class fm_index_iterator
bool down(typename index_type::char_type const & c)
{
typename index_type::size_type _lb, _rb;
if (sdsl::backward_search(index.m_index, node.lb, node.lb, index.m_index.comp2char[c.to_rank() + 1], _lb, _rb))

auto c_char = c.to_rank() + 1;
auto c_comp = index->m_index.char2comp[c_char];

// character does not occur in text / index
if (!c_comp) // TODO: [[unlikely]]
return false;

if (sdsl::backward_search(index->m_index, node.lb, node.rb, c_char, _lb, _rb))
{
++_depth;
parent_lb = node.lb;
parent_rb = node.rb;
node = {_lb, _rb, node.depth + 1, c.to_rank() + 1};
node = {_lb, _rb, c_comp};
return true;
}
return false;
Expand All @@ -139,64 +153,80 @@ class fm_index_iterator
// requires (innermost_value_type_t<pattern_t> == typename index_t::char_type)
bool down(pattern_t && pattern)
{
// TODO(h-2): empty patterns will lead to a segmentation fault.
// checking for this would lead to another branch (otherwise c_comp would be uninitialized and overwrite it).
assert(pattern.size() > 0);

typename index_type::size_type _lb = node.lb, _rb = node.rb;
typename index_type::size_type _parent_lb = node.lb, _parent_rb = node.rb;

auto first = pattern.cbegin();
auto last = pattern.cend();

typename index_type::comp_char_type c_char;
typename index_type::comp_char_type c_comp;

for (auto it = first; it != last; ++it)
{
c_char = (*it).to_rank() + 1;
c_comp = index->m_index.char2comp[c_char];

// character does not occur in text / index
if (!c_comp) // TODO: [[unlikely]]
return false;

_parent_lb = _lb;
_parent_rb = _rb;
if (!sdsl::backward_search(index.m_index, _parent_lb, _parent_rb, (*it).to_rank() + 1, _lb, _rb))
if (!sdsl::backward_search(index->m_index, _parent_lb, _parent_rb, c_char, _lb, _rb))
return false;
}
node = {_lb, _rb, node.depth + (last - first), (*(last - 1)).to_rank() + 1};

_depth += last - first;
parent_lb = _parent_lb;
parent_rb = _parent_rb;
node = {_lb, _rb, c_comp};
return true;
}

bool right()
{
assert(node.depth > 0);
assert(depth() > 0);
typename index_type::comp_char_type c = node.last_char + 1;
typename index_type::size_type _lb, _rb;
while (c < index.m_index.sigma && !sdsl::backward_search(index.m_index, parent_lb, parent_rb, index.m_index.comp2char[c], _lb, _rb))
{

while (c < index->m_index.sigma && !sdsl::backward_search(index->m_index, parent_lb, parent_rb, index->m_index.comp2char[c], _lb, _rb))
++c;
}
if (c != index.m_index.sigma)

if (c != index->m_index.sigma)
{
parent_lb = node.lb;
parent_rb = node.rb;
node = {_lb, _rb, node.depth, c};
// parent_lb = node.lb;
// parent_rb = node.rb;
node = {_lb, _rb, c};
return true;
}
return false;
}

size_type depth() const
{
return node.depth;
assert(_depth != 0 || (node.lb == 0 && node.rb == index->size() - 1));
return _depth;
}

// TODO: what is the most suitable return type? outermost container of text_type with char_type in it?
auto path_label() const
{
assert(index.text != nullptr);
assert(index->text != nullptr);

using char_type = typename index_type::char_type;

if (node.depth == 0) // TODO: [[unlikely]]
if (depth() == 0) // TODO: [[unlikely]]
return std::vector<char_type>{};

const typename index_t::size_type pattern_begin = offset() - index.m_index[node.lb];
return std::vector<char_type>(index.text->cbegin() + pattern_begin,
index.text->cbegin() + pattern_begin + node.depth);
// return sdsl::extract(index.m_index, pattern_begin, pattern_begin + node.depth - 1);
const typename index_t::size_type pattern_begin = offset() - index->m_index[node.lb];
return std::vector<char_type>(index->text->cbegin() + pattern_begin,
index->text->cbegin() + pattern_begin + depth());
// return sdsl::extract(index.m_index, pattern_begin, pattern_begin + depth - 1);
}

size_type count() const
Expand All @@ -209,7 +239,7 @@ class fm_index_iterator
{
std::vector<size_type> occ(count());
for (typename index_t::size_type i = 0; i < occ.size(); ++i) {
occ[i] = offset() - index.m_index[node.lb + i];
occ[i] = offset() - index->m_index[node.lb + i];
}
return occ;
}
Expand All @@ -218,7 +248,7 @@ class fm_index_iterator
{
const size_type _offset = offset();
return ranges::view::iota(node.lb, node.lb + count())
| ranges::view::transform([*this, _offset] (auto sa_pos) { return _offset - index.m_index[sa_pos]; });
| ranges::view::transform([*this, _offset] (auto sa_pos) { return _offset - index->m_index[sa_pos]; });
}

};
Expand Down
15 changes: 15 additions & 0 deletions include/seqan3/index/some_stuff.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,18 @@

// template <typename index_t>
// class bidirectional_suffix_array_history_iterator;





// std::cout << "rank: " << (unsigned)c.to_rank() << '\n';
// std::cout << "sigma: " << index->m_index.sigma << '\n';
// std::cout << "comp2char: ";
// for (unsigned i = 0; i < index->m_index.sigma; ++i)
// std::cout << (unsigned)index->m_index.comp2char[i] << ' ';
// std::cout << '\n';
// std::cout << "char2comp: ";
// for (unsigned i = 0; i < 255; ++i)
// std::cout << (unsigned)index->m_index.char2comp[i] << ' ';
// std::cout << '\n';
Loading

0 comments on commit 8b33a21

Please sign in to comment.