Skip to content

Commit

Permalink
started with history_iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
cpockrandt committed Aug 5, 2018
1 parent 8b33a21 commit 01f5ec4
Show file tree
Hide file tree
Showing 5 changed files with 410 additions and 39 deletions.
20 changes: 16 additions & 4 deletions include/seqan3/index/concept.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ namespace seqan3
// fm_index_traits
// ==================================================================

// TODO: add history iterator concept, extend fm_index concept

template <typename t>
concept bool fm_index_traits_concept = requires (t v,
typename t::sdsl_index_type::size_type lb,
Expand All @@ -64,7 +66,7 @@ concept bool fm_index_traits_concept = requires (t v,
(typename t::sdsl_index_type{})[0]; // suffix array access
(typename t::sdsl_index_type{}).comp2char[0];
(typename t::sdsl_index_type{}).sigma;
{ sdsl::construct_im(m_index, sdsl::int_vector<8> {}, 0) } -> void; // TODO! should allow any input_range
{ sdsl::construct_im(m_index, sdsl::int_vector<8> {}, 0) } -> void; // TODO should allow any input_range
{ sdsl::backward_search(typename t::sdsl_index_type{}, lb, rb, typename t::sdsl_index_type::char_type{},
lb, rb) } -> typename t::sdsl_index_type::size_type;
};
Expand All @@ -76,18 +78,22 @@ concept bool fm_index_concept = requires (t v)
typename t::char_type;
typename t::size_type;
typename t::iterator_type; // TODO: requires iterator_concept?
typename t::history_iterator_type; // TODO: requires history_iterator_type?

// (uint8_t)t::dimensions; // number of nested containers
// (bool)t::is_bidirectional;

// TODO: constructors

// TODO: construction_policy
requires requires (t index, std::vector<dna4> text) { { t(text) } };
requires requires (t index, std::vector<dna4> text) { { index.construct(text) } -> void; };

{ v.root() } -> typename t::iterator_type;
{ v.root_history() } -> typename t::history_iterator_type;

{ v.size() } -> typename t::size_type;
{ v.empty() } -> bool;

{ v.load(std::string{}) } -> bool;
{ v.store(std::string{}) } -> bool;
};
Expand All @@ -98,8 +104,6 @@ concept bool fm_index_iterator_concept = requires (t it)
typename t::index_type;
typename t::size_type;

//requires index_tree_concept<typename t::index_type>;

// TODO: constructors
// requires requires (t it, typename t::index_type const & index) { it(index) };

Expand All @@ -115,4 +119,12 @@ concept bool fm_index_iterator_concept = requires (t it)
// { it.lazy_locate() } -> TODO;
};

template <typename t>
concept bool fm_index_history_iterator_concept = requires (t it)
{
requires fm_index_iterator_concept<t>;

{ it.up() } -> bool;
};

} // namespace seqan3
9 changes: 9 additions & 0 deletions include/seqan3/index/fm_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

// #include <seqan3/index/concept.hpp>
#include <seqan3/index/fm_index_iterator.hpp>
#include <seqan3/index/fm_index_history_iterator.hpp>
#include <seqan3/core/metafunction/range.hpp>

#include <sdsl/suffix_trees.hpp>
Expand Down Expand Up @@ -84,7 +85,9 @@ class fm_index
using size_type = typename sdsl_index_type::size_type;

using iterator_type = fm_index_iterator<fm_index<text_t, fm_index_traits>>;
using history_iterator_type = fm_index_history_iterator<fm_index<text_t, fm_index_traits>>;
friend class fm_index_iterator<fm_index<text_t, fm_index_traits>>;
friend class fm_index_history_iterator<fm_index<text_t, fm_index_traits>>;
friend class detail::fm_index_iterator_node<fm_index<text_t, fm_index_traits>>;

// static const bool is_bidirectional = false;
Expand Down Expand Up @@ -137,6 +140,12 @@ class fm_index
return iterator_type(*this);
}

// TODO(h-2): naming?
history_iterator_type root_history() const
{
return history_iterator_type(*this);
}

// TODO: replace with cereal once sdsl supports it
bool load(std::string const & path)
{
Expand Down
277 changes: 277 additions & 0 deletions include/seqan3/index/fm_index_history_iterator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
// ============================================================================
// SeqAn - The Library for Sequence Analysis
// ============================================================================
//
// Copyright (c) 2006-2017, Knut Reinert & Freie Universitaet Berlin
// Copyright (c) 2016-2017, Knut Reinert & MPI Molekulare Genetik
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of Knut Reinert or the FU Berlin nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ============================================================================
// Author: Christopher Pockrandt <christopher.pockrandt AT fu-berlin.de>
// ============================================================================

#pragma once

#include <seqan3/index/detail/fm_index_iterator.hpp>
#include <seqan3/index/concept.hpp>
// #include <seqan3/index/fm_index.hpp>
#include <seqan3/std/concept/iterator.hpp>
#include <seqan3/range/concept.hpp>
#include <seqan3/core/metafunction/range.hpp>

#include <sdsl/suffix_trees.hpp>

#include <range/v3/view/iota.hpp>
#include <range/v3/view/transform.hpp>

namespace seqan3
{

// TODO: remove mapping by overwriting backward_search. one only has to deal with incomplete alphabets then (maybe add own alphabet type to sdsl?)

// TODO: to_rank() + 1 consistent with comp_char, mapping and implicit_sentinel?

// NOTE: for convenience the fm_index behaves like a suffix tree, not a prefix tree
// NOTE: bidirectional_fm_index_concept must fulfill fm_index_concept (subset) such that
// a bidirectional index can be plugged into a unidirectional iterator
template <typename index_t> // TODO(h-2): fm_index_concept instead of typename
class fm_index_history_iterator
{

public:
using index_type = index_t;
using size_type = typename index_type::size_type;
// using text_pos_t = typename index_t::size_type; // TODO: depends on dimensions of the text

protected:
using node_type = detail::fm_index_iterator_node<index_t>;

const index_type * index; // TODO(h-2): reference don't work if we wan't an assignment operator. Maybe use weak_ptr? index_type const & index;
size_type _depth; // TODO(h-2): naming? Conflict because of depth() member function
std::forward_list<node_type> node_stack; // TODO(h-2): better a forward_list instead of vector? but probably not cache local?

size_type offset() const
{
return index->m_index.size() - depth() - 1; // since the string is reversed during construction
}

public:

fm_index_history_iterator() = delete;
fm_index_history_iterator(fm_index_history_iterator const &) = default;
fm_index_history_iterator & operator=(fm_index_history_iterator const &) = default;
fm_index_history_iterator(fm_index_history_iterator &&) = default;
fm_index_history_iterator & operator=(fm_index_history_iterator &&) = default;

fm_index_history_iterator(index_t const & _index) : index(&_index), _depth(0), node_stack({{0, _index.m_index.size() - 1, 0}})
{}

// TODO: cannot compare indices yet (not supported by sdsl)
bool operator==(fm_index_history_iterator const & rhs) const
{
return node_stack == rhs.node_stack && depth() == rhs.depth();
}

bool operator!=(fm_index_history_iterator const & rhs) const
{
return !(*this == rhs);
}

// TODO: implement first backward search more efficiently if iterator is at root node!
// NOTE: whenever false is returned, it is guaranteed that the iterator remained untouched
bool down()
{
auto const & node = node_stack.front();

typename index_type::comp_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
typename index_type::size_type _lb, _rb;
while (c < index->m_index.sigma &&
!sdsl::backward_search(index->m_index, node.lb, node.rb, index->m_index.comp2char[c], _lb, _rb))
{
++c;
}
if (c != index->m_index.sigma)
{
++_depth;
node_stack.push_front({_lb, _rb, c});
return true;
}
return false;
}

bool down(typename index_type::char_type const & c)
{
typename index_type::size_type _lb, _rb;

auto c_char = c.to_rank() + 1;
auto c_comp = index->m_index.char2comp[c_char];

// character does not occur in text / index
if (!c_comp) // TODO: [[unlikely]]
return false;

auto const & node = node_stack.front();

if (sdsl::backward_search(index->m_index, node.lb, node.rb, c_char, _lb, _rb))
{
++_depth;
node_stack.push_front({_lb, _rb, c_comp});
return true;
}
return false;
}

// TODO: requires clause does not work with == ... syntax error?
// TODO: enforce the innermost type of char_type to be convertable to char_type
template <range_concept pattern_t>
requires std::is_same_v<innermost_value_type_t<pattern_t>, typename index_t::char_type>
// requires (innermost_value_type_t<pattern_t> == typename index_t::char_type)
bool down(pattern_t && pattern)
{
// TODO(h-2): empty patterns will lead a segmentation fault.
// checking for this would lead to another branching (otherwise c_comp would be unitialized and overwrite it).
assert(pattern.size() > 0);

auto const & node = node_stack.front();
typename index_type::size_type _lb = node.lb, _rb = node.rb;

auto first = pattern.cbegin();
auto last = pattern.cend();

typename index_type::comp_char_type c_char;
typename index_type::comp_char_type c_comp;

for (auto it = first; it != last; ++it)
{
c_char = (*it).to_rank() + 1;
c_comp = index->m_index.char2comp[c_char];

// character does not occur in text / index
if (!c_comp) // TODO: [[unlikely]]
return false;

if (!sdsl::backward_search(index->m_index, _lb, _rb, c_char, _lb, _rb))
return false;
}

_depth += last - first;
node_stack.push_front({_lb, _rb, c_comp});
return true;
}

bool right()
{
assert(depth() > 0/* && node_stack.size() > 1*/);

auto const & node = node_stack.front();

typename index_type::comp_char_type c = node.last_char + 1;
typename index_type::size_type _lb, _rb;

auto const parent_lb = (*(++node_stack.cbegin())).lb;
auto const parent_rb = (*(++node_stack.cbegin())).rb;
while (c < index->m_index.sigma && !sdsl::backward_search(index->m_index, parent_lb, parent_rb, index->m_index.comp2char[c], _lb, _rb))
++c;

if (c != index->m_index.sigma)
{
// parent_lb = node.lb;
// parent_rb = node.rb;
node_stack.front() = {_lb, _rb, c};
return true;
}
return false;
}

// TODO(h-2): what is the expected behavior after down(range)? we might have to include "depth" into "node" again
bool up()
{
// TODO(h-2): are there class variants to avoid redundant assertions?
assert(_depth != 0 || (node_stack.front().lb == 0 && node_stack.front().rb == index->size() - 1));
// assert(node_stack.size() - 1 <= _depth);

if (depth() == 0) // TODO: [[unklikely]]
return false;
--_depth;
node_stack.pop_front();
return true;
// TODO(h-2): same assertion here again
}

size_type depth() const
{
assert(_depth != 0 || (node_stack.front().lb == 0 && node_stack.front().rb == index->size() - 1));
// assert(node_stack.size() - 1 <= _depth);

return _depth;
}

// TODO: what is the most suitable return type? outermost container of text_type with char_type in it?
auto path_label() const
{
assert(index->text != nullptr);

using char_type = typename index_type::char_type;

if (depth() == 0) // TODO: [[unlikely]]
return std::vector<char_type>{};

auto const & node = node_stack.front();
const typename index_t::size_type pattern_begin = offset() - index->m_index[node.lb];
return std::vector<char_type>(index->text->cbegin() + pattern_begin,
index->text->cbegin() + pattern_begin + depth());
// return sdsl::extract(index.m_index, pattern_begin, pattern_begin + depth - 1);
}

size_type count() const
{
auto const & node = node_stack.front();
return 1 + node.rb - node.lb;
}

// TODO: best return type? std::vector?
auto locate() const
{
std::vector<size_type> occ(count());
auto const & node = node_stack.front();
for (typename index_t::size_type i = 0; i < occ.size(); ++i) {
occ[i] = offset() - index->m_index[node.lb + i];
}
return occ;
}

auto lazy_locate() const
{
const size_type _offset = offset();
auto const & node = node_stack.front();
return ranges::view::iota(node.lb, node.lb + count())
| ranges::view::transform([*this, _offset] (auto sa_pos) { return _offset - index->m_index[sa_pos]; });
}

};

} // namespace seqan3
3 changes: 1 addition & 2 deletions include/seqan3/index/fm_index_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@
namespace seqan3
{

// template <typename index_t>
// class fm_index_iterator;
// TODO(h-2): char alphabet. to_rank()?

// TODO: remove mapping by overwriting backward_search. one only has to deal with incomplete alphabets then (maybe add own alphabet type to sdsl?)

Expand Down
Loading

0 comments on commit 01f5ec4

Please sign in to comment.