Skip to content

Commit

Permalink
depth into node again, history iterator tests, some comments
Browse files Browse the repository at this point in the history
  • Loading branch information
cpockrandt committed Aug 6, 2018
1 parent 01f5ec4 commit 1d28372
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 444 deletions.
59 changes: 28 additions & 31 deletions include/seqan3/index/concept.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,15 @@

#pragma once

// #include <seqan3/index/fm_index.hpp>
#include <seqan3/alphabet/nucleotide/dna4.hpp>
#include <seqan3/range/concept.hpp>
#include <seqan3/core/metafunction/range.hpp>
#include <seqan3/range/concept.hpp>

#include <sdsl/suffix_arrays.hpp>

namespace seqan3
{

// ==================================================================
// fm_index_traits
// ==================================================================

// TODO: add history iterator concept, extend fm_index concept

template <typename t>
concept bool fm_index_traits_concept = requires (t v,
typename t::sdsl_index_type::size_type lb,
Expand All @@ -77,24 +70,30 @@ concept bool fm_index_concept = requires (t v)
typename t::text_type;
typename t::char_type;
typename t::size_type;
typename t::iterator_type; // TODO: requires iterator_concept?
typename t::history_iterator_type; // TODO: requires history_iterator_type?
typename t::iterator_type;

// NOTE: circular dependency
// requires fm_index_iterator_concept<typename t::iterator_type>;

// (uint8_t)t::dimensions; // number of nested containers
// (bool)t::is_bidirectional;

// TODO: constructors
requires requires (t index) { { t() } };
requires requires (t index, t const & index2) { { t(index2) } };
requires requires (t index, t const & index2) { { index = index2 } };
requires requires (t index, std::vector<dna4> const & text) { { t(t(text)) } };
requires requires (t index, std::vector<dna4> const & text) { { index = t(text) } };

requires requires (t index, const std::vector<dna4> & text) { { t(text) } };

requires requires (t index, std::vector<dna4> text) { { t(text) } };
requires requires (t index, std::vector<dna4> text) { { index.construct(text) } -> void; };
requires requires (t index, const std::vector<dna4> & text) { { index.construct(text) } -> void; };

{ v.root() } -> typename t::iterator_type;
{ v.root_history() } -> typename t::history_iterator_type;

{ v.size() } -> typename t::size_type;
{ v.size() } -> typename t::size_type;
{ v.empty() } -> bool;

{ v.load(std::string{}) } -> bool;
{ v.load(std::string{}) } -> bool;
{ v.store(std::string{}) } -> bool;
};

Expand All @@ -104,27 +103,25 @@ concept bool fm_index_iterator_concept = requires (t it)
typename t::index_type;
typename t::size_type;

// TODO: constructors
// requires requires (t it, typename t::index_type const & index) { it(index) };
requires fm_index_concept<typename t::index_type>;

requires requires (t it, t const & it2) { { t(it2) } };
requires requires (t it, t const & it2) { { it = it2 } };
requires requires (t it, typename t::index_type const & index) { { t(t(index)) } };
requires requires (t it, typename t::index_type const & index) { { it = t(index) } };

requires requires (typename t::index_type const & index) { { t(index) } };

// { it.down() } -> t &; TODO: discuss return type of suffix tree iterator
{ it.down() } -> bool;
{ it.down(typename t::index_type::char_type{}) } -> bool;
{ it.down(std::vector<typename t::index_type::char_type>{}) } -> bool;
{ it.right() } -> bool;
{ it.depth() } -> typename t::size_type;
// { it.path_label() } -> TODO;
{ it.count() } -> typename t::size_type;
// { it.locate() } -> TODO;
// { it.lazy_locate() } -> TODO;
};

/*!\brief Concept for FM index iterators that additionally offer up().
 *
 * A type `t` models this concept if it models fm_index_iterator_concept and the member calls listed below
 * are valid on an object `it` of type `t` with the stated return-type constraints.
 *
 * NOTE(review): depth() and count() are also listed in fm_index_iterator_concept as shown on this page;
 * confirm whether repeating them here is intended.
 */
template <typename t>
concept bool fm_index_history_iterator_concept = requires (t it)
{
// Everything required of a plain FM index iterator is required here as well.
requires fm_index_iterator_concept<t>;

// up() reports its outcome as bool (presumably whether the step could be undone - confirm).
{ it.up() } -> bool;
// depth() and count() must yield the iterator's size_type.
{ it.depth() } -> typename t::size_type;
// The return type of path_label() is left open (`auto`).
{ it.path_label() } -> auto;
{ it.count() } -> typename t::size_type;
// locate() must yield a std::vector of size_type values.
{ it.locate() } -> std::vector<typename t::size_type>;
// The return type of lazy_locate() is left open (`auto`).
{ it.lazy_locate() } -> auto;
};

} // namespace seqan3
6 changes: 5 additions & 1 deletion include/seqan3/index/detail/fm_index_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,15 @@ struct fm_index_iterator_node

size_type lb;
size_type rb;
size_type depth;
comp_char_type last_char;

// Equality comparison of two iterator nodes.
// NOTE(review): as shown here the body contains two return statements. The first one (lb, rb and
// last_char) decides the result; the second one (lb, rb and depth) can never be reached. This looks like
// the removed and the added line of a diff rendered together - confirm which one is intended.
bool operator==(fm_index_iterator_node const & rhs) const
{
return lb == rhs.lb && rb == rhs.rb && last_char == rhs.last_char;
// NOTE: last_char is implementation specific for right().
// lb, rb and depth already determine the node in the suffix tree.
// Thus there is no need to compare last_char.
return lb == rhs.lb && rb == rhs.rb && depth == rhs.depth;
}

bool operator!=(fm_index_iterator_node const & rhs) const
Expand Down
82 changes: 57 additions & 25 deletions include/seqan3/index/fm_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@

// #include <seqan3/index/concept.hpp>
#include <seqan3/index/fm_index_iterator.hpp>
#include <seqan3/index/fm_index_history_iterator.hpp>
#include <seqan3/core/metafunction/range.hpp>

#include <sdsl/suffix_trees.hpp>
Expand Down Expand Up @@ -64,12 +63,61 @@ struct fm_index_default_traits
>;
};

// TODO(h-2): what is missing? noexcept, etc.

// TODO(h-2): what are the possible input types of text? only our own alphabet types? i.e. T<dna>, T<aa27> where T could be any input_range?
// TODO: noexcept is still missing

// TODO: check whether input_range_concept is the correct one! depends on open decisions in sdsl (im-construction, writing in-memory data to tmpfs and on the construction algorithms)
template <input_range_concept text_t/*, uint8_t dimensions*/, fm_index_traits_concept fm_index_traits = fm_index_default_traits>

/*!\brief The SeqAn FM Index.
* \ingroup fm_index
*
* \details
*
* The seqan3::fm_index is a fast and space-efficient string index for searching strings and collections of strings.
*
* ### General information
*
* Here is a short example of how to build an index and search for a pattern using an iterator. Please note that there
* is a very powerful search module with a high-level interface seqan3::TODO that encapsulates the use of iterators.
*
* ```cpp
* #include <seqan3/index/all.hpp>
* #include <vector>
*
* using namespace seqan3;
* using namespace seqan3::literal;
*
* int main(int argc, char ** argv)
* {
* std::vector<dna4> genome {"ATCGATCGAAGGCTAGCTAGCTAAGGGA"_dna4};
* fm_index<std::vector<dna4>> index{genome}; // build the index
*
* auto it = index.root(); // create an iterator pointing to the root of a virtual suffix tree
* it.down("AAGG"_dna4); // search
* std::cout << "Number of hits: " << it.count() << '\n'; // outputs: TODO
* std::cout << "Positions in the genome: " << it.locate() << '\n'; // outputs: TODO
*
* return 0;
* }
* ```
*
* Even though the FM index is originally a prefix tree and one performs backward searches, for the convenience
* of the user it is implemented as a suffix tree. There is no need to reverse the text to be indexed or the patterns
* to be searched, or to recompute positions.
*
* Here is an example using a collection of strings (e.g. a genome with multiple chromosomes or a protein database):
*
* TODO
*
* There is also a history iterator, i.e. an iterator that stores its previous states on a stack such that going down
* an edge can be undone. Please take a look at the documentation of seqan3::fm_index_history_iterator::up() since it
* does not undo all operations.
*
* ### Choosing an index implementation
*
* The underlying implementation of the FM Index (Rank data structure, sampling rates, etc.) can be specified ...
*/
template <input_range_concept text_t, fm_index_traits_concept fm_index_traits = fm_index_default_traits>
requires alphabet_concept<innermost_value_type_t<text_t>>
class fm_index
{
protected:
Expand All @@ -85,9 +133,7 @@ class fm_index
using size_type = typename sdsl_index_type::size_type;

using iterator_type = fm_index_iterator<fm_index<text_t, fm_index_traits>>;
using history_iterator_type = fm_index_history_iterator<fm_index<text_t, fm_index_traits>>;
friend class fm_index_iterator<fm_index<text_t, fm_index_traits>>;
friend class fm_index_history_iterator<fm_index<text_t, fm_index_traits>>;
friend class detail::fm_index_iterator_node<fm_index<text_t, fm_index_traits>>;

// static const bool is_bidirectional = false;
Expand All @@ -99,13 +145,12 @@ class fm_index
fm_index(fm_index &&) = default;
fm_index & operator=(fm_index &&) = default;

// TODO(h-2): requires common_reference_concept<detail::innermost_value_type_t<container_t>, char_t> && detail::dimension_v<container_T> == dimensions
// TODO: requires common_reference_concept<detail::innermost_value_type_t<container_t>, char_t> && detail::dimension_v<container_T> == dimensions
// Builds the index over `text` by forwarding to construct(text).
// NOTE(review): construct() keeps the address of `text` (this->text = &text), so presumably `text` has to
// stay alive for as long as the index is used - confirm.
fm_index(text_t const & text)
{
construct(text);
}

// TODO(h-2): should we remove this function to avoid passing rvalue references?
void construct(text_t const & text)
{
this->text = &text;
Expand All @@ -115,10 +160,12 @@ class fm_index
// TODO: sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
sdsl::int_vector<8> tmp_text(text.size());
for (auto it = text.cbegin(); it != text.cend(); it++)
tmp_text[text.cend() - it - 1] = (*it).to_rank() + 1; // reverse and increase rank by one
tmp_text[text.cend() - it - 1] = to_rank(*it) + 1; // reverse and increase rank by one
sdsl::construct_im(m_index, tmp_text, 0);
}

// Deleted overload: passing an rvalue (e.g. a temporary text) to construct() is a compile-time error.
void construct(text_t &&) = delete;

size_type size() const
{
return m_index.size();
Expand All @@ -140,12 +187,6 @@ class fm_index
return iterator_type(*this);
}

// TODO(h-2): naming?
// Returns a history_iterator_type constructed from this index.
// NOTE(review): presumably the iterator starts at the root of the virtual suffix tree, as the name
// suggests - confirm against the iterator's constructor.
history_iterator_type root_history() const
{
return history_iterator_type(*this);
}

// TODO: replace with cereal once sdsl supports it
bool load(std::string const & path)
{
Expand All @@ -159,13 +200,4 @@ class fm_index

};

// TODO(h-2): where to put this code? concept.hpp doesn't seem to work :/
#ifndef NDEBUG

// static_assert(fm_index_concept<fm_index<std::vector<dna4>>>);
// static_assert(fm_index_traits_concept<fm_index_default_traits>);
// static_assert(fm_index_iterator_concept<fm_index_iterator<fm_index<std::vector<dna4>>>>);

#endif

} // namespace seqan3
Loading

0 comments on commit 1d28372

Please sign in to comment.