From 955a465e6f60a75b302c36359469bf182c60dd1e Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Fri, 28 Jun 2024 16:35:32 +0200 Subject: [PATCH] Try stellar matcher --- .../matcher/stellar_matcher.hpp | 163 +++++++++++++++++- 1 file changed, 155 insertions(+), 8 deletions(-) diff --git a/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp b/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp index 82406ee9..df8c52c4 100644 --- a/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp +++ b/include/utilities/alphabet_wrapper/matcher/stellar_matcher.hpp @@ -6,28 +6,42 @@ // ----------------------------------------------------------------------------------------------------- /*!\file - * \brief Provides an adapter to make the horspool online pattern matching algorithm work with the JST. + * \brief Provides an adapter to make the SWIFT algorithm work with std views. * \author Rene Rahn */ #pragma once +#include + +#include + #include namespace jst::contrib { template - class stellar_matcher: public seqan_pattern_base> + class stellar_matcher : public seqan_pattern_base> { private: - friend seqan_pattern_base>; + using base_t = seqan_pattern_base>; + + friend base_t; using compatible_needle_type = jst::contrib::seqan_container_t; - using pattern_type = seqan2::Pattern>; + using multi_needle_type = seqan2::StringSet; + using qgram_shape_type = seqan2::Shape, seqan2::SimpleShape>; + using finder_spec_type = seqan2::Swift; + using index_type = seqan2::Index>; + using pattern_type = seqan2::Pattern; - pattern_type _pattern{}; + multi_needle_type _multi_needle{}; + index_type _needle_index{_multi_needle}; + pattern_type _pattern{_needle_index}; + double _error_rate{}; + unsigned _min_length{}; public: @@ -35,12 +49,145 @@ namespace jst::contrib template requires (!std::same_as<_needle_t, stellar_matcher> && std::constructible_from) - explicit stellar_matcher(_needle_t && needle) : - _pattern{jst::contrib::make_seqan_container(std::views::all((_needle_t &&) needle))} - {} + explicit stellar_matcher(_needle_t && needle, double error_rate = 0.0, unsigned min_length = 100) : + _error_rate{error_rate}, _min_length{min_length} + { + appendValue(getFibre(_needle_index, seqan2::QGramText{}), + jst::contrib::make_seqan_container(std::views::all((_needle_t &&) needle))); + _patternInit(_pattern, _error_rate, _min_length); + } + + template + requires (!std::same_as<_multi_needle_t, stellar_matcher> && + std::constructible_from>) + explicit stellar_matcher(_multi_needle_t && multi_needle, double error_rate = 0.0, unsigned min_length = 100) : + _error_rate{error_rate}, _min_length{min_length} + { + for (auto && needle : multi_needle) + appendValue(getFibre(_needle_index, seqan2::QGramText{}), + jst::contrib::make_seqan_container(std::views::all((decltype(needle) &&) needle))); + + _patternInit(_pattern, _error_rate, _min_length); + } + + constexpr auto position() const noexcept { + return seqan2::position(_pattern); + } + private: + + template + constexpr auto make_finder(haystack_t & haystack) const noexcept + { + // TODO: get localOptions to configure repeat length and period. + return seqan2::Finder{haystack}; + } + + constexpr stellar_matcher & get_pattern() noexcept { + return *this; + } + + constexpr auto custom_find_arguments() const noexcept { + return std::tuple{_error_rate, _min_length}; + } + + constexpr friend std::size_t tag_invoke(std::tag_t, stellar_matcher const & me) noexcept { + return length(getFibre(needle(me._pattern), seqan2::QGramShape{})); + } + + ///////////////////////////////////////////////////////////////////////////////// + template + constexpr bool initialise(seqan2::Finder & finder, + pattern_type & pattern) + { + pattern.finderLength = std::ranges::size(haystack(finder)); + seqan2::_patternInit(pattern, _error_rate, _min_length); + seqan2::_finderSetNonEmpty(finder); + finder.dotPos = 100000; + finder.dotPos2 = 10 * finder.dotPos; + + if (!seqan2::_firstNonRepeatRange(finder, pattern)) + return false; + + if (seqan2::_swiftMultiProcessQGram(finder, pattern, seqan2::hash(pattern.shape, hostIterator(hostIterator(finder))))) + seqan2::_copySwiftHit(finder, pattern); + + return true; + } + + template + friend bool find(seqan2::Finder & finder, + stellar_matcher & matcher, + args_t && ...args) + { + pattern_type & pattern = matcher._pattern; + if (empty(finder)) { + if (!matcher.initialise(finder, pattern)) { + return false; + //!TODO: can an empty finder have hits? + } else if (finder.curHit != finder.endHit) { + return true; + } + } + + // all previous matches reported -> search new ones + clear(finder.hits); + + //!TODO: replace with call to seqan2::find() ? + // are we at the end of the text? + if (seqan2::atEnd(finder) && finder.curRepeat == finder.endRepeat) + { + finder.curHit = finder.endHit; + return false; + } + + do + { + if (pattern.params.printDots) seqan2::_printDots(finder); + if (seqan2::atEnd(++finder)) + { + if (!seqan2::_nextNonRepeatRange(finder, pattern)) + { + if(seqan2::_swiftMultiFlushBuckets(finder, pattern)) + { + seqan2::_copySwiftHit(finder, pattern); + return true; + } + else + return false; + } + seqan2::hash(pattern.shape, seqan2::hostIterator(seqan2::hostIterator(finder))); + } + else + { + ++finder.curPos; + seqan2::hashNext(pattern.shape, seqan2::hostIterator(seqan2::hostIterator(finder))); + } + + if (seqan2::_swiftMultiProcessQGram(finder, pattern, seqan2::value(pattern.shape))) + { + seqan2::_copySwiftHit(finder, pattern); + return true; + } + + } while (true); + } }; + ///////////////////////////////////////////////////////////////////////////////// + template stellar_matcher(needle_t &&) -> stellar_matcher>; + template + stellar_matcher(needle_t &&, double) -> stellar_matcher>; + + template + requires std::ranges::random_access_range> + stellar_matcher(multi_needle_t &&) -> stellar_matcher>>; + + template + requires std::ranges::random_access_range> + stellar_matcher(multi_needle_t &&, double) -> stellar_matcher>>; + } // namespace jst::contrib