From 80317ef437110ccfd0f946908ed6d841948506a9 Mon Sep 17 00:00:00 2001 From: Evelin Date: Fri, 11 Oct 2024 16:17:51 +0200 Subject: [PATCH] Dismiss low entropy patterns --- include/raptor/threshold/threshold.hpp | 2 ++ include/valik/search/local_prefilter.hpp | 25 ++++++++++++++++-------- src/raptor/threshold/threshold.cpp | 5 +++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/raptor/threshold/threshold.hpp b/include/raptor/threshold/threshold.hpp index 2a06acd1..658ceec6 100644 --- a/include/raptor/threshold/threshold.hpp +++ b/include/raptor/threshold/threshold.hpp @@ -32,6 +32,8 @@ class threshold size_t get(size_t const minimiser_count) const noexcept; + size_t mean_number_of_minimizers() const noexcept; + private: enum class threshold_kinds { diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index cf1e9889..fefbf8a0 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -67,10 +67,10 @@ struct pattern_bounds * @return pattern_bounds The interval of minimisers corresponding to the pattern. */ template -pattern_bounds make_pattern_bounds(size_t const & begin, - search_arguments const & arguments, - span_vec_t const & window_span_begin, - raptor::threshold::threshold const & thresholder) +std::optional make_pattern_bounds(size_t const & begin, + search_arguments const & arguments, + span_vec_t const & window_span_begin, + raptor::threshold::threshold const & thresholder) { assert(window_span_begin.size() >= 1); assert(window_span_begin[0] == 0); @@ -91,8 +91,16 @@ pattern_bounds make_pattern_bounds(size_t const & begin, size_t const minimiser_count = pattern.end_position - pattern.begin_position; - pattern.threshold = thresholder.get(minimiser_count); - return pattern; + if (arguments.keep_all_repeats || + (arguments.keep_best_repeats && + (minimiser_count >= (thresholder.mean_number_of_minimizers())))) + // ignore low entropy repeat patterns + { + pattern.threshold = thresholder.get(minimiser_count); + return pattern; + } + else + return std::nullopt; } /** @@ -205,8 +213,9 @@ void local_prefilter( std::unordered_map sequence_hits{}; pattern_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin) { - pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder); - find_pattern_bins(pattern, bin_count, counting_table, sequence_hits, pattern_hits); + auto const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder); + if (pattern) + find_pattern_bins(*pattern, bin_count, counting_table, sequence_hits, pattern_hits); }); result_cb(record, sequence_hits, pattern_hits); diff --git a/src/raptor/threshold/threshold.cpp b/src/raptor/threshold/threshold.cpp index 1d57418c..5d669280 100644 --- a/src/raptor/threshold/threshold.cpp +++ b/src/raptor/threshold/threshold.cpp @@ -60,5 +60,10 @@ size_t threshold::get(size_t const minimiser_count) const noexcept } } } + +size_t threshold::mean_number_of_minimizers() const noexcept +{ + return (size_t) std::round((maximal_number_of_minimizers - minimal_number_of_minimizers) / 2.0); +} } // namespace raptor::threshold