Skip to content

Commit

Permalink
Dismiss low entropy patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Oct 11, 2024
1 parent a33d321 commit 80317ef
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 8 deletions.
2 changes: 2 additions & 0 deletions include/raptor/threshold/threshold.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class threshold

size_t get(size_t const minimiser_count) const noexcept;

size_t mean_number_of_minimizers() const noexcept;

private:
enum class threshold_kinds
{
Expand Down
25 changes: 17 additions & 8 deletions include/valik/search/local_prefilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ struct pattern_bounds
* @return pattern_bounds The interval of minimisers corresponding to the pattern.
*/
template <typename span_vec_t>
pattern_bounds make_pattern_bounds(size_t const & begin,
search_arguments const & arguments,
span_vec_t const & window_span_begin,
raptor::threshold::threshold const & thresholder)
std::optional<pattern_bounds> make_pattern_bounds(size_t const & begin,
search_arguments const & arguments,
span_vec_t const & window_span_begin,
raptor::threshold::threshold const & thresholder)
{
assert(window_span_begin.size() >= 1);
assert(window_span_begin[0] == 0);
Expand All @@ -91,8 +91,16 @@ pattern_bounds make_pattern_bounds(size_t const & begin,

size_t const minimiser_count = pattern.end_position - pattern.begin_position;

pattern.threshold = thresholder.get(minimiser_count);
return pattern;
if (arguments.keep_all_repeats ||
(arguments.keep_best_repeats &&
(minimiser_count >= (thresholder.mean_number_of_minimizers()))))
// ignore low entropy repeat patterns
{
pattern.threshold = thresholder.get(minimiser_count);
return pattern;
}
else
return std::nullopt;
}

/**
Expand Down Expand Up @@ -205,8 +213,9 @@ void local_prefilter(
std::unordered_map<size_t, size_t> sequence_hits{};
pattern_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin)
{
pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder);
find_pattern_bins(pattern, bin_count, counting_table, sequence_hits, pattern_hits);
auto const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder);
if (pattern)
find_pattern_bins(*pattern, bin_count, counting_table, sequence_hits, pattern_hits);
});

result_cb(record, sequence_hits, pattern_hits);
Expand Down
5 changes: 5 additions & 0 deletions src/raptor/threshold/threshold.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,10 @@ size_t threshold::get(size_t const minimiser_count) const noexcept
}
}
}

/**
 * @brief Midpoint between the smallest and largest minimiser count tracked by this thresholder.
 *
 * Computed as the arithmetic mean of `minimal_number_of_minimizers` and
 * `maximal_number_of_minimizers`, rounded to the nearest integer.
 *
 * The previous implementation returned `(max - min) / 2`, i.e. half of the spread, which is
 * not a mean: for min = 7 and max = 11 it yielded 2 instead of 9. Callers such as
 * make_pattern_bounds() compare a pattern's minimiser count against this value, so a
 * half-spread cut-off sits below every count that is at least the minimum.
 *
 * @return The rounded mean of the minimal and maximal number of minimisers.
 */
size_t threshold::mean_number_of_minimizers() const noexcept
{
    // Do the arithmetic in double: avoids unsigned wrap-around of a subtraction
    // and overflow of an integral sum before halving.
    double const lower_bound = static_cast<double>(minimal_number_of_minimizers);
    double const upper_bound = static_cast<double>(maximal_number_of_minimizers);

    return static_cast<size_t>(std::round((lower_bound + upper_bound) / 2.0));
}

} // namespace raptor::threshold

0 comments on commit 80317ef

Please sign in to comment.