Skip to content

Commit

Permalink
Keep repeats supported by the most patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Oct 3, 2024
1 parent 2c8d0a0 commit 90d411c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 13 deletions.
16 changes: 10 additions & 6 deletions include/valik/search/local_prefilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ template <typename binning_bitvector_t>
void find_pattern_bins(pattern_bounds const & pattern,
size_t const & bin_count,
binning_bitvector_t const & counting_table,
std::unordered_set<size_t> & sequence_hits)
std::unordered_map<size_t, size_t> & sequence_hits,
uint64_t & pattern_hits)
{
// counting vector for the current pattern
seqan3::counting_vector<uint8_t> total_counts(bin_count, 0);
Expand All @@ -119,8 +120,9 @@ void find_pattern_bins(pattern_bounds const & pattern,
auto &&count = total_counts[current_bin];
if (count >= pattern.threshold)
{
// the result_set is a union of results from all patterns of a read
sequence_hits.insert(current_bin);
// tally, per bin, how many patterns of the read reach the threshold; pattern_hits accumulates the total over all bins
sequence_hits[current_bin]++;
pattern_hits++;
}
}
}
Expand Down Expand Up @@ -198,14 +200,16 @@ void local_prefilter(

minimiser.clear();

std::unordered_set<size_t> sequence_hits{};
uint64_t pattern_hits{0};
// {bin ID, pattern hit count}
std::unordered_map<size_t, size_t> sequence_hits{};
pattern_begin_positions(seq.size(), arguments.pattern_size, arguments.query_every, [&](size_t const begin)
{
pattern_bounds const pattern = make_pattern_bounds(begin, arguments, window_span_begin, thresholder);
find_pattern_bins(pattern, bin_count, counting_table, sequence_hits);
find_pattern_bins(pattern, bin_count, counting_table, sequence_hits, pattern_hits);
});

result_cb(record, sequence_hits);
result_cb(record, sequence_hits, pattern_hits);
}
}

Expand Down
21 changes: 14 additions & 7 deletions include/valik/search/producer_threads_parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,27 @@ inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter<ibf_data

std::span<query_t const> records_slice{&records[start], &records[end]};

auto prefilter_cb = [&queue,&arguments,&verbose_out,&ibf](query_t const& record, std::unordered_set<size_t> const& bin_hits)
auto prefilter_cb = [&queue,&arguments,&verbose_out,&ibf](query_t const & record,
std::unordered_map<size_t, size_t> const & bin_hits,
uint64_t const & total_pattern_hits)
{
if (bin_hits.size() > std::max((size_t) 4, (size_t) std::round(ibf.bin_count() / 2.0)))
{
if (!arguments.keep_repeats)
if (arguments.verbose)
verbose_out.write_warning(record, bin_hits.size());
if (arguments.keep_repeats) // keep bin hits that are supported by the most patterns per query segment
{
verbose_out.write_disabled_record(record, bin_hits.size(), arguments.verbose);
return;
size_t mean_bin_support = std::max((size_t) 2, (size_t) std::round((double) total_pattern_hits / (double) bin_hits.size()));
for (auto const [bin, count] : bin_hits)
{
if (count > mean_bin_support)
queue.insert(bin, record);
}
}
else if (arguments.verbose)
verbose_out.write_warning(record, bin_hits.size());
return;
}

for (size_t const bin : bin_hits)
for (auto const [bin, count] : bin_hits)
{
queue.insert(bin, record);
}
Expand Down

0 comments on commit 90d411c

Please sign in to comment.