From db0dce5e401776b854b4872f9e775b4b68b478ca Mon Sep 17 00:00:00 2001 From: Evelin Date: Mon, 23 Sep 2024 16:05:04 +0200 Subject: [PATCH] Disable queries in repeats --- .../search/producer_threads_parallel.hpp | 9 +++-- include/valik/search/sync_out.hpp | 36 +++++++++++-------- include/valik/shared.hpp | 2 +- src/argument_parsing/search.cpp | 6 ++-- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/include/valik/search/producer_threads_parallel.hpp b/include/valik/search/producer_threads_parallel.hpp index ccfea993..7139284b 100644 --- a/include/valik/search/producer_threads_parallel.hpp +++ b/include/valik/search/producer_threads_parallel.hpp @@ -47,12 +47,15 @@ inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter std::max((size_t) 4, (size_t) std::round(ibf.bin_count() / 2.0))) { - if (arguments.very_verbose) - verbose_out.write_record(record, bin_hits.size()); + if (!arguments.keep_repeats) + { + verbose_out.write_disabled_record(record, bin_hits.size(), arguments.verbose); + return; + } else if (arguments.verbose) verbose_out.write_warning(record, bin_hits.size()); } - + for (size_t const bin : bin_hits) { queue.insert(bin, record); diff --git a/include/valik/search/sync_out.hpp b/include/valik/search/sync_out.hpp index 42287cab..5fcdadf4 100644 --- a/include/valik/search/sync_out.hpp +++ b/include/valik/search/sync_out.hpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -12,6 +13,10 @@ namespace valik::app class sync_out { + using types = seqan3::type_list; + using fields = seqan3::fields; + using sequence_record_type = seqan3::sequence_record; + using output_format = seqan3::type_list; public: sync_out() = default; @@ -21,34 +26,37 @@ class sync_out sync_out & operator=(sync_out &&) = default; ~sync_out() = default; - sync_out(std::filesystem::path const & path) : file{path} {} + sync_out(std::filesystem::path const & path) : fout{path} + { + warning_message = [](size_t const bin_count, size_t const query_length) + { + seqan3::debug_stream << "[Warning] Insufficient prefiltering. " << bin_count << + " bins match query of length " << query_length << '\n'; + }; + } template void write_warning(t && query_record, size_t const & bin_count) { std::lock_guard lock(write_mutex); - seqan3::debug_stream << "[Warning] Insufficient prefiltering. " << bin_count << " bins match query of length " << query_record.sequence.size() << '\n'; + warning_message(bin_count, query_record.sequence.size()); } // outfile gets unlocked as soon as the current thread exits the write function template - void write_record(t && query_record, size_t const & bin_count) - { - std::string fasta_string = ">"; - fasta_string += query_record.sequence_id; - fasta_string += '\n'; - for (auto & n : query_record.sequence) - fasta_string += seqan3::to_char(n); - fasta_string += '\n'; - + void write_disabled_record(t && query_record, size_t const & bin_count, bool const verbose) + { std::lock_guard lock(write_mutex); - seqan3::debug_stream << "[Warning] Insufficient prefiltering. " << bin_count << " bins match query:\n" << fasta_string << '\n'; + if (verbose) + warning_message(bin_count, query_record.sequence.size()); + sequence_record_type output_record{query_record.sequence, query_record.sequence_id}; + fout.push_back(output_record); } // outfile gets unlocked as soon as the current thread exits the write function private: - //seqan3::sequence_file_output fout; - std::ofstream file; + seqan3::sequence_file_output fout; + std::function warning_message; std::mutex write_mutex; }; diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index 8fd73483..f0f97d92 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -181,7 +181,7 @@ struct search_arguments final : public minimiser_threshold_arguments, search_pro bool write_time{false}; bool fast{false}; bool verbose{false}; - bool very_verbose{false}; + bool keep_repeats{false}; size_t cart_max_capacity{1000}; size_t max_queued_carts{std::numeric_limits::max()}; diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index faf68f76..15efb778 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -79,10 +79,10 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) .long_id = "without-parameter-tuning", .description = "Preprocess database without setting default parameters.", .advanced = true}); - parser.add_flag(arguments.very_verbose, + parser.add_flag(arguments.keep_repeats, sharg::config{.short_id = '\0', - .long_id = "very-verbose", - .description = "Print very verbose output.", + .long_id = "keep-repeats", + .description = "Do not filter out query matches from repeat regions.", .advanced = true}); parser.add_option(arguments.seg_count_in, sharg::config{.short_id = 'n',