From fcffcaea7d47d5e865ee0f2096d0640cb82e97bc Mon Sep 17 00:00:00 2001 From: Evelin Date: Thu, 7 Nov 2024 16:09:05 +0100 Subject: [PATCH] Redo "Increase max bound for search arguments (#126)"" This reverts commit 08b6a7ef3c31bb1648a35b204e0b8e1d1657980f. --- include/utilities/threshold/basics.hpp | 6 +- include/utilities/threshold/kmer_loss.hpp | 12 +++- include/utilities/threshold/param_set.hpp | 11 +++- include/valik/search/search_local.hpp | 2 +- lib/seqan | 2 +- src/argument_parsing/split.cpp | 5 +- src/threshold/find.cpp | 64 +++++++++++-------- .../utilities/threshold/kmer_loss_test.cpp | 12 ++-- .../utilities/threshold/param_set_test.cpp | 6 +- test/cli/valik_test.cpp | 4 +- 10 files changed, 79 insertions(+), 45 deletions(-) diff --git a/include/utilities/threshold/basics.hpp b/include/utilities/threshold/basics.hpp index 43c96b32..71c7caca 100644 --- a/include/utilities/threshold/basics.hpp +++ b/include/utilities/threshold/basics.hpp @@ -57,9 +57,9 @@ inline size_t kmer_lemma_threshold(size_t const l, uint8_t const k, uint8_t cons struct param_space { constexpr static uint8_t max_errors{15}; - uint8_t max_thresh{20}; + uint16_t max_thresh{20}; constexpr static size_t max_len{150}; - constexpr static std::pair kmer_range{9, 21}; + constexpr static std::pair kmer_range{9, 35}; param_space() noexcept = default; param_space(param_space const &) noexcept = default; @@ -67,7 +67,7 @@ struct param_space param_space & operator=(param_space &&) noexcept = default; ~param_space() noexcept = default; - param_space(uint8_t thresh) : max_thresh(thresh) {} + param_space(uint16_t thresh) : max_thresh(thresh) {} uint8_t min_k() const { diff --git a/include/utilities/threshold/kmer_loss.hpp b/include/utilities/threshold/kmer_loss.hpp index 62da7f2b..1a1c04a0 100644 --- a/include/utilities/threshold/kmer_loss.hpp +++ b/include/utilities/threshold/kmer_loss.hpp @@ -6,6 +6,8 @@ #include #include +#include + namespace valik { @@ -81,8 +83,14 @@ struct kmer_loss if (kmer_lemma_threshold(pattern.l, params.k, pattern.e) >= params.t) return 0.0; if (params.t > fn_conf_counts.size()) - throw std::runtime_error("Calculated configuration count table for t=[1, " + std::to_string(fn_conf_counts.size()) + "]. " - "Can't find FNR for t=" + std::to_string(params.t)); + throw std::runtime_error("Calculated configuration count table for threshold=[1, " + std::to_string(fn_conf_counts.size()) + "]. " + "Can't find FNR for threshold=" + std::to_string(params.t)); + if (pattern.e >= fn_conf_counts[0].size()) + throw std::runtime_error("Calculated configuration count table for errors=[0, " + std::to_string(fn_conf_counts[0].size()) + "]. " + "Can't find FNR for errors=" + std::to_string(pattern.e)); + if (pattern.l >= fn_conf_counts[0][0].size()) + throw std::runtime_error("Calculated configuration count table for min_len=[1, " + std::to_string(fn_conf_counts[0][0].size()) + "]. " + "Can't find FNR for min_len=" + std::to_string(pattern.l)); return fn_conf_counts[params.t - 1][pattern.e][pattern.l] / (double) pattern.total_conf_count(); } diff --git a/include/utilities/threshold/param_set.hpp b/include/utilities/threshold/param_set.hpp index aeab7276..a0bc4411 100755 --- a/include/utilities/threshold/param_set.hpp +++ b/include/utilities/threshold/param_set.hpp @@ -18,7 +18,7 @@ namespace valik struct param_set { uint8_t k; - uint8_t t; + uint16_t t; param_set() noexcept = default; param_set(param_set const &) noexcept = default; @@ -26,7 +26,7 @@ struct param_set param_set & operator=(param_set &&) noexcept = default; ~param_set() noexcept = default; - param_set(uint8_t const kmer_size, uint8_t const thresh, param_space const & space) : k(kmer_size), t(thresh) + param_set(uint8_t const kmer_size, uint16_t const thresh, param_space const & space) : k(kmer_size), t(thresh) { if ((kmer_size < std::get<0>(space.kmer_range)) | (kmer_size > std::get<1>(space.kmer_range))) { @@ -34,8 +34,15 @@ struct param_set std::to_string(std::get<0>(space.kmer_range)) + ", " + std::to_string(std::get<1>(space.kmer_range)) + "]"}; } + + if (thresh > space.max_thresh) + { + throw std::runtime_error{"t=" + std::to_string(thresh) + " out of range [0, " + std::to_string(space.max_thresh) + "]"}; + } } + param_set(uint8_t const kmer_size, uint16_t const thresh) : k(kmer_size), t(thresh) {} + template void serialize(Archive & archive) { diff --git a/include/valik/search/search_local.hpp b/include/valik/search/search_local.hpp index 1c4d9409..9c7af3c5 100644 --- a/include/valik/search/search_local.hpp +++ b/include/valik/search/search_local.hpp @@ -113,7 +113,7 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st { search_pattern pattern(arguments.errors, arguments.pattern_size); param_space space; - param_set params(arguments.shape_size, arguments.threshold, space); + param_set params(arguments.shape_size, arguments.threshold); filtering_request request(pattern, ref_meta, query_meta.value()); if ((request.fpr(params) > 0.2) && (arguments.search_type != search_kind::STELLAR)) std::cerr << "WARNING: Prefiltering will be inefficient for a high error rate.\n"; diff --git a/lib/seqan b/lib/seqan index 8ce355dd..d1a9001f 160000 --- a/lib/seqan +++ b/lib/seqan @@ -1 +1 @@ -Subproject commit 8ce355dd960bbf7a5fa0292b49f7342f7e456da6 +Subproject commit d1a9001fc1ac69703d958fdbbccb6599206f1d53 diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index 1b52b0bc..b81f22e8 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -8,6 +8,7 @@ namespace valik::app void init_split_parser(sharg::parser & parser, split_arguments & arguments) { + param_space space{}; init_shared_meta(parser); parser.add_positional_option(arguments.db_file, sharg::config{.description = "File containing database sequences. If splitting --metagenome provide a text file with a list of cluster paths.", @@ -21,7 +22,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) sharg::config{.short_id = '\0', .long_id = "pattern", .description = "Choose how much consecutive segments overlap. This is the minimum length of a local alignment.", - .validator = positive_integer_validator{true}}); + .validator = sharg::arithmetic_range_validator{10u, 1000u}}); parser.add_option(arguments.error_rate, sharg::config{.short_id = 'e', .long_id = "error-rate", @@ -36,7 +37,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) sharg::config{.short_id = 'k', .long_id = "kmer", .description = "Choose the kmer size.", - .validator = positive_integer_validator{true}}); + .validator = sharg::arithmetic_range_validator{space.min_k(), space.max_k()}}); parser.add_option(arguments.seg_count_in, sharg::config{.short_id = 'n', .long_id = "seg-count", diff --git a/src/threshold/find.cpp b/src/threshold/find.cpp index e7c3d1ea..77b72548 100755 --- a/src/threshold/find.cpp +++ b/src/threshold/find.cpp @@ -12,6 +12,10 @@ param_set get_best_params(search_pattern const & pattern, bool const verbose) { param_space space = fn_attr.space; + + if (kmer_lemma_threshold(pattern.l, space.max_k(), pattern.e) > space.max_thresh) + return param_set(space.max_k(), kmer_lemma_threshold(pattern.l, space.max_k(), pattern.e)); + param_set best_params(space.min_k(), 1, space); auto best_score = score(fn_attr.get_kmer_loss(space.min_k()), pattern, best_params, ref_meta, PATTERNS_PER_SEGMENT); @@ -94,41 +98,51 @@ search_kmer_profile find_thresholds_for_kmer_size(metadata const & ref_meta, kmer_loss attr, uint8_t const max_errors) { - param_space space{}; + constexpr param_space space{}; search_kmer_profile kmer_thresh{attr.k, ref_meta.pattern_size}; - for (uint8_t errors{0}; errors <= max_errors && errors <= space.max_errors; errors++) + for (uint8_t errors{0}; errors <= max_errors; errors++) { search_pattern pattern(errors, ref_meta.pattern_size); search_kind search_type{search_kind::LEMMA}; - auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors), space); - auto try_params = best_params; - if ((best_params.t < THRESH_LOWER) || - segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER) + if (kmer_lemma_threshold(pattern.l, attr.k, errors) > space.max_thresh) { - search_type = search_kind::STELLAR; - try_params.t++; - while ((try_params.t <= space.max_thresh) && - (attr.fnr_for_param_set(pattern, try_params) < FNR_UPPER)) + auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors)); + double fnr{0}; // == attr.fnr_for_param_set(pattern, best_params) + double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params); + kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern}); + } + else + { + auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors), space); + auto try_params = best_params; + if ((best_params.t < THRESH_LOWER) || + segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER) { - if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER) + search_type = search_kind::STELLAR; + try_params.t++; + while ((try_params.t <= space.max_thresh) && + (attr.fnr_for_param_set(pattern, try_params) < FNR_UPPER)) { - best_params = std::move(try_params); - search_type = search_kind::HEURISTIC; + if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER) + { + best_params = std::move(try_params); + search_type = search_kind::HEURISTIC; + } + try_params.t++; } - try_params.t++; } - } - - if (search_type == search_kind::STELLAR) - { - kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type}); - } - else - { - double fnr = attr.fnr_for_param_set(pattern, best_params); - double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params); - kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern}); + + if (search_type == search_kind::STELLAR) + { + kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type}); + } + else + { + double fnr = attr.fnr_for_param_set(pattern, best_params); + double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params); + kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern}); + } } } diff --git a/test/api/utilities/threshold/kmer_loss_test.cpp b/test/api/utilities/threshold/kmer_loss_test.cpp index 35afbf56..bf6ca008 100644 --- a/test/api/utilities/threshold/kmer_loss_test.cpp +++ b/test/api/utilities/threshold/kmer_loss_test.cpp @@ -171,11 +171,11 @@ TEST(kmer_loss, exhaustive_comparison) } } -void try_fnr(uint8_t e, size_t l, uint8_t k, uint8_t t, double expected_fnr) +void try_fnr(uint8_t e, size_t l, uint8_t k, uint16_t t, double expected_fnr) { valik::param_space space{}; valik::search_pattern pattern(e, l); - valik::param_set param(k, t, space); + valik::param_set param(k, t); valik::fn_confs fn_attr(space); @@ -187,6 +187,10 @@ TEST(false_negative, try_kmer_lemma_thershold) try_fnr(0u, (size_t) 50, 16u, 35u, 0.0); try_fnr(1u, (size_t) 50, 16u, 19u, 0.0); try_fnr(2u, (size_t) 50, 16u, 3u, 0.0); + + try_fnr(0u, (size_t) 1000, 35u, 966u, 0.0); + try_fnr(1u, (size_t) 1000, 35u, 931u, 0.0); + try_fnr(2u, (size_t) 1000, 35u, 896u, 0.0); } TEST(false_negative, try_threshold_above_kmer_lemma) @@ -200,8 +204,8 @@ TEST(false_negative, try_threshold_above_kmer_lemma) } catch( const std::runtime_error& e ) { - EXPECT_STREQ( ("Calculated configuration count table for t=[1, " + std::to_string(space.max_thresh) + "]. " - "Can't find FNR for t=" + std::to_string(t)).c_str(), e.what() ); + EXPECT_STREQ( ("Calculated configuration count table for threshold=[1, " + std::to_string(space.max_thresh) + "]. " + "Can't find FNR for threshold=" + std::to_string(t)).c_str(), e.what() ); throw; } }, std::runtime_error ); diff --git a/test/api/utilities/threshold/param_set_test.cpp b/test/api/utilities/threshold/param_set_test.cpp index 39203773..71de64cf 100644 --- a/test/api/utilities/threshold/param_set_test.cpp +++ b/test/api/utilities/threshold/param_set_test.cpp @@ -14,7 +14,7 @@ TEST(error_handling, kmer_size) } catch( const std::runtime_error& e ) { - EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 21]").c_str(), e.what() ); + EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 35]").c_str(), e.what() ); throw; } }, std::runtime_error ); @@ -24,7 +24,7 @@ TEST(error_handling, kmer_size) valik::param_set(kmer_size, thresh, space); }); - kmer_size = 22; + kmer_size = 36; EXPECT_THROW({ try { @@ -32,7 +32,7 @@ TEST(error_handling, kmer_size) } catch( const std::runtime_error& e ) { - EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 21]").c_str(), e.what() ); + EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 35]").c_str(), e.what() ); throw; } }, std::runtime_error ); diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index 386f72cc..2de1f137 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -178,7 +178,7 @@ struct split_options : public valik_base {}; TEST_F(split_options, too_few_segments) { size_t n = 30; - size_t o = 0; + size_t o = 10; cli_test_result const result = execute_app("valik", "split", data("query.fasta"), "--seg-count", std::to_string(n), "--pattern", std::to_string(o), @@ -197,7 +197,7 @@ TEST_F(split_options, too_few_segments) TEST_F(split_options, overlap_too_large) { size_t n = 30; - size_t o = 2000; + size_t o = 1000; cli_test_result const result = execute_app("valik", "split", data("query.fasta"), "--seg-count", std::to_string(n), "--pattern", std::to_string(o),