From b72b46e1ab384e4ee76ec9a5fe7e3a73cbdb2bbe Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Wed, 8 May 2024 18:59:11 +0200 Subject: [PATCH] Check FPR of heuristic threshold --- include/utilities/threshold/find.hpp | 16 ++++++++++ .../threshold/search_kmer_profile.hpp | 11 ++++--- include/valik/split/metadata.hpp | 15 ---------- src/argument_parsing/search.cpp | 3 +- src/threshold/find.cpp | 29 ++++++++----------- 5 files changed, 35 insertions(+), 39 deletions(-) diff --git a/include/utilities/threshold/find.hpp b/include/utilities/threshold/find.hpp index 97115569..562a0779 100755 --- a/include/utilities/threshold/find.hpp +++ b/include/utilities/threshold/find.hpp @@ -8,6 +8,12 @@ namespace valik { +//!TODO: determine suitable parameters +constexpr double FPR_UPPER{0.1}; +constexpr double FNR_UPPER{0.05}; +constexpr uint8_t THRESH_LOWER{4}; +constexpr size_t PATTERNS_PER_SEGMENT{5000}; + /** * @brief The false positive probability of a query segment that contains partially overlapping patterns. */ @@ -17,6 +23,16 @@ inline double segment_fpr(double const pattern_p, size_t const patterns_per_segm return std::min(1 - none_match_p, 1.0); } +/** +* @brief The maximum length of a query segment that does not appear spuriously in reference bins. +*/ +inline uint64_t max_segment_len(double const pattern_p, size_t const pattern_size, uint8_t query_every) +{ + //!TODO: this const is the *minimum* number of patterns per segment + // higher values are possible if pattern_p is small + return pattern_size + query_every * (PATTERNS_PER_SEGMENT - 1); +} + /** * @brief Score of the objective function for a parameter set. Smaller values are better. */ diff --git a/include/utilities/threshold/search_kmer_profile.hpp b/include/utilities/threshold/search_kmer_profile.hpp index 9b6e759f..265690fc 100644 --- a/include/utilities/threshold/search_kmer_profile.hpp +++ b/include/utilities/threshold/search_kmer_profile.hpp @@ -28,7 +28,6 @@ struct search_error_profile search_kind search_type{search_kind::LEMMA}; double fnr; double fp_per_pattern; - uint64_t max_segment_len; search_error_profile() noexcept = default; search_error_profile(search_error_profile const &) noexcept = default; @@ -37,9 +36,9 @@ struct search_error_profile ~search_error_profile() noexcept = default; search_error_profile(param_set const & best_params, search_pattern const & eps_pattern, search_kind const which_search, - double const false_neg, double const false_pos, uint64_t const max_len) : + double const false_neg, double const false_pos) : params(best_params), pattern(eps_pattern), search_type(which_search), fnr(false_neg), - fp_per_pattern(false_pos), max_segment_len(max_len) {} + fp_per_pattern(false_pos) {} search_error_profile(param_set const & best_params, search_pattern const & eps_pattern, search_kind const which_search) : params(best_params), pattern(eps_pattern), search_type(which_search) @@ -53,7 +52,7 @@ struct search_error_profile void serialize(Archive & archive) { auto type_int = static_cast(search_type); - archive(params, pattern, type_int, fnr, fp_per_pattern, max_segment_len); + archive(params, pattern, type_int, fnr, fp_per_pattern); search_type = static_cast(type_int); } @@ -70,7 +69,7 @@ struct search_error_profile case search_kind::LEMMA: std::cout << "kmer lemma"; break; default: break; } - std::cout << '\t' << std::to_string(params.t) << '\t' << fnr << '\t' << fp_per_pattern << '\t' << max_segment_len << '\n'; + std::cout << '\t' << std::to_string(params.t) << '\t' << fnr << '\t' << fp_per_pattern << '\n'; } } } @@ -158,7 +157,7 @@ struct search_kmer_profile std::cout.precision(3); std::cout << "\nRecommended shared " << std::to_string(k) << "-mer thresholds for matches of (min_length=" << std::to_string(l) << "; max_error_rate=" << max_error_rate() << ")\n"; - std::cout << "errors\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_segment_len\n"; + std::cout << "errors\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\n"; for (uint8_t er{0}; er <= max_errors; er++) { diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index 80087a7b..b9a9fc31 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -651,21 +651,6 @@ struct metadata return std::max(0.0, fpr - precision); } - - /** - * @brief The maximum length of a query segment that does not appear spuriously in reference bins. - */ - uint64_t max_segment_len(param_set const & params) const - { - double fp_per_pattern = pattern_spurious_match_prob(params); - if (fp_per_pattern < 9e-6) // avoid very small floating point numbers - return 1e4; - - constexpr double FPR_LIMIT = 0.05; // allow FPR of 5% per query segment - size_t max_patterns_per_segment = std::floor(log(1 - FPR_LIMIT) / log(1 - fp_per_pattern)); - - return pattern_size + query_every * (std::max(max_patterns_per_segment, (size_t) 2) - 1); - } }; } // namespace valik diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index fdfef198..e8da06c3 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -210,6 +210,7 @@ void run_search(sharg::parser & parser) } else { + //TODO: this can be removed? if (arguments.split_query && arguments.manual_parameters) { throw std::runtime_error{"Provide the chosen number of query segments with --seg-count " @@ -335,7 +336,7 @@ void run_search(sharg::parser & parser) } else { - arguments.max_segment_len = error_profile.max_segment_len; + arguments.max_segment_len = max_segment_len(error_profile.fp_per_pattern, arguments.pattern_size, arguments.query_every); arguments.threshold = error_profile.params.t; arguments.threshold_was_set = true; // use raptor::threshold_kinds::percentage if (arguments.threshold > arguments.pattern_size - arguments.shape.size() + 1) diff --git a/src/threshold/find.cpp b/src/threshold/find.cpp index a1aa568e..4cfb01f8 100755 --- a/src/threshold/find.cpp +++ b/src/threshold/find.cpp @@ -3,13 +3,6 @@ namespace valik { -//!TODO: determine suitable parameters -constexpr double FPR_UPPER{0.1}; -constexpr double FNR_UPPER{0.05}; -constexpr uint8_t THRESH_LOWER{4}; -constexpr size_t DIST_GRANULARITY_LOWER{100}; -constexpr size_t PATTERNS_PER_SEGMENT{5000}; - /** * @brief For a pattern (with min length and max number of errors) find the best k-mer size and threshold for a given reference database. */ @@ -113,29 +106,31 @@ search_kmer_profile find_thresholds_for_kmer_size(metadata const & ref_meta, if ((best_params.t < THRESH_LOWER) || segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER) { - search_type = search_kind::HEURISTIC; - double best_score = pattern.l; - - uint8_t t{1}; + search_type = search_kind::STELLAR; + uint8_t t{best_params.t + 1u}; while ((t < space.max_thresh) && (attr.fnr_for_param_set(pattern, best_params) < FNR_UPPER)) { - best_params = param_set(attr.k, t, space); + param_set try_params = param_set(attr.k, t, space); + if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER) + { + best_params = std::move(try_params); + search_type = search_kind::HEURISTIC; + } t++; } + if (t >= space.max_thresh) + seqan3::debug_stream << "TODO: Need to calculate a bigger FNR table\n"; } - //!TODO: find segment length cutoff - uint64_t max_len = ref_meta.max_segment_len(best_params); - if ((pattern.l * DIST_GRANULARITY_LOWER) > max_len) + if (search_type == search_kind::STELLAR) { - search_type = search_kind::STELLAR; kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type}); } else { double fnr = attr.fnr_for_param_set(pattern, best_params); double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params); - kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern, max_len}); + kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern}); } }