diff --git a/include/utilities/threshold/basics.hpp b/include/utilities/threshold/basics.hpp index d5602a41..99a3fdc7 100644 --- a/include/utilities/threshold/basics.hpp +++ b/include/utilities/threshold/basics.hpp @@ -30,10 +30,9 @@ double expected_kmer_occurrences(var_t const & bin_size, /** * @brief K-mer lemma threshold. */ -template -var_t kmer_lemma_threshold(size_t const l, var_t const k, var_t const e) +inline size_t kmer_lemma_threshold(size_t const l, uint8_t const k, uint8_t const e) { - if (((int64_t) l - k + 1) <= (int64_t) e*k) + if ((l - k + 1) <= (size_t) e*k) return 0; return l - k + 1 - e * k; diff --git a/include/utilities/threshold/filtering_request.hpp b/include/utilities/threshold/filtering_request.hpp index 3fe2cc37..b079d439 100755 --- a/include/utilities/threshold/filtering_request.hpp +++ b/include/utilities/threshold/filtering_request.hpp @@ -6,6 +6,8 @@ namespace valik { +constexpr uint8_t query_every = 2; // query every 2nd pattern by default + /** * @brief The user requests filtering by setting the following parameters. * @@ -90,21 +92,60 @@ struct filtering_request return std::max(0.0, fpr); } + /** + * @brief The probability of at least threshold k-mers matching spuriously between a query pattern and a reference bin. + */ + double pattern_spurious_match_prob(param_set const & params, uint64_t const & ref_s, size_t const ref_b) const + { + double fpr{1}; + //!TODO: expected k-mer occurrences and pattern spurious match probability should be independent of the current sequence being processed + double p = std::min(1.0, expected_kmer_occurrences(std::round(ref_s / (double) ref_b), params.k)); + /* + For parameters + + s reference size + b number of bins + k kmer size + l min match length + t threshold + + find the probability of t or more k-mers matching spuriously between a pattern of length l and a reference bin. + + The probability of a pattern exceeding the threshold by chance is equal to 1 - P(0 k-mers match) - + P(1 k-mer matches) - + ... - + P(t - 1 k-mers match). + + Given p which is the probability of a single k-mer appearing spuriously in the reference bin and n = (l - k + 1) k-mers per pattern + the probability of 0 k-mers matching spuriously is P(0 k-mers match) = (1 - p)^n + and + the probability of 1 k-mer out of n matching is P(1 k-mer matches) = (n take 1) * p^1 * (1 - p)^(n - 1). + */ + + size_t kmers_per_pattern = l - params.k + 1; + for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++) + { + fpr -= combinations(matching_kmer_count, kmers_per_pattern) * + pow(p, matching_kmer_count) * + pow(1 - p, kmers_per_pattern - matching_kmer_count); + } + + return std::max(0.0, fpr); + } + /** * @brief An approximation of the probability of a query segment having a spurious match in a reference bin. */ - double fpr(param_set const & params, uint64_t const & query_seg_len) const + double fpr(param_set const & params, uint64_t const & query_seg_len, uint64_t const & ref_size, size_t const ref_bins) const { - double pattern_p = pattern_spurious_match_prob(params); + double pattern_p = pattern_spurious_match_prob(params, ref_size, ref_bins); /* uint64_t independent_patterns_per_segment = query_seg_len / (double) l; double independent_patterns_p = std::min(1.0, pattern_p * independent_patterns_per_segment); std::cout << "independent_patterns_p\t" << independent_patterns_p << '\n'; */ - constexpr uint8_t query_every = 2; // query every 2nd pattern by default uint64_t total_patterns_per_segment = std::round((query_seg_len - l + 1) / (double) query_every); double total_patterns_p = std::min(1.0, pattern_p * total_patterns_per_segment); - std::cout << "total_patterns_p\t" << total_patterns_p << '\n'; return total_patterns_p; } @@ -114,8 +155,8 @@ struct filtering_request uint64_t max_segment_len(param_set const & params) const { double pattern_p = pattern_spurious_match_prob(params); - size_t max_independent_patterns_per_segment = std::round(1.0f / pattern_p) - 1; - return l * max_independent_patterns_per_segment; + size_t max_patterns_per_segment = std::round(1.0 / pattern_p) - 1; + return l + query_every * (std::max(max_patterns_per_segment, (size_t) 2) - 1); } }; diff --git a/include/utilities/threshold/find.hpp b/include/utilities/threshold/find.hpp index 769234b9..d1351418 100755 --- a/include/utilities/threshold/find.hpp +++ b/include/utilities/threshold/find.hpp @@ -22,18 +22,17 @@ param_set get_best_params(param_space const & space, /** * @brief For a chosen kmer size and error rate find the best threshold. */ -uint8_t find_threshold(param_space const & space, - metadata const & ref_meta, - metadata const & query_meta, - split_arguments const & arguments, - kmer_attributes const att); +uint8_t find_heuristic_threshold(metadata const & ref_meta, + metadata const & query_meta, + split_arguments const & arguments, + kmer_attributes const attr); /** * @brief For a chosen kmer size and some maximum error rate find the best threshold. */ void find_thresholds_for_kmer_size(param_space const & space, metadata const & meta, - kmer_attributes const att, + kmer_attributes const attr, double const max_err); } // namespace valik diff --git a/include/utilities/threshold/kmer_attributes.hpp b/include/utilities/threshold/kmer_attributes.hpp index 79d21332..e7a4e1bf 100755 --- a/include/utilities/threshold/kmer_attributes.hpp +++ b/include/utilities/threshold/kmer_attributes.hpp @@ -71,7 +71,9 @@ struct kmer_attributes * @brief False negative rate for a parameter set. */ double fnr_for_param_set(filtering_request const & request, param_set const & params) const - { + { + if (kmer_lemma_threshold(request.l, params.k, request.e) > 1) + return 0.0; return fn_conf_counts[params.t - 1][request.e][request.l] / (double) request.total_conf_count(); } diff --git a/include/utilities/threshold/param_set.hpp b/include/utilities/threshold/param_set.hpp index 735f1f64..76f2222c 100755 --- a/include/utilities/threshold/param_set.hpp +++ b/include/utilities/threshold/param_set.hpp @@ -18,6 +18,7 @@ struct param_set param_set(uint8_t const kmer_size, uint8_t const thresh, param_space const & space) : k(kmer_size), t(thresh) { + /* if ((kmer_size < std::get<0>(space.kmer_range)) | (kmer_size > std::get<1>(space.kmer_range))) { throw std::runtime_error{"k=" + std::to_string(kmer_size) + " out of range [" + @@ -30,6 +31,7 @@ struct param_set throw std::runtime_error{"threshold=" + std::to_string(thresh) + " out of range [0, " + std::to_string(space.max_thresh) + "]"}; } + */ } }; diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index 25527b7c..fb616aff 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -243,7 +243,6 @@ void run_search(sharg::parser & parser) if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1) { //!TODO: read in parameter metadata file - arguments.threshold = 2; arguments.threshold_was_set = true; } diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index d0f0da44..ceb8409d 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -27,6 +27,11 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) .long_id = "error-rate", .description = "Choose the upper bound for the maximum allowed error rate of a local match.", .validator = sharg::arithmetic_range_validator{0.0f, 0.1f}}); + parser.add_option(arguments.kmer_size, + sharg::config{.short_id = 'k', + .long_id = "kmer", + .description = "Choose the kmer size.", + .validator = positive_integer_validator{true}}); parser.add_option(arguments.seg_count_in, sharg::config{.short_id = 'n', .long_id = "seg-count", diff --git a/src/threshold/find.cpp b/src/threshold/find.cpp index d37461eb..47b29c76 100755 --- a/src/threshold/find.cpp +++ b/src/threshold/find.cpp @@ -91,21 +91,22 @@ param_set get_best_params(param_space const & space, */ void find_thresholds_for_kmer_size(param_space const & space, metadata const & meta, - kmer_attributes const att, + kmer_attributes const attr, double const max_err) { std::cout.precision(3); - std::cout << "Recommended shared " << std::to_string(att.k) << "-mer thresholds for different error rates\n"; + std::cout << "Recommended shared " << std::to_string(attr.k) << "-mer thresholds for different error rates\n"; std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_query_seg_len\n"; - auto best_params = param_set(att.k, space.max_thresh, space); - for (uint8_t errors{1}; errors < (meta.segment_overlap() * max_err); errors++) + auto best_params = param_set(attr.k, space.max_thresh, space); + for (uint8_t errors{1}; errors <= std::ceil(meta.segment_overlap() * max_err); errors++) { filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count); std::cout << errors / (double) request.l << '\t'; - if (att.fnr_for_param_set(request, best_params) == 0) + if (kmer_lemma_threshold(request.l, attr.k, errors) > 1) { - std::cout << "kmer lemma\t" << std::to_string(kmer_lemma_threshold(request.l, att.k, errors)) << '\t' << 0; + best_params.t = kmer_lemma_threshold(request.l, attr.k, errors); + std::cout << "kmer lemma\t"; } else { @@ -113,55 +114,47 @@ void find_thresholds_for_kmer_size(param_space const & space, double best_score = request.l; for (uint8_t t{1}; t <= space.max_thresh; t++) { - auto params = param_set(att.k, t, space); - auto score = param_score(request, params, att); + auto params = param_set(attr.k, t, space); + auto score = param_score(request, params, attr); if (score <= best_score) { best_params = params; best_score = score; } } - std::cout << std::to_string(best_params.t) << '\t' << att.fnr_for_param_set(request, best_params); } - std::cout << '\t' << request.pattern_spurious_match_prob(best_params) << '\t' << request.max_segment_len(best_params) << '\n'; + std::cout << std::to_string(best_params.t) << '\t' << attr.fnr_for_param_set(request, best_params) << '\t' + << request.pattern_spurious_match_prob(best_params) << '\n'; + // << request.max_segment_len(best_params) << '\n'; } } /** - * @brief For a chosen kmer size and error rate find the best threshold. + * @brief For a chosen kmer size and error rate find the best heuristic threshold. */ -uint8_t find_threshold(param_space const & space, - metadata const & ref_meta, - metadata const & query_meta, - split_arguments const & arguments, - kmer_attributes const att) +uint8_t find_heuristic_threshold(metadata const & ref_meta, + metadata const & query_meta, + split_arguments const & arguments, + kmer_attributes const attr) { - + param_space space; filtering_request request(arguments.errors, arguments.pattern_size, ref_meta.total_len, ref_meta.seg_count); - auto best_params = param_set(arguments.kmer_size, space.max_thresh, space); + auto best_thresh = space.max_thresh; double best_score = arguments.pattern_size; for (uint8_t t{1}; t <= space.max_thresh; t++) { - auto params = param_set(att.k, t, space); - auto score = param_score(request, params, att); + auto params = param_set(arguments.kmer_size, t, space); + auto score = param_score(request, params, attr); + if (score <= best_score) { - best_params = params; + best_thresh = params.t; best_score = score; } - } - - std::cout.precision(3); - if (arguments.verbose) - { - std::cout << "threshold " << std::to_string(best_params.t) << '\n'; - std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n'; - std::cout << "segment len " << std::to_string(std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n'; - std::cout << "FPR " << request.fpr(best_params, std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n'; - } + } - return best_params.t; + return best_thresh; } } // namespace valik diff --git a/src/valik_split.cpp b/src/valik_split.cpp index fd997816..4b4e747e 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -28,43 +28,61 @@ void valik_split(split_arguments & arguments) if (!read_fn_confs(attr_vec)) precalc_fn_confs(attr_vec); + filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count); if (arguments.split_index) { - filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count); auto best_params = get_best_params(space, request, attr_vec); arguments.kmer_size = best_params.k; + kmer_attributes attr = attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]; if (arguments.verbose) { - std::cout << "Build index for:\n"; + std::cout << "\n-----------Index build parameters-----------\n"; std::cout << "db length " << meta.total_len << "bp\n"; std::cout << "min local match length " << arguments.pattern_size << "bp\n"; std::cout << "max error rate " << arguments.error_rate << "\n"; std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n'; - find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate); + find_thresholds_for_kmer_size(space, meta, attr, arguments.error_rate); } } else { - if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors) <= 1) + kmer_attributes attr = attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]; + auto best_threshold = kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors); + if (arguments.verbose) { - metadata ref_meta = metadata(arguments.ref_meta_path); - auto best_threshold = find_threshold(space, ref_meta, meta, arguments, attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]); + std::cout.precision(3); + std::cout << "\n-----------Search parameters-----------\n"; + std::cout << "segment len " << std::to_string((uint64_t) std::round(meta.total_len / (double) meta.seg_count)) << '\n'; + std::cout << "min local match length " << arguments.pattern_size << "bp\n"; + std::cout << "error rate " << arguments.error_rate << "\n"; + std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n'; + } + + metadata ref_meta = metadata(arguments.ref_meta_path); + if (best_threshold <= 1) + { + best_threshold = find_heuristic_threshold(ref_meta, meta, arguments, attr); if (arguments.verbose) { - std::cout.precision(3); - std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n"; - std::cout << "min local match length " << arguments.pattern_size << "bp\n"; - std::cout << "error rate " << arguments.error_rate << "\n"; - std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n'; - - std::cout << "Applying heuristic threshold\n"; - std::cout << "use threshold " << best_threshold; + std::cout << "Apply heuristic threshold "; } } - } + else + { + if (arguments.verbose) + std::cout << "Use k-mer lemma threshold "; + } + if (arguments.verbose) + { + param_set best_params(arguments.kmer_size, best_threshold, space); + std::cout << best_threshold << '\n'; + std::cout << "FNR " << attr.fnr_for_param_set(request, best_params) << '\n'; + std::cout << "FPR " << request.fpr(best_params, std::round(meta.total_len / (double) meta.seg_count), ref_meta.total_len, ref_meta.seg_count) << '\n'; + } + } } if (arguments.write_out)