From a8bab6d1f45c4a1d6ab28ac80b400298bced75ac Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Sun, 4 Feb 2024 19:17:53 +0100 Subject: [PATCH] Argument deduction in split algorithm --- .../utilities/threshold/filtering_request.hpp | 70 ++++++++++++-- include/utilities/threshold/find.hpp | 5 +- include/valik/argument_parsing/build.hpp | 2 - include/valik/shared.hpp | 11 ++- src/argument_parsing/build.cpp | 31 +------ src/argument_parsing/search.cpp | 21 +---- src/argument_parsing/split.cpp | 33 +++++-- src/threshold/find.cpp | 91 ++++++++++++------- src/valik_search.cpp | 2 +- src/valik_split.cpp | 53 ++++++++++- 10 files changed, 213 insertions(+), 106 deletions(-) diff --git a/include/utilities/threshold/filtering_request.hpp b/include/utilities/threshold/filtering_request.hpp index 6c92eb63..9519c780 100755 --- a/include/utilities/threshold/filtering_request.hpp +++ b/include/utilities/threshold/filtering_request.hpp @@ -43,28 +43,78 @@ struct filtering_request } /** - * @brief Expected number of spuriously matching k-mers in a bin. + * @brief Probability of a k-mer appearing spuriously in a bin. */ - double spurious_matches(uint8_t const kmer_size) const + double kmer_spurious_match_prob(uint8_t const kmer_size) const { return std::min(1.0f, expected_kmer_occurrences(std::round(s / (float) b), kmer_size)); } /** - * @brief The probability of at least threshold k-mers matching spuriously. + * @brief The probability of at least threshold k-mers matching spuriously between a query pattern and a reference bin. */ - double fpr(param_set const & params) const + double pattern_spurious_match_prob(param_set const & params) const { double fpr{1}; - double p = spurious_matches(params.k); - //std::cout << "Calc fpr\n"; - for (uint8_t matching_kmer_count{1}; matching_kmer_count < params.t; matching_kmer_count++) + double p = kmer_spurious_match_prob(params.k); + /* + For parameters + + s reference size + b number of bins + k kmer size + l min match length + t threshold + + find the probability of t or more k-mers matching spuriously between a pattern of length l and a reference bin. + + The probability of a pattern exceeding the threshold by chance is equal to 1 - P(0 k-mers match) - + P(1 k-mer matches) - + ... - + P(t - 1 k-mers match). + + Given p which is the probability of a single k-mer appearing spuriously in the reference bin and n = (l - k + 1) k-mers per pattern + the probability of 0 k-mers matching spuriously is P(0 k-mers match) = (1 - p)^n + and + the probability of 1 k-mer out of n matching is P(1 k-mer matches) = (n take 1) * p^1 * (1 - p)^(n - 1). + */ + + size_t kmers_per_pattern = l - params.k + 1; + for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++) { - // std::cout << fpr << '\n'; - fpr -= combinations(matching_kmer_count, l) * pow(p, matching_kmer_count) * pow(1 - p, l - matching_kmer_count); + fpr -= combinations(matching_kmer_count, kmers_per_pattern) * + pow(p, matching_kmer_count) * + pow(1 - p, kmers_per_pattern - matching_kmer_count); } - return fpr; + return std::max(0.0, fpr); + } + + /** + * @brief An approximation of the probability of a query segment having a spurious match in a reference bin. + */ + double fpr(param_set const & params, uint64_t const & query_seg_len, uint64_t const & overlap) const + { + double pattern_p = pattern_spurious_match_prob(params); + /* + uint64_t independent_patterns_per_segment = query_seg_len / (double) l; + double independent_patterns_p = std::min(1.0, pattern_p * independent_patterns_per_segment); + std::cout << "independent_patterns_p\t" << independent_patterns_p << '\n'; + */ + uint64_t total_patterns_per_segment = std::round((query_seg_len - l + 1) / (double) (l - overlap)); + double total_patterns_p = std::min(1.0, pattern_p * total_patterns_per_segment); + std::cout << "total_patterns_p\t" << total_patterns_p << '\n'; + return total_patterns_p; + } + + /** + * @brief The maximum length of a query segment that does not appear spuriously in reference bins. + */ + uint64_t max_segment_len(param_set const & params) const + { + double pattern_p = pattern_spurious_match_prob(params); + size_t max_independent_patterns_per_segment = std::round(1.0f / pattern_p) - 1; + return l * max_independent_patterns_per_segment; } }; diff --git a/include/utilities/threshold/find.hpp b/include/utilities/threshold/find.hpp index 8d9d984d..769234b9 100755 --- a/include/utilities/threshold/find.hpp +++ b/include/utilities/threshold/find.hpp @@ -23,8 +23,9 @@ param_set get_best_params(param_space const & space, * @brief For a chosen kmer size and error rate find the best threshold. */ uint8_t find_threshold(param_space const & space, - metadata const & meta, - search_arguments const & arguments, + metadata const & ref_meta, + metadata const & query_meta, + split_arguments const & arguments, kmer_attributes const att); /** diff --git a/include/valik/argument_parsing/build.hpp b/include/valik/argument_parsing/build.hpp index 7af10489..2aa04f96 100644 --- a/include/valik/argument_parsing/build.hpp +++ b/include/valik/argument_parsing/build.hpp @@ -3,8 +3,6 @@ #include #include #include -#include -#include namespace valik::app { diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index 4eec4039..2889be12 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -62,8 +62,14 @@ struct split_arguments uint32_t seg_count{64}; uint32_t seg_count_in{64}; bool split_index{false}; - bool write_ref{false}; - bool write_query{false}; + float error_rate{0.05}; + uint8_t errors{0}; + size_t pattern_size{100}; + uint8_t kmer_size{20}; + size_t threshold{}; + std::filesystem::path ref_meta_path{}; + bool write_out{false}; + bool verbose{false}; }; struct build_arguments @@ -77,7 +83,6 @@ struct build_arguments std::vector> bin_path{}; std::filesystem::path bin_file{}; std::filesystem::path out_path{"./"}; - float error_rate{0.05}; std::string size{}; uint64_t bins{64}; uint64_t bits{4096}; diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp index ed08c72a..5ca89412 100644 --- a/src/argument_parsing/build.cpp +++ b/src/argument_parsing/build.cpp @@ -16,11 +16,6 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments) .description = "Provide an output filepath.", .required = true, .validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create, {}}}); - parser.add_option(arguments.error_rate, - sharg::config{.short_id = 'e', - .long_id = "error-rate", - .description = "Choose the upper bound for the maximum allowed error rate of a local match.", - .validator = sharg::arithmetic_range_validator{0.0f, 0.1f}}); parser.add_option(arguments.size, sharg::config{.short_id = '\0', .long_id = "size", @@ -116,29 +111,9 @@ void run_build(sharg::parser & parser) if (!parser.is_option_set("kmer") && !parser.is_option_set("window")) { - // ========================================== - // Parameter deduction - // ========================================== - auto space = param_space(); - std::vector attr_vec; - if (!read_fn_confs(attr_vec)) - precalc_fn_confs(attr_vec); - - size_t errors = meta.segment_overlap() * arguments.error_rate; - filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count); - auto best_params = get_best_params(space, request, attr_vec); - arguments.kmer_size = best_params.k; - - if (arguments.verbose) - { - std::cout << "Build index for:\n"; - std::cout << "db length " << meta.total_len << "bp\n"; - std::cout << "min local match length " << meta.segment_overlap() << "bp\n"; - std::cout << "max error rate " << arguments.error_rate << "\n"; - std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n'; - - find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate); - } + //!TODO: read in parameter metadata file + arguments.kmer_size = 12; + arguments.window_size = 12; } } diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index cd3b5219..25527b7c 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -240,25 +240,10 @@ void run_search(sharg::parser & parser) // ========================================== // Parameter deduction // ========================================== - //!TODO: the parameter deduction should not necessarily depend on the metadata - // also the metadata is deserialised twice currently - if (!arguments.ref_meta_path.empty() && - kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1) + if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1) { - metadata meta = metadata(arguments.ref_meta_path); - std::cout.precision(3); - std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n"; - std::cout << "min local match length " << arguments.pattern_size << "bp\n"; - std::cout << "error rate " << arguments.error_rate << "\n"; - std::cout << "kmer size " << std::to_string(arguments.shape_size) << '\n'; - - std::cout << "Applying heuristic threshold\n"; - auto space = param_space(); - std::vector attr_vec; - if (!read_fn_confs(attr_vec)) - precalc_fn_confs(attr_vec); - - arguments.threshold = find_threshold(space, meta, arguments, attr_vec[arguments.shape_size - std::get<0>(space.kmer_range)]); + //!TODO: read in parameter metadata file + arguments.threshold = 2; arguments.threshold_was_set = true; } diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index dd315194..52eed073 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -22,6 +22,11 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) .long_id = "overlap", .description = "Choose how much consecutive segments overlap.", .validator = positive_integer_validator{true}}); + parser.add_option(arguments.error_rate, + sharg::config{.short_id = 'e', + .long_id = "error-rate", + .description = "Choose the upper bound for the maximum allowed error rate of a local match.", + .validator = sharg::arithmetic_range_validator{0.0f, 0.1f}}); parser.add_option(arguments.seg_count_in, sharg::config{.short_id = 'n', .long_id = "seg-count", @@ -31,15 +36,20 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) sharg::config{.short_id = '\0', .long_id = "split-index", .description = "Adjust the suggested segment count to create a multiple of 64 segments instead. This is suitable for building an IBF."}); - parser.add_flag(arguments.write_ref, + parser.add_option(arguments.ref_meta_path, + sharg::config{.short_id = '\0', + .long_id = "ref-meta", + .description = "Path to reference metadata file created by split.", + .validator = sharg::input_file_validator{}}); + parser.add_flag(arguments.write_out, sharg::config{.short_id = '\0', .long_id = "write-ref", - .description = "Write an output FASTA file for each segment.", + .description = "Write an output FASTA file for each reference segment or write all query segments into a single output FASTA file..", .advanced = true}); - parser.add_flag(arguments.write_query, - sharg::config{.short_id = '\0', - .long_id = "write-query", - .description = "Write segments into a single output FASTA file.", + parser.add_flag(arguments.verbose, + sharg::config{.short_id = 'v', + .long_id = "verbose", + .description = "Print verbose output.", .advanced = true}); } @@ -50,6 +60,17 @@ void run_split(sharg::parser & parser) init_split_parser(parser, arguments); try_parsing(parser); + arguments.errors = std::ceil(arguments.error_rate * arguments.pattern_size); + + if (!parser.is_option_set("out")) + { + arguments.meta_out = arguments.seq_file; + arguments.meta_out.replace_extension("meta"); + } + + if (!arguments.split_index && !parser.is_option_set("ref-meta-path")) + throw sharg::parser_error{"Need to provide path to reference metadata to process a query database."}; + // ========================================== // Dispatch // ========================================== diff --git a/src/threshold/find.cpp b/src/threshold/find.cpp index f962aed2..5d9e6377 100755 --- a/src/threshold/find.cpp +++ b/src/threshold/find.cpp @@ -8,7 +8,7 @@ namespace valik */ double param_score(filtering_request const & request, param_set const & params, kmer_attributes const & attr) { - return attr.fnr_for_param_set(request, params) + request.fpr(params); + return attr.fnr_for_param_set(request, params) + request.pattern_spurious_match_prob(params); } param_set get_best_params(param_space const & space, @@ -38,7 +38,7 @@ param_set get_best_params(param_space const & space, auto score = param_score(request, params, att); kmer_scores.push_back(score); kmer_fn.push_back(att.fnr_for_param_set(request, params)); - kmer_fp.push_back(request.fpr(params)); + kmer_fp.push_back(request.pattern_spurious_match_prob(params)); if (score < best_score) { best_score = score; @@ -50,41 +50,33 @@ param_set get_best_params(param_space const & space, fp_rates.push_back(kmer_fp); } - return best_params; -} - -/** - * @brief For a chosen kmer size and error rate find the best threshold. -*/ -uint8_t find_threshold(param_space const & space, - metadata const & meta, - search_arguments const & arguments, - kmer_attributes const att) -{ + std::cout << '\t'; + for (size_t i{1}; i <= space.max_thresh; i++) + std::cout << i << '\t'; + std::cout << '\n'; - filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count); - auto best_params = param_set(arguments.shape_size, space.max_thresh, space); - double best_score = arguments.pattern_size; - for (uint8_t t{1}; t <= space.max_thresh; t++) + for (size_t j{0}; j < fn_rates.size(); j++) { - auto params = param_set(att.k, t, space); - auto score = param_score(request, params, att); - if (score <= best_score) - { - best_params = params; - best_score = score; - } + std::cout << std::get<0>(space.kmer_range) + j << '\t'; + for (auto & cell : fn_rates[j]) + std::cout << cell << '\t'; + std::cout << '\n'; } - std::cout.precision(3); - if (arguments.verbose) + std::cout << '\t'; + for (size_t i{1}; i <= space.max_thresh; i++) + std::cout << i << '\t'; + std::cout << '\n'; + + for (size_t j{0}; j < fp_rates.size(); j++) { - std::cout << "threshold " << std::to_string(best_params.t) << '\n'; - std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n'; - std::cout << "FP_per_bin " << request.fpr(best_params) << '\n'; + std::cout << std::get<0>(space.kmer_range) + j << '\t'; + for (auto & cell : fp_rates[j]) + std::cout << cell << '\t'; + std::cout << '\n'; } - return best_params.t; + return best_params; } /** @@ -97,7 +89,7 @@ void find_thresholds_for_kmer_size(param_space const & space, { std::cout.precision(3); std::cout << "Recommended shared " << std::to_string(att.k) << "-mer thresholds for different error rates\n"; - std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_bin\n"; + std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_query_seg_len\n"; auto best_params = param_set(att.k, space.max_thresh, space); for (uint8_t errors{1}; errors < (meta.segment_overlap() * max_err); errors++) @@ -125,9 +117,44 @@ void find_thresholds_for_kmer_size(param_space const & space, std::cout << std::to_string(best_params.t) << '\t' << att.fnr_for_param_set(request, best_params); } - std::cout << '\t' << request.fpr(best_params) << '\n'; + std::cout << '\t' << request.pattern_spurious_match_prob(best_params) << '\t' << request.max_segment_len(best_params) << '\n'; } } +/** + * @brief For a chosen kmer size and error rate find the best threshold. +*/ +uint8_t find_threshold(param_space const & space, + metadata const & ref_meta, + metadata const & query_meta, + split_arguments const & arguments, + kmer_attributes const att) +{ + + filtering_request request(arguments.errors, arguments.pattern_size, ref_meta.total_len, ref_meta.seg_count); + auto best_params = param_set(arguments.kmer_size, space.max_thresh, space); + double best_score = arguments.pattern_size; + for (uint8_t t{1}; t <= space.max_thresh; t++) + { + auto params = param_set(att.k, t, space); + auto score = param_score(request, params, att); + if (score <= best_score) + { + best_params = params; + best_score = score; + } + } + + std::cout.precision(3); + if (arguments.verbose) + { + std::cout << "threshold " << std::to_string(best_params.t) << '\n'; + std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n'; + std::cout << "segment len " << std::to_string(std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n'; + std::cout << "FPR " << request.fpr(best_params, std::round(query_meta.total_len / (double) query_meta.seg_count), arguments.overlap) << '\n'; + } + + return best_params.t; +} } // namespace valik diff --git a/src/valik_search.cpp b/src/valik_search.cpp index 3eebadbf..af40ee7c 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -49,7 +49,7 @@ void valik_search(search_arguments const & arguments) runtime_to_compile_time([&]() { failed = search_local(arguments, time_statistics); - }, arguments.compressed, arguments.query_meta_path.empty()); + }, arguments.compressed, !arguments.query_meta_path.empty()); } // Consolidate matches (not necessary when searching a metagenomic database) diff --git a/src/valik_split.cpp b/src/valik_split.cpp index b363b248..4811aa10 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -18,10 +18,55 @@ void valik_split(split_arguments & arguments) metadata meta(arguments); meta.to_file(arguments.meta_out); - if (arguments.write_ref) - write_reference_segments(meta, arguments.meta_out); - if (arguments.write_query) - write_query_segments(meta, arguments.meta_out); + // ========================================== + // Parameter deduction + // ========================================== + auto space = param_space(); + std::vector attr_vec; + if (!read_fn_confs(attr_vec)) + precalc_fn_confs(attr_vec); + + if (arguments.split_index) + { + size_t errors = meta.segment_overlap() * arguments.error_rate; + filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count); + auto best_params = get_best_params(space, request, attr_vec); + arguments.kmer_size = best_params.k; + + if (arguments.verbose) + { + std::cout << "Build index for:\n"; + std::cout << "db length " << meta.total_len << "bp\n"; + std::cout << "min local match length " << meta.segment_overlap() << "bp\n"; + std::cout << "max error rate " << arguments.error_rate << "\n"; + std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n'; + + find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate); + } + } + else + { + if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors) <= 1) + { + metadata ref_meta = metadata(arguments.ref_meta_path); + std::cout.precision(3); + std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n"; + std::cout << "min local match length " << arguments.pattern_size << "bp\n"; + std::cout << "error rate " << arguments.error_rate << "\n"; + std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n'; + + std::cout << "Applying heuristic threshold\n"; + std::cout << "use threshold " << find_threshold(space, ref_meta, meta, arguments, attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]); + } + } + + if (arguments.write_out) + { + if (arguments.split_index) + write_reference_segments(meta, arguments.meta_out); + else + write_query_segments(meta, arguments.meta_out); + } } } // namespace valik::app