From a07e911811bdbaa1df147a1b834e4bfa9bd2d688 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Mon, 29 Jan 2024 17:27:14 +0100 Subject: [PATCH] Deduce parameters as part of build --- .../utilities/threshold/filtering_request.hpp | 1 + include/utilities/threshold/find.hpp | 19 ++++- .../utilities/threshold/kmer_attributes.hpp | 2 +- include/utilities/threshold/shared.hpp | 9 +++ include/valik/argument_parsing/build.hpp | 4 + include/valik/shared.hpp | 2 + include/valik/split/metadata.hpp | 6 ++ src/argument_parsing/build.cpp | 77 ++++++++++++++----- src/argument_parsing/search.cpp | 2 +- src/threshold/find.cpp | 61 ++++++++++----- src/valik_split.cpp | 23 ------ test/cli/valik_options_test.cpp | 4 +- 12 files changed, 143 insertions(+), 67 deletions(-) diff --git a/include/utilities/threshold/filtering_request.hpp b/include/utilities/threshold/filtering_request.hpp index 886a3e7e..4a64d059 100755 --- a/include/utilities/threshold/filtering_request.hpp +++ b/include/utilities/threshold/filtering_request.hpp @@ -25,6 +25,7 @@ struct filtering_request size_t const bins) : s(ref_size), b(bins) { auto space = param_space(); + //!TODO: error handling if (errors > space.max_errors) std::cout << "Error: error count out of range\n"; else diff --git a/include/utilities/threshold/find.hpp b/include/utilities/threshold/find.hpp index ba95ca55..140fcb35 100755 --- a/include/utilities/threshold/find.hpp +++ b/include/utilities/threshold/find.hpp @@ -1,8 +1,8 @@ #pragma once -#include #include #include +#include namespace valik { @@ -12,8 +12,19 @@ namespace valik */ double param_score(filtering_request const & request, param_set const & params, kmer_attributes const & attr); -void get_best_params(param_space const & space, - filtering_request const & request, - std::vector const & attribute_vec); +/** + * @brief Look through the parameter space to find the best kmer size and threshold. +*/ +param_set get_best_params(param_space const & space, + filtering_request const & request, + std::vector const & attribute_vec); + +/** + * @brief For a chosen kmer size and some maximum error rate find the best threshold. +*/ +void find_thresholds_for_kmer_size(param_space const & space, + metadata const & meta, + kmer_attributes const att, + double const max_err); } // namespace valik diff --git a/include/utilities/threshold/kmer_attributes.hpp b/include/utilities/threshold/kmer_attributes.hpp index 85f53237..ff811232 100755 --- a/include/utilities/threshold/kmer_attributes.hpp +++ b/include/utilities/threshold/kmer_attributes.hpp @@ -67,7 +67,7 @@ struct kmer_attributes k(kmer_size), fn_conf_counts(count_err_conf_below_thresh()) { } - kmer_attributes(size_t const kmer_size, mat_t const & matrix) : k(kmer_size), fn_conf_counts(matrix) {} + kmer_attributes(size_t const kmer_size, mat_t const & matrix) : k(kmer_size), fn_conf_counts(matrix) { } /** * @brief False negative rate for a parameter set. diff --git a/include/utilities/threshold/shared.hpp b/include/utilities/threshold/shared.hpp index 57e32bbd..0145a968 100755 --- a/include/utilities/threshold/shared.hpp +++ b/include/utilities/threshold/shared.hpp @@ -54,6 +54,15 @@ float expected_kmer_occurrences(var_t const & bin_size, return (float) (bin_size - kmer_size + 1) / (float) pow(alphabet_size, kmer_size); } +/** + * @brief K-mer lemma threhold. +*/ +template +var_t kmer_lemma_threshold(var_t const l, var_t const k, var_t const e) +{ + return l - k + 1 - e * k; +} + /** * @brief Definition of the search space for the parameter tuning algorithm. * diff --git a/include/valik/argument_parsing/build.hpp b/include/valik/argument_parsing/build.hpp index 9a363844..7af10489 100644 --- a/include/valik/argument_parsing/build.hpp +++ b/include/valik/argument_parsing/build.hpp @@ -1,6 +1,10 @@ #pragma once #include +#include +#include +#include +#include namespace valik::app { diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index fb777079..4eec4039 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -77,12 +77,14 @@ struct build_arguments std::vector> bin_path{}; std::filesystem::path bin_file{}; std::filesystem::path out_path{"./"}; + float error_rate{0.05}; std::string size{}; uint64_t bins{64}; uint64_t bits{4096}; uint64_t hash{2}; bool compressed{false}; bool fast{false}; + bool verbose{false}; std::filesystem::path ref_meta_path{}; }; diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index 871f9a80..42075757 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -463,6 +463,12 @@ struct metadata return stdev / mean; } + + size_t segment_overlap() const + { + assert(segments.size() > 1); + return segments[0].len - segments[1].start; + } }; } // namespace valik diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp index da35a916..ae1cfc64 100644 --- a/src/argument_parsing/build.cpp +++ b/src/argument_parsing/build.cpp @@ -1,7 +1,4 @@ #include -#include - -#include namespace valik::app { @@ -13,33 +10,23 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments) sharg::config{.description = "File containing one file per line per bin when building from clustered sequences. " "Input sequence file when building from overlapping segments.", .validator = sharg::input_file_validator{}}); - parser.add_option(arguments.window_size, - sharg::config{.short_id = '\0', - .long_id = "window", - .description = "Choose the window size.", - .validator = positive_integer_validator{}}); - parser.add_option(arguments.kmer_size, - sharg::config{.short_id = '\0', - .long_id = "kmer", - .description = "Choose the kmer size.", - .validator = sharg::arithmetic_range_validator{1, 32}}); parser.add_option(arguments.out_path, sharg::config{.short_id = '\0', .long_id = "output", .description = "Provide an output filepath.", .required = true, .validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create, {}}}); + parser.add_option(arguments.error_rate, + sharg::config{.short_id = 'e', + .long_id = "error-rate", + .description = "Choose the upper bound for the maximum allowed error rate of a local match.", + .validator = sharg::arithmetic_range_validator{0.0f, 0.1f}}); parser.add_option(arguments.size, sharg::config{.short_id = '\0', .long_id = "size", .description = "Choose the size of the resulting IBF.", .required = true, .validator = size_validator{"\\d+\\s{0,1}[k,m,g,t,K,M,G,T]"}}); - parser.add_option(arguments.hash, - sharg::config{.short_id = '\0', - .long_id = "hash", - .description = "Choose the number of hashes.", - .validator = sharg::arithmetic_range_validator{1, 5}}); parser.add_flag(arguments.compressed, sharg::config{.short_id = '\0', .long_id = "compressed", @@ -60,6 +47,33 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments) sharg::config{.short_id = '\0', .long_id = "fast", .description = "Build the index in fast mode when few false negatives can be tolerated in the following search."}); + parser.add_flag(arguments.verbose, + sharg::config{.short_id = '\0', + .long_id = "verbose", + .description = "Print verbose output.", + .advanced = true}); + + ///////////////////////////////////////// + // Advanced options + ///////////////////////////////////////// + parser.add_option(arguments.window_size, + sharg::config{.short_id = '\0', + .long_id = "window", + .description = "Choose the window size.", + .advanced = true, + .validator = positive_integer_validator{}}); + parser.add_option(arguments.kmer_size, + sharg::config{.short_id = '\0', + .long_id = "kmer", + .description = "Choose the kmer size.", + .advanced = true, + .validator = sharg::arithmetic_range_validator{1, 32}}); + parser.add_option(arguments.hash, + sharg::config{.short_id = '\0', + .long_id = "hash", + .description = "Choose the number of hashes.", + .advanced = true, + .validator = sharg::arithmetic_range_validator{1, 5}}); } void run_build(sharg::parser & parser) @@ -99,6 +113,33 @@ void run_build(sharg::parser & parser) sequence_file_validator(arguments.bin_file); std::string bin_string{arguments.bin_file.string()}; arguments.bin_path.emplace_back(std::vector{bin_string}); + + if (!parser.is_option_set("kmer") && !parser.is_option_set("window")) + { + // ========================================== + // Search parameter tuning + // ========================================== + auto space = param_space(); + std::vector attr_vec; + if (!read_fn_confs(attr_vec)) + precalc_fn_confs(attr_vec); + + size_t errors = meta.segment_overlap() * arguments.error_rate; + filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count); + auto best_params = get_best_params(space, request, attr_vec); + + std::cout.precision(3); + if (arguments.verbose) + { + std::cout << "db length: " << meta.total_len << "bp\n"; + std::cout << "min local match length: " << meta.segment_overlap() << "bp\n"; + std::cout << "max error rate: " << arguments.error_rate << "\n"; + std::cout << "best kmer size: " << best_params.k << '\n'; + + find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate); + } + + } } // ========================================== diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index c63d8635..cd31367a 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -33,7 +33,7 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) sharg::config{.short_id = 'e', .long_id = "error-rate", .description = "Choose the maximum allowed error rate of a local match.", - .validator = sharg::arithmetic_range_validator{0.0f, 0.2f}}); + .validator = sharg::arithmetic_range_validator{0.0f, 0.1f}}); parser.add_option(arguments.pattern_size, sharg::config{.short_id = '\0', .long_id = "pattern", diff --git a/src/threshold/find.cpp b/src/threshold/find.cpp index 26fca48f..73f3af97 100755 --- a/src/threshold/find.cpp +++ b/src/threshold/find.cpp @@ -11,9 +11,9 @@ double param_score(filtering_request const & request, param_set const & params, return attr.fnr_for_param_set(request, params) + request.fpr(params.k) / (double) params.t; } -void get_best_params(param_space const & space, - filtering_request const & request, - std::vector const & attribute_vec) +param_set get_best_params(param_space const & space, + filtering_request const & request, + std::vector const & attribute_vec) { param_set best_params(attribute_vec[0].k, 1, space); auto best_score = param_score(request, best_params, attribute_vec[0]); @@ -48,25 +48,50 @@ void get_best_params(param_space const & space, fp_rates.push_back(request.fpr(att.k)); } - /* - std::cout << '\t'; - for (size_t t{1}; t <= scores[0].size(); t++) - std::cout << "t=" << t << '\t'; - std::cout << "FPR" << '\n'; - for (size_t i{0}; i < scores.size(); i++) + return best_params; +} + +/** + * @brief For a chosen kmer size and some maximum error rate find the best threshold. +*/ +void find_thresholds_for_kmer_size(param_space const & space, + metadata const & meta, + kmer_attributes const att, + double const max_err) +{ + std::cout.precision(3); + std::cout << "Recommended shared " << att.k << "-mer thresholds for different error rates\n"; + std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_bin\n"; + + auto best_params = param_set(att.k, space.max_thresh, space); + for (size_t errors{1}; errors < (meta.segment_overlap() * max_err); errors++) { - std::cout << "k=" << std::get<0>(space.kmer_range) + i << '\t'; - for (size_t j{0}; j < scores[0].size(); j++) + filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count); + std::cout << errors / (double) request.l << '\t'; + if (att.fnr_for_param_set(request, best_params) == 0) { - std::cout << fn_rates[i][j] << '\t'; + std::cout << "kmer lemma\t" << kmer_lemma_threshold(request.l, att.k, errors) << '\t' << 0; } - std::cout << '\t' << fp_rates[i] << '\n'; + else + { + std::cout << "heuristic\t"; + double best_score = request.l; + for (size_t t{1}; t <= space.max_thresh; t++) + { + auto params = param_set(att.k, t, space); + auto score = param_score(request, params, att); + if (score < best_score) + { + best_params = params; + best_score = score; + } + } + std::cout << best_params.t << '\t' << att.fnr_for_param_set(request, best_params); + } + + std::cout << '\t' << request.fpr(att.k) << '\n'; } - */ - - std::cout << best_params.k << '\t' << best_params.t << '\t' - << fn_rates[best_params.k - std::get<0>(space.kmer_range)][best_params.t - 1] << '\t' - << fp_rates[best_params.k - std::get<0>(space.kmer_range)] << '\n'; } + } // namespace valik diff --git a/src/valik_split.cpp b/src/valik_split.cpp index 289ff921..b363b248 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -18,29 +18,6 @@ void valik_split(split_arguments & arguments) metadata meta(arguments); meta.to_file(arguments.meta_out); - if (arguments.split_index) - { - // ========================================== - // Search parameter tuning - // ========================================== - auto space = param_space(); - std::vector attr_vec; - if (!read_fn_confs(attr_vec)) - precalc_fn_confs(attr_vec); - - std::cout << "db length: " << meta.total_len << "bp\n"; - std::cout << "min local match length: " << arguments.overlap << "bp\n"; - std::cout << "Recommended parameters depending on the chosen error rate\n\n"; - std::cout << "max_error_rate\tkmer_size\tthreshold\tFNR\tFP_per_bin\n"; - for (size_t errors{1}; errors <= std::round(arguments.overlap * 0.1); errors++) - { - std::cout.precision(3); - std::cout << errors / (double) arguments.overlap << '\t'; - filtering_request request(errors, arguments.overlap, meta.total_len, meta.seg_count); - get_best_params(space, request, attr_vec); - } - } - if (arguments.write_ref) write_reference_segments(meta, arguments.meta_out); if (arguments.write_query) diff --git a/test/cli/valik_options_test.cpp b/test/cli/valik_options_test.cpp index e31514ad..da0cc188 100644 --- a/test/cli/valik_options_test.cpp +++ b/test/cli/valik_options_test.cpp @@ -298,10 +298,10 @@ TEST_F(argparse_search, incorrect_error_rate) "--query ", data("query.fq"), "--index ", data("8bins19window.ibf"), "--output search.gff", - "--error-rate 0.21"); + "--error-rate 0.11"); EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option -e/--error-rate: Value 0.210000 is not in range [0.000000,0.200000].\n"}); + EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option -e/--error-rate: Value 0.110000 is not in range [0.000000,0.100000].\n"}); } TEST_F(argparse_search, not_dist_no_meta)