Skip to content

Commit

Permalink
Automatic parameter deduction in search 1/2
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Jan 29, 2024
1 parent a07e911 commit 5eba4da
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 16 deletions.
8 changes: 8 additions & 0 deletions include/utilities/threshold/find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ param_set get_best_params(param_space const & space,
filtering_request const & request,
std::vector<kmer_attributes> const & attribute_vec);

/**
* @brief For a chosen kmer size and error rate find the best threshold.
*/
size_t find_threshold(param_space const & space,
metadata const & meta,
search_arguments const & arguments,
kmer_attributes const att);

/**
* @brief For a chosen kmer size and some maximum error rate find the best threshold.
*/
Expand Down
3 changes: 3 additions & 0 deletions include/utilities/threshold/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ float expected_kmer_occurrences(var_t const & bin_size,
template <typename var_t>
var_t kmer_lemma_threshold(var_t const l, var_t const k, var_t const e)
{
if ((l - k + 1) <= e*k)
return 0;

return l - k + 1 - e * k;
}

Expand Down
2 changes: 2 additions & 0 deletions include/valik/argument_parsing/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#pragma once

#include <utilities/threshold/io.hpp>
#include <utilities/threshold/find.hpp>
#include <valik/argument_parsing/shared.hpp>

namespace valik::app
Expand Down
14 changes: 7 additions & 7 deletions src/argument_parsing/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ void run_build(sharg::parser & parser)
if (!parser.is_option_set("kmer") && !parser.is_option_set("window"))
{
// ==========================================
// Search parameter tuning
// Parameter deduction
// ==========================================
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
Expand All @@ -127,18 +127,18 @@ void run_build(sharg::parser & parser)
size_t errors = meta.segment_overlap() * arguments.error_rate;
filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count);
auto best_params = get_best_params(space, request, attr_vec);
arguments.kmer_size = best_params.k;

std::cout.precision(3);
if (arguments.verbose)
{
std::cout << "db length: " << meta.total_len << "bp\n";
std::cout << "min local match length: " << meta.segment_overlap() << "bp\n";
std::cout << "max error rate: " << arguments.error_rate << "\n";
std::cout << "best kmer size: " << best_params.k << '\n';
std::cout << "Build index for:\n";
std::cout << "db length " << meta.total_len << "bp\n";
std::cout << "min local match length " << meta.segment_overlap() << "bp\n";
std::cout << "max error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << best_params.k << '\n';

find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate);
}

}
}

Expand Down
42 changes: 35 additions & 7 deletions src/argument_parsing/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,6 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments)
.long_id = "query-meta",
.description = "Path to query genome metadata for finding all local alignment between two long sequences.",
.validator = sharg::input_file_validator{}});
parser.add_option(arguments.threshold,
sharg::config{.short_id = '\0',
.long_id = "threshold",
.description = "If set, this threshold is used instead of the kmer lemma.",
.advanced = true,
.validator = sharg::arithmetic_range_validator{1, 100}});
parser.add_flag(arguments.distribute,
sharg::config{.short_id = '\0',
.long_id = "distribute",
Expand All @@ -85,6 +79,12 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments)
/////////////////////////////////////////
// Advanced options
/////////////////////////////////////////
parser.add_option(arguments.threshold,
sharg::config{.short_id = '\0',
.long_id = "threshold",
.description = "If set, this threshold is used instead of the kmer lemma or probabilistic minimiser threshold.",
.advanced = true,
.validator = sharg::arithmetic_range_validator{1, 500}});
parser.add_flag(arguments.cache_thresholds,
sharg::config{.short_id = '\0',
.long_id = "cache-thresholds",
Expand Down Expand Up @@ -216,6 +216,9 @@ void run_search(sharg::parser & parser)
arguments.bin_path = tmp.bin_path();
}

//!TODO: why no kmer length??
std::cout << "kmer_length\t" << arguments.shape_size << '\n';

// ==========================================
// Process --pattern.
// ==========================================
Expand All @@ -237,10 +240,35 @@ void run_search(sharg::parser & parser)
}
}

// ==========================================
// Parameter deduction
// ==========================================
//!TODO: the parameter deduction should not necessarily depend on the metadata
// also the metadata is deserialised twice currently
if (!arguments.ref_meta_path.empty() &&
kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1)
{
metadata meta = metadata(arguments.ref_meta_path);
std::cout.precision(3);
std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << arguments.shape_size << '\n';

std::cout << "Applying heuristic threshold\n";
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

arguments.threshold = find_threshold(space, meta, arguments, attr_vec[arguments.shape_size - std::get<0>(space.kmer_range)]);
arguments.threshold_was_set = true;
}

// ==========================================
// Process --threshold.
// ==========================================
if (parser.is_option_set("threshold"))
if (arguments.threshold_was_set || parser.is_option_set("threshold"))
{
arguments.threshold_was_set = true; // use raptor::threshold_kinds::percentage
arguments.threshold_percentage = arguments.threshold / (double) (arguments.pattern_size - arguments.shape.size() + 1);
Expand Down
38 changes: 36 additions & 2 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,41 @@ param_set get_best_params(param_space const & space,
}

/**
* @brief For a chosen kmer size and some maximum error rate find the best threshold.
* @brief For a chosen kmer size and error rate find the best threshold.
*/
size_t find_threshold(param_space const & space,
metadata const & meta,
search_arguments const & arguments,
kmer_attributes const att)
{

filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count);
auto best_params = param_set(arguments.shape_size, space.max_thresh, space);
double best_score = arguments.pattern_size;
for (size_t t{1}; t <= space.max_thresh; t++)
{
auto params = param_set(att.k, t, space);
auto score = param_score(request, params, att);
if (score <= best_score)
{
best_params = params;
best_score = score;
}
}

std::cout.precision(3);
if (arguments.verbose)
{
std::cout << "threshold " << best_params.t << '\n';
std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n';
std::cout << "FP_per_bin " << request.fpr(att.k) << '\n';
}

return best_params.t;
}

/**
* @brief For a chosen kmer size and up to some maximum error rate find the best thresholds.
*/
void find_thresholds_for_kmer_size(param_space const & space,
metadata const & meta,
Expand Down Expand Up @@ -80,7 +114,7 @@ void find_thresholds_for_kmer_size(param_space const & space,
{
auto params = param_set(att.k, t, space);
auto score = param_score(request, params, att);
if (score < best_score)
if (score <= best_score)
{
best_params = params;
best_score = score;
Expand Down

0 comments on commit 5eba4da

Please sign in to comment.