Skip to content

Commit

Permalink
Deduce parameters as part of build
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Jan 29, 2024
1 parent 51421de commit a07e911
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 67 deletions.
1 change: 1 addition & 0 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ struct filtering_request
size_t const bins) : s(ref_size), b(bins)
{
auto space = param_space();
//!TODO: error handling
if (errors > space.max_errors)
std::cout << "Error: error count out of range\n";
else
Expand Down
19 changes: 15 additions & 4 deletions include/utilities/threshold/find.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#pragma once

#include <utilities/threshold/shared.hpp>
#include <utilities/threshold/filtering_request.hpp>
#include <utilities/threshold/kmer_attributes.hpp>
#include <valik/split/metadata.hpp>

namespace valik
{
Expand All @@ -12,8 +12,19 @@ namespace valik
*/
double param_score(filtering_request const & request, param_set const & params, kmer_attributes const & attr);

void get_best_params(param_space const & space,
filtering_request const & request,
std::vector<kmer_attributes> const & attribute_vec);
/**
* @brief Look through the parameter space to find the best kmer size and threshold.
*/
param_set get_best_params(param_space const & space,
filtering_request const & request,
std::vector<kmer_attributes> const & attribute_vec);

/**
* @brief For a chosen kmer size and some maximum error rate find the best threshold.
*/
void find_thresholds_for_kmer_size(param_space const & space,
metadata const & meta,
kmer_attributes const att,
double const max_err);

} // namespace valik
2 changes: 1 addition & 1 deletion include/utilities/threshold/kmer_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ struct kmer_attributes
k(kmer_size),
fn_conf_counts(count_err_conf_below_thresh()) { }

kmer_attributes(size_t const kmer_size, mat_t const & matrix) : k(kmer_size), fn_conf_counts(matrix) {}
kmer_attributes(size_t const kmer_size, mat_t const & matrix) : k(kmer_size), fn_conf_counts(matrix) { }

/**
* @brief False negative rate for a parameter set.
Expand Down
9 changes: 9 additions & 0 deletions include/utilities/threshold/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ float expected_kmer_occurrences(var_t const & bin_size,
return (float) (bin_size - kmer_size + 1) / (float) pow(alphabet_size, kmer_size);
}

/**
* @brief K-mer lemma threhold.
*/
template <typename var_t>
var_t kmer_lemma_threshold(var_t const l, var_t const k, var_t const e)
{
return l - k + 1 - e * k;
}

/**
* @brief Definition of the search space for the parameter tuning algorithm.
*
Expand Down
4 changes: 4 additions & 0 deletions include/valik/argument_parsing/build.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#pragma once

#include <valik/argument_parsing/shared.hpp>
#include <valik/build/build.hpp>
#include <valik/split/metadata.hpp>
#include <utilities/threshold/io.hpp>
#include <utilities/threshold/find.hpp>

namespace valik::app
{
Expand Down
2 changes: 2 additions & 0 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,14 @@ struct build_arguments
std::vector<std::vector<std::string>> bin_path{};
std::filesystem::path bin_file{};
std::filesystem::path out_path{"./"};
float error_rate{0.05};
std::string size{};
uint64_t bins{64};
uint64_t bits{4096};
uint64_t hash{2};
bool compressed{false};
bool fast{false};
bool verbose{false};

std::filesystem::path ref_meta_path{};
};
Expand Down
6 changes: 6 additions & 0 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,12 @@ struct metadata

return stdev / mean;
}

size_t segment_overlap() const
{
assert(segments.size() > 1);
return segments[0].len - segments[1].start;
}
};

} // namespace valik
77 changes: 59 additions & 18 deletions src/argument_parsing/build.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
#include <valik/argument_parsing/build.hpp>
#include <valik/build/build.hpp>

#include <valik/split/metadata.hpp>

namespace valik::app
{
Expand All @@ -13,33 +10,23 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments)
sharg::config{.description = "File containing one file per line per bin when building from clustered sequences. "
"Input sequence file when building from overlapping segments.",
.validator = sharg::input_file_validator{}});
parser.add_option(arguments.window_size,
sharg::config{.short_id = '\0',
.long_id = "window",
.description = "Choose the window size.",
.validator = positive_integer_validator{}});
parser.add_option(arguments.kmer_size,
sharg::config{.short_id = '\0',
.long_id = "kmer",
.description = "Choose the kmer size.",
.validator = sharg::arithmetic_range_validator{1, 32}});
parser.add_option(arguments.out_path,
sharg::config{.short_id = '\0',
.long_id = "output",
.description = "Provide an output filepath.",
.required = true,
.validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create, {}}});
parser.add_option(arguments.error_rate,
sharg::config{.short_id = 'e',
.long_id = "error-rate",
.description = "Choose the upper bound for the maximum allowed error rate of a local match.",
.validator = sharg::arithmetic_range_validator{0.0f, 0.1f}});
parser.add_option(arguments.size,
sharg::config{.short_id = '\0',
.long_id = "size",
.description = "Choose the size of the resulting IBF.",
.required = true,
.validator = size_validator{"\\d+\\s{0,1}[k,m,g,t,K,M,G,T]"}});
parser.add_option(arguments.hash,
sharg::config{.short_id = '\0',
.long_id = "hash",
.description = "Choose the number of hashes.",
.validator = sharg::arithmetic_range_validator{1, 5}});
parser.add_flag(arguments.compressed,
sharg::config{.short_id = '\0',
.long_id = "compressed",
Expand All @@ -60,6 +47,33 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments)
sharg::config{.short_id = '\0',
.long_id = "fast",
.description = "Build the index in fast mode when few false negatives can be tolerated in the following search."});
parser.add_flag(arguments.verbose,
sharg::config{.short_id = '\0',
.long_id = "verbose",
.description = "Print verbose output.",
.advanced = true});

/////////////////////////////////////////
// Advanced options
/////////////////////////////////////////
parser.add_option(arguments.window_size,
sharg::config{.short_id = '\0',
.long_id = "window",
.description = "Choose the window size.",
.advanced = true,
.validator = positive_integer_validator{}});
parser.add_option(arguments.kmer_size,
sharg::config{.short_id = '\0',
.long_id = "kmer",
.description = "Choose the kmer size.",
.advanced = true,
.validator = sharg::arithmetic_range_validator{1, 32}});
parser.add_option(arguments.hash,
sharg::config{.short_id = '\0',
.long_id = "hash",
.description = "Choose the number of hashes.",
.advanced = true,
.validator = sharg::arithmetic_range_validator{1, 5}});
}

void run_build(sharg::parser & parser)
Expand Down Expand Up @@ -99,6 +113,33 @@ void run_build(sharg::parser & parser)
sequence_file_validator(arguments.bin_file);
std::string bin_string{arguments.bin_file.string()};
arguments.bin_path.emplace_back(std::vector<std::string>{bin_string});

if (!parser.is_option_set("kmer") && !parser.is_option_set("window"))
{
// ==========================================
// Search parameter tuning
// ==========================================
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

size_t errors = meta.segment_overlap() * arguments.error_rate;
filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count);
auto best_params = get_best_params(space, request, attr_vec);

std::cout.precision(3);
if (arguments.verbose)
{
std::cout << "db length: " << meta.total_len << "bp\n";
std::cout << "min local match length: " << meta.segment_overlap() << "bp\n";
std::cout << "max error rate: " << arguments.error_rate << "\n";
std::cout << "best kmer size: " << best_params.k << '\n';

find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate);
}

}
}

// ==========================================
Expand Down
2 changes: 1 addition & 1 deletion src/argument_parsing/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments)
sharg::config{.short_id = 'e',
.long_id = "error-rate",
.description = "Choose the maximum allowed error rate of a local match.",
.validator = sharg::arithmetic_range_validator{0.0f, 0.2f}});
.validator = sharg::arithmetic_range_validator{0.0f, 0.1f}});
parser.add_option(arguments.pattern_size,
sharg::config{.short_id = '\0',
.long_id = "pattern",
Expand Down
61 changes: 43 additions & 18 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ double param_score(filtering_request const & request, param_set const & params,
return attr.fnr_for_param_set(request, params) + request.fpr(params.k) / (double) params.t;
}

void get_best_params(param_space const & space,
filtering_request const & request,
std::vector<kmer_attributes> const & attribute_vec)
param_set get_best_params(param_space const & space,
filtering_request const & request,
std::vector<kmer_attributes> const & attribute_vec)
{
param_set best_params(attribute_vec[0].k, 1, space);
auto best_score = param_score(request, best_params, attribute_vec[0]);
Expand Down Expand Up @@ -48,25 +48,50 @@ void get_best_params(param_space const & space,
fp_rates.push_back(request.fpr(att.k));
}

/*
std::cout << '\t';
for (size_t t{1}; t <= scores[0].size(); t++)
std::cout << "t=" << t << '\t';
std::cout << "FPR" << '\n';
for (size_t i{0}; i < scores.size(); i++)
return best_params;
}

/**
* @brief For a chosen kmer size and some maximum error rate find the best threshold.
*/
void find_thresholds_for_kmer_size(param_space const & space,
metadata const & meta,
kmer_attributes const att,
double const max_err)
{
std::cout.precision(3);
std::cout << "Recommended shared " << att.k << "-mer thresholds for different error rates\n";
std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_bin\n";

auto best_params = param_set(att.k, space.max_thresh, space);
for (size_t errors{1}; errors < (meta.segment_overlap() * max_err); errors++)
{
std::cout << "k=" << std::get<0>(space.kmer_range) + i << '\t';
for (size_t j{0}; j < scores[0].size(); j++)
filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count);
std::cout << errors / (double) request.l << '\t';
if (att.fnr_for_param_set(request, best_params) == 0)
{
std::cout << fn_rates[i][j] << '\t';
std::cout << "kmer lemma\t" << kmer_lemma_threshold(request.l, att.k, errors) << '\t' << 0;
}
std::cout << '\t' << fp_rates[i] << '\n';
else
{
std::cout << "heuristic\t";
double best_score = request.l;
for (size_t t{1}; t <= space.max_thresh; t++)
{
auto params = param_set(att.k, t, space);
auto score = param_score(request, params, att);
if (score < best_score)
{
best_params = params;
best_score = score;
}
}
std::cout << best_params.t << '\t' << att.fnr_for_param_set(request, best_params);
}

std::cout << '\t' << request.fpr(att.k) << '\n';
}
*/

std::cout << best_params.k << '\t' << best_params.t << '\t'
<< fn_rates[best_params.k - std::get<0>(space.kmer_range)][best_params.t - 1] << '\t'
<< fp_rates[best_params.k - std::get<0>(space.kmer_range)] << '\n';
}


} // namespace valik
23 changes: 0 additions & 23 deletions src/valik_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,6 @@ void valik_split(split_arguments & arguments)
metadata meta(arguments);
meta.to_file(arguments.meta_out);

if (arguments.split_index)
{
// ==========================================
// Search parameter tuning
// ==========================================
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

std::cout << "db length: " << meta.total_len << "bp\n";
std::cout << "min local match length: " << arguments.overlap << "bp\n";
std::cout << "Recommended parameters depending on the chosen error rate\n\n";
std::cout << "max_error_rate\tkmer_size\tthreshold\tFNR\tFP_per_bin\n";
for (size_t errors{1}; errors <= std::round(arguments.overlap * 0.1); errors++)
{
std::cout.precision(3);
std::cout << errors / (double) arguments.overlap << '\t';
filtering_request request(errors, arguments.overlap, meta.total_len, meta.seg_count);
get_best_params(space, request, attr_vec);
}
}

if (arguments.write_ref)
write_reference_segments(meta, arguments.meta_out);
if (arguments.write_query)
Expand Down
4 changes: 2 additions & 2 deletions test/cli/valik_options_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,10 @@ TEST_F(argparse_search, incorrect_error_rate)
"--query ", data("query.fq"),
"--index ", data("8bins19window.ibf"),
"--output search.gff",
"--error-rate 0.21");
"--error-rate 0.11");
EXPECT_NE(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option -e/--error-rate: Value 0.210000 is not in range [0.000000,0.200000].\n"});
EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option -e/--error-rate: Value 0.110000 is not in range [0.000000,0.100000].\n"});
}

TEST_F(argparse_search, not_dist_no_meta)
Expand Down

0 comments on commit a07e911

Please sign in to comment.