Skip to content

Commit

Permalink
Clean up output printing
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 8, 2024
1 parent 71fa45d commit ccdcc0e
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 64 deletions.
5 changes: 2 additions & 3 deletions include/utilities/threshold/basics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@ double expected_kmer_occurrences(var_t const & bin_size,
/**
* @brief K-mer lemma threshold.
*/
template <typename var_t>
var_t kmer_lemma_threshold(size_t const l, var_t const k, var_t const e)
inline size_t kmer_lemma_threshold(size_t const l, uint8_t const k, uint8_t const e)
{
if (((int64_t) l - k + 1) <= (int64_t) e*k)
if ((l - k + 1) <= (size_t) e*k)
return 0;

return l - k + 1 - e * k;
Expand Down
53 changes: 47 additions & 6 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
namespace valik
{

constexpr uint8_t query_every = 2; // query every 2nd pattern by default

/**
* @brief The user requests filtering by setting the following parameters.
*
Expand Down Expand Up @@ -90,21 +92,60 @@ struct filtering_request
return std::max(0.0, fpr);
}

/**
* @brief The probability of at least threshold k-mers matching spuriously between a query pattern and a reference bin.
*/
double pattern_spurious_match_prob(param_set const & params, uint64_t const & ref_s, size_t const ref_b) const
{
double fpr{1};
//!TODO: expected k-mer occurrences and pattern spurious match probability should be independent of the current sequence being processed
double p = std::min(1.0, expected_kmer_occurrences(std::round(ref_s / (double) ref_b), params.k));
/*
For parameters
s reference size
b number of bins
k kmer size
l min match length
t threshold
find the probability of t or more k-mers matching spuriously between a pattern of length l and a reference bin.
The probability of a pattern exceeding the threshold by chance is equal to 1 - P(0 k-mers match) -
P(1 k-mer matches) -
... -
P(t - 1 k-mers match).
Given p which is the probability of a single k-mer appearing spuriously in the reference bin and n = (l - k + 1) k-mers per pattern
the probability of 0 k-mers matching spuriously is P(0 k-mers match) = (1 - p)^n
and
the probability of 1 k-mer out of n matching is P(1 k-mer matches) = (n take 1) * p^1 * (1 - p)^(n - 1).
*/

size_t kmers_per_pattern = l - params.k + 1;
for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++)
{
fpr -= combinations(matching_kmer_count, kmers_per_pattern) *
pow(p, matching_kmer_count) *
pow(1 - p, kmers_per_pattern - matching_kmer_count);
}

return std::max(0.0, fpr);
}

/**
* @brief An approximation of the probability of a query segment having a spurious match in a reference bin.
*/
double fpr(param_set const & params, uint64_t const & query_seg_len) const
double fpr(param_set const & params, uint64_t const & query_seg_len, uint64_t const & ref_size, size_t const ref_bins) const
{
double pattern_p = pattern_spurious_match_prob(params);
double pattern_p = pattern_spurious_match_prob(params, ref_size, ref_bins);
/*
uint64_t independent_patterns_per_segment = query_seg_len / (double) l;
double independent_patterns_p = std::min(1.0, pattern_p * independent_patterns_per_segment);
std::cout << "independent_patterns_p\t" << independent_patterns_p << '\n';
*/
constexpr uint8_t query_every = 2; // query every 2nd pattern by default
uint64_t total_patterns_per_segment = std::round((query_seg_len - l + 1) / (double) query_every);
double total_patterns_p = std::min(1.0, pattern_p * total_patterns_per_segment);
std::cout << "total_patterns_p\t" << total_patterns_p << '\n';
return total_patterns_p;
}

Expand All @@ -114,8 +155,8 @@ struct filtering_request
uint64_t max_segment_len(param_set const & params) const
{
double pattern_p = pattern_spurious_match_prob(params);
size_t max_independent_patterns_per_segment = std::round(1.0f / pattern_p) - 1;
return l * max_independent_patterns_per_segment;
size_t max_patterns_per_segment = std::round(1.0 / pattern_p) - 1;
return l + query_every * (std::max(max_patterns_per_segment, (size_t) 2) - 1);
}
};

Expand Down
11 changes: 5 additions & 6 deletions include/utilities/threshold/find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,17 @@ param_set get_best_params(param_space const & space,
/**
* @brief For a chosen kmer size and error rate find the best threshold.
*/
uint8_t find_threshold(param_space const & space,
metadata const & ref_meta,
metadata const & query_meta,
split_arguments const & arguments,
kmer_attributes const att);
uint8_t find_heuristic_threshold(metadata const & ref_meta,
metadata const & query_meta,
split_arguments const & arguments,
kmer_attributes const attr);

/**
* @brief For a chosen kmer size and some maximum error rate find the best threshold.
*/
void find_thresholds_for_kmer_size(param_space const & space,
metadata const & meta,
kmer_attributes const att,
kmer_attributes const attr,
double const max_err);

} // namespace valik
4 changes: 3 additions & 1 deletion include/utilities/threshold/kmer_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ struct kmer_attributes
* @brief False negative rate for a parameter set.
*/
double fnr_for_param_set(filtering_request const & request, param_set const & params) const
{
{
if (kmer_lemma_threshold(request.l, params.k, request.e) > 1)
return 0.0;
return fn_conf_counts[params.t - 1][request.e][request.l] / (double) request.total_conf_count();
}

Expand Down
2 changes: 2 additions & 0 deletions include/utilities/threshold/param_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ struct param_set

param_set(uint8_t const kmer_size, uint8_t const thresh, param_space const & space) : k(kmer_size), t(thresh)
{
/*
if ((kmer_size < std::get<0>(space.kmer_range)) | (kmer_size > std::get<1>(space.kmer_range)))
{
throw std::runtime_error{"k=" + std::to_string(kmer_size) + " out of range [" +
Expand All @@ -30,6 +31,7 @@ struct param_set
throw std::runtime_error{"threshold=" + std::to_string(thresh) + " out of range [0, " +
std::to_string(space.max_thresh) + "]"};
}
*/
}
};

Expand Down
1 change: 0 additions & 1 deletion src/argument_parsing/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ void run_search(sharg::parser & parser)
if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1)
{
//!TODO: read in parameter metadata file
arguments.threshold = 2;
arguments.threshold_was_set = true;
}

Expand Down
5 changes: 5 additions & 0 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
.long_id = "error-rate",
.description = "Choose the upper bound for the maximum allowed error rate of a local match.",
.validator = sharg::arithmetic_range_validator{0.0f, 0.1f}});
parser.add_option(arguments.kmer_size,
sharg::config{.short_id = 'k',
.long_id = "kmer",
.description = "Choose the kmer size.",
.validator = positive_integer_validator{true}});
parser.add_option(arguments.seg_count_in,
sharg::config{.short_id = 'n',
.long_id = "seg-count",
Expand Down
57 changes: 25 additions & 32 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,77 +91,70 @@ param_set get_best_params(param_space const & space,
*/
void find_thresholds_for_kmer_size(param_space const & space,
metadata const & meta,
kmer_attributes const att,
kmer_attributes const attr,
double const max_err)
{
std::cout.precision(3);
std::cout << "Recommended shared " << std::to_string(att.k) << "-mer thresholds for different error rates\n";
std::cout << "Recommended shared " << std::to_string(attr.k) << "-mer thresholds for different error rates\n";
std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_query_seg_len\n";

auto best_params = param_set(att.k, space.max_thresh, space);
for (uint8_t errors{1}; errors < (meta.segment_overlap() * max_err); errors++)
auto best_params = param_set(attr.k, space.max_thresh, space);
for (uint8_t errors{1}; errors <= std::ceil(meta.segment_overlap() * max_err); errors++)
{
filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count);
std::cout << errors / (double) request.l << '\t';
if (att.fnr_for_param_set(request, best_params) == 0)
if (kmer_lemma_threshold(request.l, attr.k, errors) > 1)
{
std::cout << "kmer lemma\t" << std::to_string(kmer_lemma_threshold(request.l, att.k, errors)) << '\t' << 0;
best_params.t = kmer_lemma_threshold(request.l, attr.k, errors);
std::cout << "kmer lemma\t";
}
else
{
std::cout << "heuristic\t";
double best_score = request.l;
for (uint8_t t{1}; t <= space.max_thresh; t++)
{
auto params = param_set(att.k, t, space);
auto score = param_score(request, params, att);
auto params = param_set(attr.k, t, space);
auto score = param_score(request, params, attr);
if (score <= best_score)
{
best_params = params;
best_score = score;
}
}
std::cout << std::to_string(best_params.t) << '\t' << att.fnr_for_param_set(request, best_params);
}

std::cout << '\t' << request.pattern_spurious_match_prob(best_params) << '\t' << request.max_segment_len(best_params) << '\n';
std::cout << std::to_string(best_params.t) << '\t' << attr.fnr_for_param_set(request, best_params) << '\t'
<< request.pattern_spurious_match_prob(best_params) << '\n';
// << request.max_segment_len(best_params) << '\n';
}
}

/**
* @brief For a chosen kmer size and error rate find the best threshold.
* @brief For a chosen kmer size and error rate find the best heuristic threshold.
*/
uint8_t find_threshold(param_space const & space,
metadata const & ref_meta,
metadata const & query_meta,
split_arguments const & arguments,
kmer_attributes const att)
uint8_t find_heuristic_threshold(metadata const & ref_meta,
metadata const & query_meta,
split_arguments const & arguments,
kmer_attributes const attr)
{

param_space space;
filtering_request request(arguments.errors, arguments.pattern_size, ref_meta.total_len, ref_meta.seg_count);
auto best_params = param_set(arguments.kmer_size, space.max_thresh, space);
auto best_thresh = space.max_thresh;
double best_score = arguments.pattern_size;
for (uint8_t t{1}; t <= space.max_thresh; t++)
{
auto params = param_set(att.k, t, space);
auto score = param_score(request, params, att);
auto params = param_set(arguments.kmer_size, t, space);
auto score = param_score(request, params, attr);

if (score <= best_score)
{
best_params = params;
best_thresh = params.t;
best_score = score;
}
}

std::cout.precision(3);
if (arguments.verbose)
{
std::cout << "threshold " << std::to_string(best_params.t) << '\n';
std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n';
std::cout << "segment len " << std::to_string(std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n';
std::cout << "FPR " << request.fpr(best_params, std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n';
}
}

return best_params.t;
return best_thresh;
}

} // namespace valik
48 changes: 33 additions & 15 deletions src/valik_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,43 +28,61 @@ void valik_split(split_arguments & arguments)
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count);
if (arguments.split_index)
{
filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count);
auto best_params = get_best_params(space, request, attr_vec);
arguments.kmer_size = best_params.k;
kmer_attributes attr = attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)];

if (arguments.verbose)
{
std::cout << "Build index for:\n";
std::cout << "\n-----------Index build parameters-----------\n";
std::cout << "db length " << meta.total_len << "bp\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "max error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';

find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate);
find_thresholds_for_kmer_size(space, meta, attr, arguments.error_rate);
}
}
else
{
if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors) <= 1)
kmer_attributes attr = attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)];
auto best_threshold = kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors);
if (arguments.verbose)
{
metadata ref_meta = metadata(arguments.ref_meta_path);
auto best_threshold = find_threshold(space, ref_meta, meta, arguments, attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]);
std::cout.precision(3);
std::cout << "\n-----------Search parameters-----------\n";
std::cout << "segment len " << std::to_string((uint64_t) std::round(meta.total_len / (double) meta.seg_count)) << '\n';
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';
}

metadata ref_meta = metadata(arguments.ref_meta_path);
if (best_threshold <= 1)
{
best_threshold = find_heuristic_threshold(ref_meta, meta, arguments, attr);
if (arguments.verbose)
{
std::cout.precision(3);
std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';

std::cout << "Applying heuristic threshold\n";
std::cout << "use threshold " << best_threshold;
std::cout << "Apply heuristic threshold ";
}
}
}
else
{
if (arguments.verbose)
std::cout << "Use k-mer lemma threshold ";
}

if (arguments.verbose)
{
param_set best_params(arguments.kmer_size, best_threshold, space);
std::cout << best_threshold << '\n';
std::cout << "FNR " << attr.fnr_for_param_set(request, best_params) << '\n';
std::cout << "FPR " << request.fpr(best_params, std::round(meta.total_len / (double) meta.seg_count), ref_meta.total_len, ref_meta.seg_count) << '\n';
}
}
}

if (arguments.write_out)
Expand Down

0 comments on commit ccdcc0e

Please sign in to comment.