Skip to content

Commit

Permalink
Calculate FPR for spuriously matching k-mers
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Jan 31, 2024
1 parent 394775a commit b046a5c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 23 deletions.
30 changes: 22 additions & 8 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <utilities/threshold/shared.hpp>
#include <utilities/threshold/param_set.hpp>

namespace valik
{
Expand All @@ -22,35 +23,48 @@ struct filtering_request

filtering_request(uint8_t const errors, size_t const min_len,
uint64_t const ref_size,
size_t const bins) : s(ref_size), b(bins)
size_t const bins) : e(errors), l(min_len), s(ref_size), b(bins)
{
auto space = param_space();
//!TODO: error handling
if (errors > space.max_errors)
std::cout << "Error: error count out of range\n";
else
e = errors;

if (min_len > space.max_len)
std::cout << "Error: min len out of range\n";
else
l = min_len;
}

/**
* @brief Total number of error configurations.
*/
uint64_t total_conf_count() const
{
return total_err_conf_count<size_t, uint64_t>(e, l);
return combinations<size_t, uint64_t>(e, l);
}

/**
* @brief Expected number of spuriously matching k-mers in a bin.
*/
double fpr(uint8_t const & kmer_size) const
double spurious_matches(uint8_t const kmer_size) const
{
return expected_kmer_occurrences(std::round(s / (float) b), kmer_size);
return std::min(1.0f, expected_kmer_occurrences(std::round(s / (float) b), kmer_size));
}

/**
* @brief The probability of at least threshold k-mers matching spuriously.
*/
double fpr(param_set const & params) const
{
double fpr{1};
double p = spurious_matches(params.k);
//std::cout << "Calc fpr\n";
for (uint8_t matching_kmer_count{1}; matching_kmer_count < params.t; matching_kmer_count++)
{
// std::cout << fpr << '\n';
fpr -= combinations<uint8_t, uint64_t>(matching_kmer_count, l) * pow(p, matching_kmer_count) * pow(1 - p, l - matching_kmer_count);
}

return fpr;
}
};

Expand Down
2 changes: 1 addition & 1 deletion include/utilities/threshold/kmer_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct kmer_attributes
{
int shared_base_count = l - e;
if ((int) (k + t) - 1 > shared_base_count) // all error distributions destroy enough k-mers
row.push_back(total_err_conf_count<size_t, uint64_t>(e, l));
row.push_back(combinations<size_t, uint64_t>(e, l));
else if (e == 0)
row.push_back(0);
else
Expand Down
15 changes: 7 additions & 8 deletions include/utilities/threshold/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,20 @@ struct param_space
};

/**
* @brief Total number of error configurations. Same as the number of combinations of len take error_count.
* @brief Number of combinations of n take k.
*
* @param error_count Number of errors.
* @param len Sequence length.
* Same as the number of different error configurations if n=sequence length and k=error_count.
*/
template <typename par_t, typename val_t>
val_t total_err_conf_count(par_t const error_count, size_t const len)
val_t combinations(par_t const k, size_t const n)
{
if (len >= error_count)
if (n >= k)
{
val_t combinations{1};
for (size_t i = len - error_count + 1; i <= len; i++)
for (size_t i = n - k + 1; i <= n; i++)
combinations *= i;
combinations = combinations / factorial(error_count);
return combinations; // same as (uint64_t) (factorial(len) / (factorial(len - error_count) * factorial(error_count)));
combinations = combinations / factorial(k);
return combinations; // same as (uint64_t) (factorial(n) / (factorial(n - k) * factorial(k)));
}
else
return (val_t) 0;
Expand Down
14 changes: 8 additions & 6 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace valik
*/
double param_score(filtering_request const & request, param_set const & params, kmer_attributes const & attr)
{
return attr.fnr_for_param_set(request, params) + request.fpr(params.k) / (double) params.t;
return attr.fnr_for_param_set(request, params) + request.fpr(params);
}

param_set get_best_params(param_space const & space,
Expand All @@ -20,23 +20,25 @@ param_set get_best_params(param_space const & space,

std::vector<std::vector<double>> scores;
std::vector<std::vector<double>> fn_rates;
std::vector<double> fp_rates;
std::vector<std::vector<double>> fp_rates;
scores.reserve(std::get<1>(space.kmer_range) - std::get<0>(space.kmer_range) + 1);
fn_rates.reserve(std::get<1>(space.kmer_range) - std::get<0>(space.kmer_range) + 1);
fp_rates.reserve(std::get<1>(space.kmer_range) - std::get<0>(space.kmer_range) + 1);
fp_rates.reserve(space.max_thresh);

for (size_t i{0}; i < attribute_vec.size(); i++)
{
auto att = attribute_vec[i];
std::vector<double> kmer_scores;
std::vector<double> kmer_fn;
std::vector<double> kmer_fp;
uint8_t k = att.k;
for (uint8_t t = 1; t <= space.max_thresh; t++)
{
param_set params(k, t, param_space());
auto score = param_score(request, params, att);
kmer_scores.push_back(score);
kmer_fn.push_back(att.fnr_for_param_set(request, params));
kmer_fp.push_back(request.fpr(params));
if (score < best_score)
{
best_score = score;
Expand All @@ -45,7 +47,7 @@ param_set get_best_params(param_space const & space,
}
scores.push_back(kmer_scores);
fn_rates.push_back(kmer_fn);
fp_rates.push_back(request.fpr(att.k));
fp_rates.push_back(kmer_fp);
}

return best_params;
Expand Down Expand Up @@ -79,7 +81,7 @@ uint8_t find_threshold(param_space const & space,
{
std::cout << "threshold " << std::to_string(best_params.t) << '\n';
std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n';
std::cout << "FP_per_bin " << request.fpr(att.k) << '\n';
std::cout << "FP_per_bin " << request.fpr(best_params) << '\n';
}

return best_params.t;
Expand Down Expand Up @@ -123,7 +125,7 @@ void find_thresholds_for_kmer_size(param_space const & space,
std::cout << std::to_string(best_params.t) << '\t' << att.fnr_for_param_set(request, best_params);
}

std::cout << '\t' << request.fpr(att.k) << '\n';
std::cout << '\t' << request.fpr(best_params) << '\n';
}
}

Expand Down

0 comments on commit b046a5c

Please sign in to comment.