Skip to content

Commit

Permalink
Check FPR of heuristic threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed May 8, 2024
1 parent cb4abb9 commit b72b46e
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 39 deletions.
16 changes: 16 additions & 0 deletions include/utilities/threshold/find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
namespace valik
{

//!TODO: determine suitable parameters
constexpr double FPR_UPPER{0.1};
constexpr double FNR_UPPER{0.05};
constexpr uint8_t THRESH_LOWER{4};
constexpr size_t PATTERNS_PER_SEGMENT{5000};

/**
* @brief The false positive probability of a query segment that contains partially overlapping patterns.
*/
Expand All @@ -17,6 +23,16 @@ inline double segment_fpr(double const pattern_p, size_t const patterns_per_segm
return std::min(1 - none_match_p, 1.0);
}

/**
* @brief The maximum length of a query segment that does not appear spuriously in reference bins.
*/
inline uint64_t max_segment_len(double const pattern_p, size_t const pattern_size, uint8_t query_every)
{
//!TODO: this const is the *minimum* number of patterns per segment
// higher values are possible if pattern_p is small
return pattern_size + query_every * (PATTERNS_PER_SEGMENT - 1);
}

/**
* @brief Score of the objective function for a parameter set. Smaller values are better.
*/
Expand Down
11 changes: 5 additions & 6 deletions include/utilities/threshold/search_kmer_profile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ struct search_error_profile
search_kind search_type{search_kind::LEMMA};
double fnr;
double fp_per_pattern;
uint64_t max_segment_len;

search_error_profile() noexcept = default;
search_error_profile(search_error_profile const &) noexcept = default;
Expand All @@ -37,9 +36,9 @@ struct search_error_profile
~search_error_profile() noexcept = default;

search_error_profile(param_set const & best_params, search_pattern const & eps_pattern, search_kind const which_search,
double const false_neg, double const false_pos, uint64_t const max_len) :
double const false_neg, double const false_pos) :
params(best_params), pattern(eps_pattern), search_type(which_search), fnr(false_neg),
fp_per_pattern(false_pos), max_segment_len(max_len) {}
fp_per_pattern(false_pos) {}

search_error_profile(param_set const & best_params, search_pattern const & eps_pattern, search_kind const which_search) :
params(best_params), pattern(eps_pattern), search_type(which_search)
Expand All @@ -53,7 +52,7 @@ struct search_error_profile
void serialize(Archive & archive)
{
auto type_int = static_cast<int>(search_type);
archive(params, pattern, type_int, fnr, fp_per_pattern, max_segment_len);
archive(params, pattern, type_int, fnr, fp_per_pattern);
search_type = static_cast<search_kind>(type_int);
}

Expand All @@ -70,7 +69,7 @@ struct search_error_profile
case search_kind::LEMMA: std::cout << "kmer lemma"; break;
default: break;
}
std::cout << '\t' << std::to_string(params.t) << '\t' << fnr << '\t' << fp_per_pattern << '\t' << max_segment_len << '\n';
std::cout << '\t' << std::to_string(params.t) << '\t' << fnr << '\t' << fp_per_pattern << '\n';
}
}
}
Expand Down Expand Up @@ -158,7 +157,7 @@ struct search_kmer_profile
std::cout.precision(3);
std::cout << "\nRecommended shared " << std::to_string(k) << "-mer thresholds for matches of (min_length=" << std::to_string(l)
<< "; max_error_rate=" << max_error_rate() << ")\n";
std::cout << "errors\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_segment_len\n";
std::cout << "errors\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\n";

for (uint8_t er{0}; er <= max_errors; er++)
{
Expand Down
15 changes: 0 additions & 15 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,21 +651,6 @@ struct metadata

return std::max(0.0, fpr - precision);
}

/**
* @brief The maximum length of a query segment that does not appear spuriously in reference bins.
*/
uint64_t max_segment_len(param_set const & params) const
{
double fp_per_pattern = pattern_spurious_match_prob(params);
if (fp_per_pattern < 9e-6) // avoid very small floating point numbers
return 1e4;

constexpr double FPR_LIMIT = 0.05; // allow FPR of 5% per query segment
size_t max_patterns_per_segment = std::floor(log(1 - FPR_LIMIT) / log(1 - fp_per_pattern));

return pattern_size + query_every * (std::max(max_patterns_per_segment, (size_t) 2) - 1);
}
};

} // namespace valik
3 changes: 2 additions & 1 deletion src/argument_parsing/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ void run_search(sharg::parser & parser)
}
else
{
//TODO: this can be removed?
if (arguments.split_query && arguments.manual_parameters)
{
throw std::runtime_error{"Provide the chosen number of query segments with --seg-count "
Expand Down Expand Up @@ -335,7 +336,7 @@ void run_search(sharg::parser & parser)
}
else
{
arguments.max_segment_len = error_profile.max_segment_len;
arguments.max_segment_len = max_segment_len(error_profile.fp_per_pattern, arguments.pattern_size, arguments.query_every);
arguments.threshold = error_profile.params.t;
arguments.threshold_was_set = true; // use raptor::threshold_kinds::percentage
if (arguments.threshold > arguments.pattern_size - arguments.shape.size() + 1)
Expand Down
29 changes: 12 additions & 17 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,6 @@
namespace valik
{

//!TODO: determine suitable parameters
constexpr double FPR_UPPER{0.1};
constexpr double FNR_UPPER{0.05};
constexpr uint8_t THRESH_LOWER{4};
constexpr size_t DIST_GRANULARITY_LOWER{100};
constexpr size_t PATTERNS_PER_SEGMENT{5000};

/**
* @brief For a pattern (with min length and max number of errors) find the best k-mer size and threshold for a given reference database.
*/
Expand Down Expand Up @@ -113,29 +106,31 @@ search_kmer_profile find_thresholds_for_kmer_size(metadata const & ref_meta,
if ((best_params.t < THRESH_LOWER) ||
segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER)
{
search_type = search_kind::HEURISTIC;
double best_score = pattern.l;

uint8_t t{1};
search_type = search_kind::STELLAR;
uint8_t t{best_params.t + 1u};
while ((t < space.max_thresh) && (attr.fnr_for_param_set(pattern, best_params) < FNR_UPPER))
{
best_params = param_set(attr.k, t, space);
param_set try_params = param_set(attr.k, t, space);
if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER)
{
best_params = std::move(try_params);
search_type = search_kind::HEURISTIC;
}
t++;
}
if (t >= space.max_thresh)
seqan3::debug_stream << "TODO: Need to calculate a bigger FNR table\n";
}

//!TODO: find segment length cutoff
uint64_t max_len = ref_meta.max_segment_len(best_params);
if ((pattern.l * DIST_GRANULARITY_LOWER) > max_len)
if (search_type == search_kind::STELLAR)
{
search_type = search_kind::STELLAR;
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type});
}
else
{
double fnr = attr.fnr_for_param_set(pattern, best_params);
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern, max_len});
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});
}
}

Expand Down

0 comments on commit b72b46e

Please sign in to comment.