Skip to content

Commit

Permalink
Estimate k-mer (or minimiser) count for IBF bin size calculation
Browse files Browse the repository at this point in the history
Include IBF FPR in pattern false positive rate
  • Loading branch information
eaasna committed Apr 22, 2024
1 parent 0e2f78b commit b69f432
Show file tree
Hide file tree
Showing 49 changed files with 368 additions and 439 deletions.
3 changes: 2 additions & 1 deletion include/utilities/prepare/compute_bin_size.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

#include <valik/shared.hpp>
#include <valik/build/call_parallel_on_bins.hpp>
#include <utilities/prepare/cutoff.hpp>
#include <valik/split/metadata.hpp>
#include <utilities/prepare/parse_bin_paths.hpp>

namespace seqan::hibf::build
{
Expand Down
137 changes: 0 additions & 137 deletions include/utilities/prepare/cutoff.hpp

This file was deleted.

39 changes: 39 additions & 0 deletions include/utilities/prepare/parse_bin_paths.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#pragma once

#include <valik/shared.hpp>
#include <valik/split/metadata.hpp>

namespace valik
{

inline auto parse_bin_paths(build_arguments const & arguments)
{
std::vector<std::vector<std::string>> minimiser_files{};
if (arguments.bin_path.size() > 1)
{
for (std::vector<std::string> bin_files : arguments.bin_path)
{
std::vector<std::string> bin_headers;
for (std::filesystem::path file : bin_files)
{
bin_headers.push_back(file.replace_extension("minimiser"));
//seqan3::debug_stream << bin_headers[bin_headers.size() - 1] << '\n';
}
minimiser_files.push_back(bin_headers);
}
}
else
{
valik::metadata meta(arguments.ref_meta_path);
for (size_t bin{0}; bin < arguments.bins; bin++)
{
std::filesystem::path file = arguments.bin_path[0][0];
minimiser_files.emplace_back((std::vector<std::string>){file.replace_extension(std::to_string(bin) + ".minimiser")});
//seqan3::debug_stream << minimiser_files[minimiser_files.size() - 1][0] << '\n';
}
}

return minimiser_files;
}

} // namespace valik
2 changes: 1 addition & 1 deletion include/utilities/threshold/basics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,6 @@ inline uint64_t combinations(size_t const k, size_t const n)
return 0;
}

enum class search_kind {LEMMA, HEURISTIC, STELLAR};
enum class search_kind {LEMMA, HEURISTIC, MINIMISER, STELLAR};

} //namespace valik
22 changes: 0 additions & 22 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,28 +45,6 @@ struct filtering_request
return std::min(1 - none_match_p, 1.0);
}

/**
* @brief For a chosen kmer size and error rate find the best heuristic threshold.
*/
uint8_t find_heuristic_threshold(kmer_loss const attr) const
{
param_space space;
auto best_thresh = space.max_thresh;
double best_score = pattern.l;
for (uint8_t t{1}; t <= space.max_thresh; t++)
{
auto params = param_set(attr.k, t, space);
auto param_score = score(attr, pattern, params, ref_meta);

if (param_score <= best_score)
{
best_thresh = params.t;
best_score = param_score;
}
}

return best_thresh;
}
};

} // namespace valik
9 changes: 7 additions & 2 deletions include/utilities/threshold/find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@ namespace valik
inline double score(kmer_loss const & attr,
search_pattern const & pattern,
param_set const & params,
metadata const & ref_meta)
metadata const & ref_meta,
size_t const PATTERNS_PER_SEGMENT)
{
return attr.fnr_for_param_set(pattern, params) + ref_meta.pattern_spurious_match_prob(params);

double none_match_p = pow(1 - ref_meta.pattern_spurious_match_prob(params), PATTERNS_PER_SEGMENT);
double fpr = std::min(1 - none_match_p, 1.0);

return attr.fnr_for_param_set(pattern, params) + fpr;
}

/**
Expand Down
18 changes: 12 additions & 6 deletions include/utilities/threshold/search_kmer_profile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ struct search_error_profile
search_type = static_cast<search_kind>(type_int);
}

void print()
void print() const
{
switch (search_type)
{
Expand Down Expand Up @@ -148,15 +148,21 @@ struct search_kmer_profile
max_errors = error_table.size();
}

void print()
double max_error_rate() const
{
return max_errors / (double) l;
}

void print() const
{
std::cout.precision(3);
std::cout << "\nRecommended shared " << std::to_string(k) << "-mer thresholds for different error rates\n";
std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_segment_len\n";
std::cout << "\nRecommended shared " << std::to_string(k) << "-mer thresholds for matches of (min_length=" << std::to_string(l)
<< "; max_error_rate=" << max_error_rate() << ")\n";
std::cout << "errors\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_segment_len\n";

for (uint8_t er{1}; er <= max_errors; er++)
for (uint8_t er{0}; er <= max_errors; er++)
{
std::cout << er / (double) l << '\t';
std::cout << std::to_string(er) << '\t';
auto error_thresh = error_table.find(er);
if (error_thresh != error_table.end())
error_thresh->second.print();
Expand Down
Loading

0 comments on commit b69f432

Please sign in to comment.