Skip to content

Commit

Permalink
Argument deduction in split algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 4, 2024
1 parent b046a5c commit a8bab6d
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 106 deletions.
70 changes: 60 additions & 10 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,28 +43,78 @@ struct filtering_request
}

/**
* @brief Expected number of spuriously matching k-mers in a bin.
* @brief Probability of a k-mer appearing spuriously in a bin.
*/
double spurious_matches(uint8_t const kmer_size) const
double kmer_spurious_match_prob(uint8_t const kmer_size) const
{
return std::min(1.0f, expected_kmer_occurrences(std::round(s / (float) b), kmer_size));
}

/**
* @brief The probability of at least threshold k-mers matching spuriously.
* @brief The probability of at least threshold k-mers matching spuriously between a query pattern and a reference bin.
*/
double fpr(param_set const & params) const
double pattern_spurious_match_prob(param_set const & params) const
{
double fpr{1};
double p = spurious_matches(params.k);
//std::cout << "Calc fpr\n";
for (uint8_t matching_kmer_count{1}; matching_kmer_count < params.t; matching_kmer_count++)
double p = kmer_spurious_match_prob(params.k);
/*
For parameters
s reference size
b number of bins
k kmer size
l min match length
t threshold
find the probability of t or more k-mers matching spuriously between a pattern of length l and a reference bin.
The probability of a pattern exceeding the threshold by chance is equal to 1 - P(0 k-mers match) -
P(1 k-mer matches) -
... -
P(t - 1 k-mers match).
Given p which is the probability of a single k-mer appearing spuriously in the reference bin and n = (l - k + 1) k-mers per pattern
the probability of 0 k-mers matching spuriously is P(0 k-mers match) = (1 - p)^n
and
the probability of 1 k-mer out of n matching is P(1 k-mer matches) = (n take 1) * p^1 * (1 - p)^(n - 1).
*/

size_t kmers_per_pattern = l - params.k + 1;
for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++)
{
// std::cout << fpr << '\n';
fpr -= combinations<uint8_t, uint64_t>(matching_kmer_count, l) * pow(p, matching_kmer_count) * pow(1 - p, l - matching_kmer_count);
fpr -= combinations<uint8_t, uint64_t>(matching_kmer_count, kmers_per_pattern) *
pow(p, matching_kmer_count) *
pow(1 - p, kmers_per_pattern - matching_kmer_count);
}

return fpr;
return std::max(0.0, fpr);
}

/**
* @brief An approximation of the probability of a query segment having a spurious match in a reference bin.
*/
double fpr(param_set const & params, uint64_t const & query_seg_len, uint64_t const & overlap) const
{
double pattern_p = pattern_spurious_match_prob(params);
/*
uint64_t independent_patterns_per_segment = query_seg_len / (double) l;
double independent_patterns_p = std::min(1.0, pattern_p * independent_patterns_per_segment);
std::cout << "independent_patterns_p\t" << independent_patterns_p << '\n';
*/
uint64_t total_patterns_per_segment = std::round((query_seg_len - l + 1) / (double) (l - overlap));
double total_patterns_p = std::min(1.0, pattern_p * total_patterns_per_segment);
std::cout << "total_patterns_p\t" << total_patterns_p << '\n';
return total_patterns_p;
}

/**
* @brief The maximum length of a query segment that does not appear spuriously in reference bins.
*/
uint64_t max_segment_len(param_set const & params) const
{
double pattern_p = pattern_spurious_match_prob(params);
size_t max_independent_patterns_per_segment = std::round(1.0f / pattern_p) - 1;
return l * max_independent_patterns_per_segment;
}
};

Expand Down
5 changes: 3 additions & 2 deletions include/utilities/threshold/find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ param_set get_best_params(param_space const & space,
* @brief For a chosen kmer size and error rate find the best threshold.
*/
uint8_t find_threshold(param_space const & space,
metadata const & meta,
search_arguments const & arguments,
metadata const & ref_meta,
metadata const & query_meta,
split_arguments const & arguments,
kmer_attributes const att);

/**
Expand Down
2 changes: 0 additions & 2 deletions include/valik/argument_parsing/build.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
#include <valik/argument_parsing/shared.hpp>
#include <valik/build/build.hpp>
#include <valik/split/metadata.hpp>
#include <utilities/threshold/io.hpp>
#include <utilities/threshold/find.hpp>

namespace valik::app
{
Expand Down
11 changes: 8 additions & 3 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,14 @@ struct split_arguments
uint32_t seg_count{64};
uint32_t seg_count_in{64};
bool split_index{false};
bool write_ref{false};
bool write_query{false};
float error_rate{0.05};
uint8_t errors{0};
size_t pattern_size{100};
uint8_t kmer_size{20};
size_t threshold{};
std::filesystem::path ref_meta_path{};
bool write_out{false};
bool verbose{false};
};

struct build_arguments
Expand All @@ -77,7 +83,6 @@ struct build_arguments
std::vector<std::vector<std::string>> bin_path{};
std::filesystem::path bin_file{};
std::filesystem::path out_path{"./"};
float error_rate{0.05};
std::string size{};
uint64_t bins{64};
uint64_t bits{4096};
Expand Down
31 changes: 3 additions & 28 deletions src/argument_parsing/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments)
.description = "Provide an output filepath.",
.required = true,
.validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create, {}}});
parser.add_option(arguments.error_rate,
sharg::config{.short_id = 'e',
.long_id = "error-rate",
.description = "Choose the upper bound for the maximum allowed error rate of a local match.",
.validator = sharg::arithmetic_range_validator{0.0f, 0.1f}});
parser.add_option(arguments.size,
sharg::config{.short_id = '\0',
.long_id = "size",
Expand Down Expand Up @@ -116,29 +111,9 @@ void run_build(sharg::parser & parser)

if (!parser.is_option_set("kmer") && !parser.is_option_set("window"))
{
// ==========================================
// Parameter deduction
// ==========================================
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

size_t errors = meta.segment_overlap() * arguments.error_rate;
filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count);
auto best_params = get_best_params(space, request, attr_vec);
arguments.kmer_size = best_params.k;

if (arguments.verbose)
{
std::cout << "Build index for:\n";
std::cout << "db length " << meta.total_len << "bp\n";
std::cout << "min local match length " << meta.segment_overlap() << "bp\n";
std::cout << "max error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';

find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate);
}
//!TODO: read in parameter metadata file
arguments.kmer_size = 12;
arguments.window_size = 12;
}
}

Expand Down
21 changes: 3 additions & 18 deletions src/argument_parsing/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,25 +240,10 @@ void run_search(sharg::parser & parser)
// ==========================================
// Parameter deduction
// ==========================================
//!TODO: the parameter deduction should not necessarily depend on the metadata
// also the metadata is deserialised twice currently
if (!arguments.ref_meta_path.empty() &&
kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1)
if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.shape_size, (size_t) arguments.errors) <= 1)
{
metadata meta = metadata(arguments.ref_meta_path);
std::cout.precision(3);
std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.shape_size) << '\n';

std::cout << "Applying heuristic threshold\n";
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

arguments.threshold = find_threshold(space, meta, arguments, attr_vec[arguments.shape_size - std::get<0>(space.kmer_range)]);
//!TODO: read in parameter metadata file
arguments.threshold = 2;
arguments.threshold_was_set = true;
}

Expand Down
33 changes: 27 additions & 6 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
.long_id = "overlap",
.description = "Choose how much consecutive segments overlap.",
.validator = positive_integer_validator{true}});
parser.add_option(arguments.error_rate,
sharg::config{.short_id = 'e',
.long_id = "error-rate",
.description = "Choose the upper bound for the maximum allowed error rate of a local match.",
.validator = sharg::arithmetic_range_validator{0.0f, 0.1f}});
parser.add_option(arguments.seg_count_in,
sharg::config{.short_id = 'n',
.long_id = "seg-count",
Expand All @@ -31,15 +36,20 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
sharg::config{.short_id = '\0',
.long_id = "split-index",
.description = "Adjust the suggested segment count to create a multiple of 64 segments instead. This is suitable for building an IBF."});
parser.add_flag(arguments.write_ref,
parser.add_option(arguments.ref_meta_path,
sharg::config{.short_id = '\0',
.long_id = "ref-meta",
.description = "Path to reference metadata file created by split.",
.validator = sharg::input_file_validator{}});
parser.add_flag(arguments.write_out,
sharg::config{.short_id = '\0',
.long_id = "write-ref",
.description = "Write an output FASTA file for each segment.",
.description = "Write an output FASTA file for each reference segment or write all query segments into a single output FASTA file..",
.advanced = true});
parser.add_flag(arguments.write_query,
sharg::config{.short_id = '\0',
.long_id = "write-query",
.description = "Write segments into a single output FASTA file.",
parser.add_flag(arguments.verbose,
sharg::config{.short_id = 'v',
.long_id = "verbose",
.description = "Print verbose output.",
.advanced = true});

}
Expand All @@ -50,6 +60,17 @@ void run_split(sharg::parser & parser)
init_split_parser(parser, arguments);
try_parsing(parser);

arguments.errors = std::ceil(arguments.error_rate * arguments.pattern_size);

if (!parser.is_option_set("out"))
{
arguments.meta_out = arguments.seq_file;
arguments.meta_out.replace_extension("meta");
}

if (!arguments.split_index && !parser.is_option_set("ref-meta-path"))
throw sharg::parser_error{"Need to provide path to reference metadata to process a query database."};

// ==========================================
// Dispatch
// ==========================================
Expand Down
Loading

0 comments on commit a8bab6d

Please sign in to comment.