Skip to content

Commit

Permalink
[TEST] heuristic threshold lib
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 6, 2024
1 parent 8151cfc commit 0d6624c
Show file tree
Hide file tree
Showing 10 changed files with 324 additions and 23 deletions.
12 changes: 7 additions & 5 deletions include/utilities/threshold/shared.hpp → include/utilities/threshold/basics.hpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,20 @@ val_t factorial(val_t const n)
*
*/
template <typename var_t, typename par_t>
float expected_kmer_occurrences(var_t const & bin_size,
double expected_kmer_occurrences(var_t const & bin_size,
par_t const & kmer_size)
{
constexpr uint8_t alphabet_size{4};
return (float) (bin_size - kmer_size + 1) / (float) pow(alphabet_size, kmer_size);
return (double) (bin_size - kmer_size + 1) / (double) pow(alphabet_size, kmer_size);
}

/**
* @brief K-mer lemma threhold.
* @brief K-mer lemma threshold.
*/
template <typename var_t>
var_t kmer_lemma_threshold(size_t const l, var_t const k, var_t const e)
{
if ((l - k + 1) <= e*k)
if (((int64_t) l - k + 1) <= (int64_t) e*k)
return 0;

return l - k + 1 - e * k;
Expand All @@ -76,7 +76,7 @@ var_t kmer_lemma_threshold(size_t const l, var_t const k, var_t const e)
*/
struct param_space
{
constexpr static uint8_t max_errors{15};
constexpr static uint8_t max_errors{5};
constexpr static uint8_t max_thresh{5};
constexpr static size_t max_len{150};
constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 21};
Expand All @@ -87,6 +87,8 @@ struct param_space
*
* Same as the number of different error configurations if n=sequence length and k=error_count.
*/
//!TODO: need strong types to not mess up the order of these parameters
// should this be an inline function?
template <typename par_t, typename val_t>
val_t combinations(par_t const k, size_t const n)
{
Expand Down
14 changes: 7 additions & 7 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once

#include <utilities/threshold/shared.hpp>
#include <utilities/threshold/basics.hpp>
#include <utilities/threshold/param_set.hpp>

namespace valik
Expand All @@ -10,7 +10,7 @@ namespace valik
* @brief The user requests filtering by setting the following parameters.
*
* @param e Number of errors.
* @param l Minimum length of local match.
* @param l Minimum length of local match i.e pattern length.
* @param s Reference size in bases.
* @param b Number of reference segments.
*/
Expand All @@ -26,12 +26,12 @@ struct filtering_request
size_t const bins) : e(errors), l(min_len), s(ref_size), b(bins)
{
auto space = param_space();
//!TODO: error handling
if (errors > space.max_errors)
std::cout << "Error: error count out of range\n";

throw std::runtime_error{"error_count=" + std::to_string(errors) + " out of range [0, " +
std::to_string(space.max_errors) + "]"};
if (min_len > space.max_len)
std::cout << "Error: min len out of range\n";
throw std::runtime_error{"min_len=" + std::to_string(min_len) + " out of range [0, " +
std::to_string(space.max_len) + "]"};
}

/**
Expand All @@ -47,7 +47,7 @@ struct filtering_request
*/
double kmer_spurious_match_prob(uint8_t const kmer_size) const
{
return std::min(1.0f, expected_kmer_occurrences(std::round(s / (float) b), kmer_size));
return std::min(1.0, expected_kmer_occurrences(std::round(s / (double) b), kmer_size));
}

/**
Expand Down
8 changes: 3 additions & 5 deletions include/utilities/threshold/kmer_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,12 @@ struct kmer_attributes
else
{
row.push_back(row.back() + table.back()[l - 1]);

size_t band = l - k - t;
row.back() -= table.back()[band + t - 1];
row.back() -= table.back()[l - k - 1];

for (uint8_t i = 1; i < t; i++)
{
row.back() -= matrix[matrix.size() - i][e - 1][band + t - i - 1];
row.back() += matrix[matrix.size() - i][e - 1][band + t - i];
row.back() -= matrix[matrix.size() - i][e - 1][l - k - i - 1];
row.back() += matrix[matrix.size() - i][e - 1][l - k - i];
}
}
}
Expand Down
15 changes: 11 additions & 4 deletions include/utilities/threshold/param_set.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once

#include <utilities/threshold/shared.hpp>
#include <utilities/threshold/basics.hpp>

namespace valik
{
Expand All @@ -19,10 +19,17 @@ struct param_set
param_set(uint8_t const kmer_size, uint8_t const thresh, param_space const & space) : k(kmer_size), t(thresh)
{
if ((kmer_size < std::get<0>(space.kmer_range)) | (kmer_size > std::get<1>(space.kmer_range)))
std::cout << "Error: k out of range\n";

{
throw std::runtime_error{"k=" + std::to_string(kmer_size) + " out of range [" +
std::to_string(std::get<0>(space.kmer_range)) + ", " +
std::to_string(std::get<1>(space.kmer_range)) + "]"};
}

if (thresh > space.max_thresh)
std::cout << "Error: thresh out of range\n";
{
throw std::runtime_error{"threshold=" + std::to_string(thresh) + " out of range [0, " +
std::to_string(space.max_thresh) + "]"};
}
}
};

Expand Down
2 changes: 1 addition & 1 deletion include/valik/search/prefilter_queries_parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <valik/search/query_record.hpp>
#include <valik/search/sync_out.hpp>
#include <utilities/cart_queue.hpp>
#include <utilities/threshold/shared.hpp>
#include <utilities/threshold/basics.hpp>

#include <raptor/threshold/threshold.hpp>

Expand Down
2 changes: 1 addition & 1 deletion include/valik/split/split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#include <utilities/threshold/find.hpp>
#include <utilities/threshold/io.hpp>
#include <utilities/threshold/shared.hpp>
#include <utilities/threshold/basics.hpp>
#include <valik/argument_parsing/shared.hpp>
#include <valik/shared.hpp>
#include <valik/split/metadata.hpp>
Expand Down
2 changes: 2 additions & 0 deletions test/api/utilities/threshold/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
add_api_test (basics_test.cpp)
add_api_test (kmer_attributes_test.cpp)
add_api_test (param_set_test.cpp)
75 changes: 75 additions & 0 deletions test/api/utilities/threshold/basics_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include <gtest/gtest.h>

#include <utilities/threshold/basics.hpp>

TEST(factorial, manual_check)
{
EXPECT_EQ(valik::factorial((uint64_t) 0), 1);
EXPECT_EQ(valik::factorial((uint64_t) 1), 1);
EXPECT_EQ(valik::factorial((uint64_t)7), 5040);
}

TEST(factorial, stirling_accuracy)
{
double accuracy_bound = 0.05; // allow 5% error
std::vector<uint64_t> test_n{5, 10, 12, 17};

for (auto n : test_n)
EXPECT_TRUE(std::abs(1 - valik::exact_factorial(n) / (double) valik::stirling_factorial(n)) < accuracy_bound);
}

TEST(kmer_occurrences, small)
{
uint8_t kmer_size = 4;
uint64_t genome_len = 256 + kmer_size - 1;
EXPECT_EQ(valik::expected_kmer_occurrences(genome_len, kmer_size), 1.0);
}

TEST(kmer_occurrences, human_genome)
{
uint64_t human_genome_len = 3e9;
uint8_t kmer_size = 16;
EXPECT_LE(valik::expected_kmer_occurrences(human_genome_len, kmer_size), 1.0);
}

TEST(kmer_lemma, manual_test)
{
size_t l = 100;
uint8_t k = 10;

uint8_t shared_kmer_count = l - k + 1; // 91
for (uint8_t e{0}; e < 9; e++)
EXPECT_EQ(valik::kmer_lemma_threshold(l, k, e), shared_kmer_count - k * e);

uint8_t e = 10;
EXPECT_EQ(valik::kmer_lemma_threshold(l, k, e), 0);

l = 15;
EXPECT_EQ(valik::kmer_lemma_threshold(l, k, e), 0);
}

TEST(combinations, edge_cases)
{
uint8_t k = 3;
size_t n = 3;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 1);

k = 4;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 0);

k = 0;
n = 0;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 1);
}

TEST(combinations, manual_test)
{
uint8_t k = 3;
size_t n = 4;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 4);
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), (valik::combinations<uint8_t, uint64_t>(n - k, n)));

n = 5;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 10);
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), (valik::combinations<uint8_t, uint64_t>(n - k, n)));
}
154 changes: 154 additions & 0 deletions test/api/utilities/threshold/kmer_attributes_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,157 @@ TEST(kmer_attributes, equal_after_serialization)
}
}
}

TEST(kmer_attributes, basic_checks)
{
valik::param_space space;

std::vector<valik::kmer_attributes> attr_vec;
if (!valik::read_fn_confs(attr_vec))
valik::precalc_fn_confs(attr_vec);

EXPECT_EQ(attr_vec.size(), std::get<1>(space.kmer_range) - std::get<0>(space.kmer_range) + 1);

for (auto & attr : attr_vec)
{
EXPECT_EQ(attr.fn_conf_counts.size(), space.max_thresh);
for (auto & thresh_table : attr.fn_conf_counts)
{
EXPECT_EQ(thresh_table.size(), space.max_errors + 1);
for (auto & error_row : thresh_table)
{
EXPECT_EQ(error_row.size(), space.max_len + 1);
}
}
}
}

// Test: case where there are no shared k-mers because the sequence length is less than the k-mer length
// the conf count is equal to (l take e) because all possible error configuration destroy all shared k-mers
static void check_len_less_than_kmer_size(valik::kmer_attributes const & attr)
{
for (auto & thresh_table : attr.fn_conf_counts)
{
for (size_t e{0}; e < thresh_table.size(); e++)
{
auto error_row = thresh_table[e];
for (size_t l{0}; l < attr.k; l++)
{
EXPECT_EQ(error_row[l], (valik::combinations<size_t, uint64_t>(e, l)));
}
}
}
}

// Test: case where the thresh parameter is less or equal to the k-mer lemma threshold
// the conf count is 0 because no error configuration destroys more than the allowed number of k-mers
static void check_thresh_from_kmer_lemma(valik::kmer_attributes const & attr)
{
for (uint8_t t{1}; t <= attr.fn_conf_counts.size(); t++)
{
auto thresh_table = attr.fn_conf_counts[t - 1];
for (uint8_t e{0}; e < thresh_table.size(); e++)
{
auto error_row = thresh_table[e];
for (size_t l{0}; l < error_row.size(); l++)
{
if (valik::kmer_lemma_threshold(l, attr.k, e) >= t)
{
if (error_row[l] != 0)
{
std::cout << "l\t" << std::to_string(l) << '\n';
std::cout << "k\t" << std::to_string(attr.k) << '\n';
std::cout << "e\t" << std::to_string(e) << '\n';
std::cout << "t\t" << std::to_string(t) << '\n';
}
EXPECT_EQ(error_row[l], 0);
}
}
}
}
}

TEST(kmer_attributes, edge_cases)
{
std::vector<valik::kmer_attributes> attr_vec;
if (!valik::read_fn_confs(attr_vec))
valik::precalc_fn_confs(attr_vec);

/*
size_t k = 9;
size_t t = 5;
size_t e = 15;
size_t l = 148;
// what it should be
for (size_t i{1}; i <= k; i++)
std::cout << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i]) << '\n';
std::cout << "extra\n";
for (size_t j{1}; j < t; j++)
std::cout << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j]) << '\n';
std::cout << "------------------shortcut------------------\n";
//row.push_back(row.back() + table.back()[l - 1]);
std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e][l - 1]) << '\n';
std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - 1]) << '\n';
//row.back() -= table.back()[band + t - 1];
std::cout << "-" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - k - 1]) << '\n';
for (uint8_t i = 1; i < t; i++)
{
//row.back() -= matrix[matrix.size() - i][e - 1][band + t - i - 1];
std::cout << "-" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - i][e - 1][l - k - i - 1]) << '\n';
//row.back() += matrix[matrix.size() - i][e - 1][band + t - i];
std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - i][e - 1][l - k - i]) << '\n';
}
*/
for (auto & attr : attr_vec)
{
check_len_less_than_kmer_size(attr);
check_thresh_from_kmer_lemma(attr);
}
}

TEST(kmer_attributes, exhaustive_comparison)
{
std::vector<valik::kmer_attributes> attr_vec;
if (!valik::read_fn_confs(attr_vec))
valik::precalc_fn_confs(attr_vec);

//e = 15; // why does this not work?
valik::param_space space;
for (uint8_t k{std::get<0>(space.kmer_range)}; k <= std::get<1>(space.kmer_range); k++)
{
auto kmer_attr = attr_vec[k - 9];
for (size_t t{1}; t <= space.max_thresh; t++)
{
for (size_t e{1}; e <= space.max_errors; e++)
{
for (size_t l = k + t - 1; l <= space.max_len; l++)
{
uint64_t calc_sum = attr_vec[k - 9].fn_conf_counts[t - 1][e][l];
uint64_t proper_sum{0};
// what it should be
for (size_t i{1}; i <= k; i++)
proper_sum += attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i];

uint64_t row_sum = proper_sum;

for (size_t j{1}; j < t; j++)
proper_sum += attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j];

if (calc_sum != proper_sum)
{
std::cout << "row_sum\tproper_sum\tcalc_sum\tk\tt\te\tl\n";
std::cout << row_sum << '\t' << proper_sum << '\t' << calc_sum << '\t' << std::to_string(k) << '\t' << t << '\t' << e << '\t' << l << '\n';
}
EXPECT_EQ(proper_sum, calc_sum);
}
}
}
}
}
Loading

0 comments on commit 0d6624c

Please sign in to comment.