Skip to content

Commit

Permalink
[FIX] numerical overflow error
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 7, 2024
1 parent 0d6624c commit a1f05f5
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 210 deletions.
73 changes: 36 additions & 37 deletions include/utilities/threshold/basics.hpp
Original file line number Diff line number Diff line change
@@ -1,44 +1,17 @@
#pragma once

#include <algorithm>
#include <iostream>
#include <fstream>
#include <cmath>
#include <numbers>
#include <numeric>
#include <vector>
#include <utility>
#include <filesystem>

namespace valik
{

template <typename val_t>
val_t stirling_factorial(val_t const n)
{
return sqrt(2 * std::numbers::pi * n) * pow((n / std::numbers::e), (double) n);
}

template <typename val_t>
val_t exact_factorial(val_t const n)
{
val_t fact{1};
for (size_t i = 2; i <= n; i++)
fact *= i;

return fact;
}

/**
* @brief Factorial of integer n.
*/
template <typename val_t>
val_t factorial(val_t const n)
{
if (n < 25)
return exact_factorial(n);
else
return stirling_factorial(n);
}

/**
* @brief Assuming k-mers are distributed uniformly randomly, what is the expected count of a k-mer in a bin.
*
Expand Down Expand Up @@ -76,7 +49,7 @@ var_t kmer_lemma_threshold(size_t const l, var_t const k, var_t const e)
*/
struct param_space
{
constexpr static uint8_t max_errors{5};
constexpr static uint8_t max_errors{15};
constexpr static uint8_t max_thresh{5};
constexpr static size_t max_len{150};
constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 21};
Expand All @@ -87,21 +60,47 @@ struct param_space
*
* Same as the number of different error configurations if n=sequence length and k=error_count.
*/
//!TODO: need strong types to not mess up the order of these parameters
// should this be an inline function?
template <typename par_t, typename val_t>
val_t combinations(par_t const k, size_t const n)
inline uint64_t combinations(size_t const k, size_t const n)
{
if (n >= k)
{
val_t combinations{1};
std::vector<size_t> divisors(k);
std::iota(divisors.begin(), divisors.end(), 1); // fill vector with 1, 2, ..., k
std::vector<bool> was_divided(k);

uint64_t combinations{1};
for (size_t i = n - k + 1; i <= n; i++)
{
// avoid overflowing 64bits
if (combinations >= (std::numeric_limits<uint64_t>::max() / n))
{
for (size_t i{0}; i < divisors.size(); i++)
{
if (!was_divided[i])
{
if (combinations % divisors[i] == 0)
{
combinations = combinations / divisors[i];
was_divided[i] = true;
}
}
}
}
combinations *= i;
combinations = combinations / factorial(k);
}

if (std::find(was_divided.begin(), was_divided.end(), false) != was_divided.end())
{
for (size_t i{0}; i < divisors.size(); i++)
{
if (!was_divided[i])
combinations = combinations / divisors[i];
}
}
return combinations; // same as (uint64_t) (factorial(n) / (factorial(n - k) * factorial(k)));
}
else
return (val_t) 0;
return 0;
}

} //namespace valik
Expand Down
8 changes: 4 additions & 4 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct filtering_request
*/
uint64_t total_conf_count() const
{
return combinations<size_t, uint64_t>(e, l);
return combinations(e, l);
}

/**
Expand Down Expand Up @@ -82,9 +82,9 @@ struct filtering_request
size_t kmers_per_pattern = l - params.k + 1;
for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++)
{
fpr -= combinations<uint8_t, uint64_t>(matching_kmer_count, kmers_per_pattern) *
pow(p, matching_kmer_count) *
pow(1 - p, kmers_per_pattern - matching_kmer_count);
fpr -= combinations(matching_kmer_count, kmers_per_pattern) *
pow(p, matching_kmer_count) *
pow(1 - p, kmers_per_pattern - matching_kmer_count);
}

return std::max(0.0, fpr);
Expand Down
6 changes: 3 additions & 3 deletions include/utilities/threshold/kmer_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ struct kmer_attributes
{
int shared_base_count = l - e;
if ((int) (k + t) - 1 > shared_base_count) // all error distributions destroy enough k-mers
row.push_back(combinations<size_t, uint64_t>(e, l));
else if (e == 0)
row.push_back(combinations(e, l));
else if (e == 0) // this has to be after the first condition; not everything in first row should be 0
row.push_back(0);
else
{
{
row.push_back(row.back() + table.back()[l - 1]);
row.back() -= table.back()[l - k - 1];

Expand Down
34 changes: 11 additions & 23 deletions test/api/utilities/threshold/basics_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,6 @@

#include <utilities/threshold/basics.hpp>

TEST(factorial, manual_check)
{
EXPECT_EQ(valik::factorial((uint64_t) 0), 1);
EXPECT_EQ(valik::factorial((uint64_t) 1), 1);
EXPECT_EQ(valik::factorial((uint64_t)7), 5040);
}

TEST(factorial, stirling_accuracy)
{
double accuracy_bound = 0.05; // allow 5% error
std::vector<uint64_t> test_n{5, 10, 12, 17};

for (auto n : test_n)
EXPECT_TRUE(std::abs(1 - valik::exact_factorial(n) / (double) valik::stirling_factorial(n)) < accuracy_bound);
}

TEST(kmer_occurrences, small)
{
uint8_t kmer_size = 4;
Expand Down Expand Up @@ -52,24 +36,28 @@ TEST(combinations, edge_cases)
{
uint8_t k = 3;
size_t n = 3;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 1);
EXPECT_EQ((valik::combinations(k, n)), 1);

k = 4;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 0);
EXPECT_EQ((valik::combinations(k, n)), 0);

k = 0;
n = 0;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 1);
EXPECT_EQ((valik::combinations(k, n)), 1);
}

TEST(combinations, manual_test)
{
uint8_t k = 3;
size_t n = 4;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 4);
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), (valik::combinations<uint8_t, uint64_t>(n - k, n)));
EXPECT_EQ((valik::combinations(k, n)), 4);
EXPECT_EQ((valik::combinations(k, n)), (valik::combinations(n - k, n)));

n = 5;
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 10);
EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), (valik::combinations<uint8_t, uint64_t>(n - k, n)));
EXPECT_EQ((valik::combinations(k, n)), 10);
EXPECT_EQ((valik::combinations(k, n)), (valik::combinations(n - k, n)));

k = 13;
n = 37;
EXPECT_EQ((valik::combinations(k, n)), 3562467300); // check for numerical overflow
}
63 changes: 15 additions & 48 deletions test/api/utilities/threshold/kmer_attributes_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ static void check_len_less_than_kmer_size(valik::kmer_attributes const & attr)
auto error_row = thresh_table[e];
for (size_t l{0}; l < attr.k; l++)
{
EXPECT_EQ(error_row[l], (valik::combinations<size_t, uint64_t>(e, l)));
EXPECT_EQ(error_row[l], (valik::combinations(e, l)));
}
}
}
Expand Down Expand Up @@ -113,52 +113,23 @@ TEST(kmer_attributes, edge_cases)
if (!valik::read_fn_confs(attr_vec))
valik::precalc_fn_confs(attr_vec);

/*
size_t k = 9;
size_t t = 5;
size_t e = 15;
size_t l = 148;
// what it should be
for (size_t i{1}; i <= k; i++)
std::cout << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i]) << '\n';
std::cout << "extra\n";
for (size_t j{1}; j < t; j++)
std::cout << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j]) << '\n';
std::cout << "------------------shortcut------------------\n";
//row.push_back(row.back() + table.back()[l - 1]);
std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e][l - 1]) << '\n';
std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - 1]) << '\n';
//row.back() -= table.back()[band + t - 1];
std::cout << "-" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - k - 1]) << '\n';
for (uint8_t i = 1; i < t; i++)
{
//row.back() -= matrix[matrix.size() - i][e - 1][band + t - i - 1];
std::cout << "-" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - i][e - 1][l - k - i - 1]) << '\n';
//row.back() += matrix[matrix.size() - i][e - 1][band + t - i];
std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - i][e - 1][l - k - i]) << '\n';
}
*/
for (auto & attr : attr_vec)
{
check_len_less_than_kmer_size(attr);
check_thresh_from_kmer_lemma(attr);
}
}

/*
Let f(k, t, e, l) be the number of error configurations that destroy more than t k-mers.
The number of error configurations can be calculated in two ways.
*/
TEST(kmer_attributes, exhaustive_comparison)
{
std::vector<valik::kmer_attributes> attr_vec;
if (!valik::read_fn_confs(attr_vec))
valik::precalc_fn_confs(attr_vec);

//e = 15; // why does this not work?
valik::param_space space;
for (uint8_t k{std::get<0>(space.kmer_range)}; k <= std::get<1>(space.kmer_range); k++)
{
Expand All @@ -167,25 +138,21 @@ TEST(kmer_attributes, exhaustive_comparison)
{
for (size_t e{1}; e <= space.max_errors; e++)
{
for (size_t l = k + t - 1; l <= space.max_len; l++)
for (size_t l = k + t - 1 + e; l <= space.max_len; l++)
{
uint64_t calc_sum = attr_vec[k - 9].fn_conf_counts[t - 1][e][l];
uint64_t proper_sum{0};
// what it should be
for (size_t i{1}; i <= k; i++)
proper_sum += attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i];
// f(k, t, e, l) is calculated based on f(k, t, e, l - 1)
uint64_t calc_conf_count = attr_vec[k - 9].fn_conf_counts[t - 1][e][l];

uint64_t row_sum = proper_sum;
// f(k, t, e, l) is calculated based on a subvector sum in row (e - 1)
// i.e f(k, t, e, l) = f(k, t, e - 1, l - 1) + f(k, t, e - 1, l - 2) ...
uint64_t row_sum_conf_count{0};
for (size_t i{1}; i <= k; i++)
row_sum_conf_count += attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i];

for (size_t j{1}; j < t; j++)
proper_sum += attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j];
row_sum_conf_count += attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j];

if (calc_sum != proper_sum)
{
std::cout << "row_sum\tproper_sum\tcalc_sum\tk\tt\te\tl\n";
std::cout << row_sum << '\t' << proper_sum << '\t' << calc_sum << '\t' << std::to_string(k) << '\t' << t << '\t' << e << '\t' << l << '\n';
}
EXPECT_EQ(proper_sum, calc_sum);
EXPECT_EQ(row_sum_conf_count, calc_conf_count);
}
}
}
Expand Down
Loading

0 comments on commit a1f05f5

Please sign in to comment.