[FIX] numerical overflow error

eaasna · Feb 7, 2024 · a1f05f5 · a1f05f5
1 parent 0d6624c
commit a1f05f5
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 210 deletions.
diff --git a/include/utilities/threshold/basics.hpp b/include/utilities/threshold/basics.hpp
@@ -1,44 +1,17 @@
 #pragma once
 
+#include <algorithm>
 #include <iostream>
 #include <fstream>
 #include <cmath>
-#include <numbers>
+#include <numeric>
 #include <vector>
 #include <utility>
 #include <filesystem>
 
 namespace valik
 {
 
-template <typename val_t>
-val_t stirling_factorial(val_t const n)
-{
-    return sqrt(2 * std::numbers::pi * n) * pow((n / std::numbers::e), (double) n);
-}
-
-template <typename val_t>
-val_t exact_factorial(val_t const n)
-{
-    val_t fact{1};
-    for (size_t i = 2; i <= n; i++)
-        fact *= i;
-
-    return fact;
-}
-
-/**
- * @brief Factorial of integer n. 
-*/
-template <typename val_t>
-val_t factorial(val_t const n)
-{
-    if (n < 25)
-        return exact_factorial(n);
-    else 
-        return stirling_factorial(n);
-}
-
 /**
  *  @brief Assuming k-mers are distributed uniformly randomly, what is the expected count of a k-mer in a bin.
  * 
@@ -76,7 +49,7 @@ var_t kmer_lemma_threshold(size_t const l, var_t const k, var_t const e)
 */
 struct param_space
 {
-    constexpr static uint8_t max_errors{5};    
+    constexpr static uint8_t max_errors{15};    
     constexpr static uint8_t max_thresh{5};
     constexpr static size_t max_len{150};
     constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 21};
@@ -87,21 +60,47 @@ struct param_space
  * 
  * Same as the number of different error configurations if n=sequence length and k=error_count. 
 */
-//!TODO: need strong types to not mess up the order of these parameters
-// should this be an inline function?
-template <typename par_t, typename val_t>
-val_t combinations(par_t const k, size_t const n)
+inline uint64_t combinations(size_t const k, size_t const n)
 {
     if (n >= k)
     {
-        val_t combinations{1};
+        std::vector<size_t> divisors(k);
+        std::iota(divisors.begin(), divisors.end(), 1); // fill vector with 1, 2, ..., k
+        std::vector<bool> was_divided(k);
+
+        uint64_t combinations{1};
         for (size_t i = n - k + 1; i <= n; i++)
+        {
+            // avoid overflowing 64bits
+            if (combinations >= (std::numeric_limits<uint64_t>::max() / n))
+            {
+                for (size_t i{0}; i < divisors.size(); i++)
+                {
+                    if (!was_divided[i])
+                    {
+                        if (combinations % divisors[i] == 0)
+                        {
+                            combinations = combinations / divisors[i];
+                            was_divided[i] = true;
+                        }
+                    }
+                }
+            }
             combinations *= i;
-        combinations = combinations / factorial(k);
+        }
+
+        if (std::find(was_divided.begin(), was_divided.end(), false) != was_divided.end())
+        {
+            for (size_t i{0}; i < divisors.size(); i++)
+            {
+                if (!was_divided[i])
+                    combinations = combinations / divisors[i];
+            }
+        }
         return combinations;   // same as (uint64_t) (factorial(n) / (factorial(n - k) * factorial(k)));
     }
     else
-        return (val_t) 0;
+        return 0;
 }
 
 }   //namespace valik

diff --git a/include/utilities/threshold/filtering_request.hpp b/include/utilities/threshold/filtering_request.hpp
@@ -39,7 +39,7 @@ struct filtering_request
     */
     uint64_t total_conf_count() const
     {
-        return combinations<size_t, uint64_t>(e, l);
+        return combinations(e, l);
     }
 
     /**
@@ -82,9 +82,9 @@ struct filtering_request
         size_t kmers_per_pattern = l - params.k + 1;
         for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++)
         {
-            fpr -= combinations<uint8_t, uint64_t>(matching_kmer_count, kmers_per_pattern) * 
-                                                  pow(p, matching_kmer_count) * 
-                                                  pow(1 - p, kmers_per_pattern - matching_kmer_count);
+            fpr -= combinations(matching_kmer_count, kmers_per_pattern) * 
+                                pow(p, matching_kmer_count) * 
+                                pow(1 - p, kmers_per_pattern - matching_kmer_count);
         }
 
         return std::max(0.0, fpr);

diff --git a/include/utilities/threshold/kmer_attributes.hpp b/include/utilities/threshold/kmer_attributes.hpp
@@ -39,11 +39,11 @@ struct kmer_attributes
                 {       
                     int shared_base_count = l - e;
                     if ((int) (k + t) - 1 > shared_base_count) // all error distributions destroy enough k-mers
-                        row.push_back(combinations<size_t, uint64_t>(e, l));
-                    else if (e == 0)
+                        row.push_back(combinations(e, l));
+                    else if (e == 0)    // this has to be after the first condition; not everything in first row should be 0
                         row.push_back(0);
                     else
-                    {
+                    {                        
                         row.push_back(row.back() + table.back()[l - 1]);
                         row.back() -= table.back()[l - k - 1];
 

diff --git a/test/api/utilities/threshold/basics_test.cpp b/test/api/utilities/threshold/basics_test.cpp
@@ -2,22 +2,6 @@
 
 #include <utilities/threshold/basics.hpp>
 
-TEST(factorial, manual_check)
-{
-    EXPECT_EQ(valik::factorial((uint64_t) 0), 1);
-    EXPECT_EQ(valik::factorial((uint64_t) 1), 1);
-    EXPECT_EQ(valik::factorial((uint64_t)7), 5040);
-}
-
-TEST(factorial, stirling_accuracy)
-{
-    double accuracy_bound = 0.05;   // allow 5% error
-    std::vector<uint64_t> test_n{5, 10, 12, 17};
-
-    for (auto n : test_n)
-        EXPECT_TRUE(std::abs(1 - valik::exact_factorial(n) / (double) valik::stirling_factorial(n)) < accuracy_bound);
-}
-
 TEST(kmer_occurrences, small)
 {
     uint8_t kmer_size = 4;
@@ -52,24 +36,28 @@ TEST(combinations, edge_cases)
 {
     uint8_t k = 3;
     size_t n = 3;
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 1);
+    EXPECT_EQ((valik::combinations(k, n)), 1);
 
     k = 4;
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 0);
+    EXPECT_EQ((valik::combinations(k, n)), 0);
 
     k = 0;
     n = 0;
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 1);
+    EXPECT_EQ((valik::combinations(k, n)), 1);
 }
 
 TEST(combinations, manual_test)
 {
     uint8_t k = 3;
     size_t n = 4;
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 4);
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), (valik::combinations<uint8_t, uint64_t>(n - k, n)));
+    EXPECT_EQ((valik::combinations(k, n)), 4);
+    EXPECT_EQ((valik::combinations(k, n)), (valik::combinations(n - k, n)));
 
     n = 5;
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), 10);
-    EXPECT_EQ((valik::combinations<uint8_t, uint64_t>(k, n)), (valik::combinations<uint8_t, uint64_t>(n - k, n)));
+    EXPECT_EQ((valik::combinations(k, n)), 10);
+    EXPECT_EQ((valik::combinations(k, n)), (valik::combinations(n - k, n)));
+
+    k = 13;
+    n = 37;
+    EXPECT_EQ((valik::combinations(k, n)), 3562467300); // check for numerical overflow
 }
diff --git a/test/api/utilities/threshold/kmer_attributes_test.cpp b/test/api/utilities/threshold/kmer_attributes_test.cpp
@@ -73,7 +73,7 @@ static void check_len_less_than_kmer_size(valik::kmer_attributes const & attr)
             auto error_row = thresh_table[e];
             for (size_t l{0}; l < attr.k; l++)
             {
-                EXPECT_EQ(error_row[l], (valik::combinations<size_t, uint64_t>(e, l)));
+                EXPECT_EQ(error_row[l], (valik::combinations(e, l)));
             }
         }
     }
@@ -113,52 +113,23 @@ TEST(kmer_attributes, edge_cases)
     if (!valik::read_fn_confs(attr_vec))
         valik::precalc_fn_confs(attr_vec);
 
-    /*
-    size_t k = 9;
-    size_t t = 5;
-    size_t e = 15;
-    size_t l = 148;
-
-    // what it should be
-    for (size_t i{1}; i <= k; i++)
-        std::cout << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i]) << '\n';
-    
-    std::cout << "extra\n";
-    
-    for (size_t j{1}; j < t; j++)
-        std::cout << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j]) << '\n';
-
-    std::cout << "------------------shortcut------------------\n";
-    //row.push_back(row.back() + table.back()[l - 1]);
-    std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e][l - 1]) << '\n';
-    std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - 1]) << '\n';
-    
-    //row.back() -= table.back()[band + t - 1];
-    std::cout << "-" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - k - 1]) << '\n';
-    
-    for (uint8_t i = 1; i < t; i++)
-    {
-        //row.back() -= matrix[matrix.size() - i][e - 1][band + t - i - 1];
-        std::cout << "-" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - i][e - 1][l - k - i - 1]) << '\n';
-    
-        //row.back() += matrix[matrix.size() - i][e - 1][band + t - i];
-        std::cout << "+" << std::to_string(attr_vec[k - 9].fn_conf_counts[t - 1 - i][e - 1][l - k - i]) << '\n';
-    }
-    */
     for (auto & attr : attr_vec)
     {
         check_len_less_than_kmer_size(attr);
         check_thresh_from_kmer_lemma(attr);
     }
 }
 
+/*
+    Let f(k, t, e, l) be the number of error configurations that destroy more than t k-mers. 
+    The number of error configurations can be calculated in two ways.
+*/
 TEST(kmer_attributes, exhaustive_comparison)
 {
     std::vector<valik::kmer_attributes> attr_vec;
     if (!valik::read_fn_confs(attr_vec))
         valik::precalc_fn_confs(attr_vec);
 
-    //e = 15; // why does this not work?    
     valik::param_space space;
     for (uint8_t k{std::get<0>(space.kmer_range)}; k <= std::get<1>(space.kmer_range); k++)
     {   
@@ -167,25 +138,21 @@ TEST(kmer_attributes, exhaustive_comparison)
         {
             for (size_t e{1}; e <= space.max_errors; e++)
             {
-                for (size_t l = k + t - 1; l <= space.max_len; l++)
+                for (size_t l = k + t - 1 + e; l <= space.max_len; l++)
                 {
-                    uint64_t calc_sum = attr_vec[k - 9].fn_conf_counts[t - 1][e][l];    
-                    uint64_t proper_sum{0};
-                    // what it should be
-                    for (size_t i{1}; i <= k; i++)
-                        proper_sum += attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i];
+                    // f(k, t, e, l) is calculated based on f(k, t, e, l - 1)
+                    uint64_t calc_conf_count = attr_vec[k - 9].fn_conf_counts[t - 1][e][l];
 
-                    uint64_t row_sum = proper_sum;
+                    // f(k, t, e, l) is calculated based on a subvector sum in row (e - 1) 
+                    // i.e f(k, t, e, l) = f(k, t, e - 1, l - 1) + f(k, t, e - 1, l - 2) ...     
+                    uint64_t row_sum_conf_count{0};
+                    for (size_t i{1}; i <= k; i++)
+                        row_sum_conf_count += attr_vec[k - 9].fn_conf_counts[t - 1][e - 1][l - i];
 
                     for (size_t j{1}; j < t; j++)
-                        proper_sum += attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j];
+                        row_sum_conf_count += attr_vec[k - 9].fn_conf_counts[t - 1 - j][e - 1][l - k - j];
 
-                    if (calc_sum != proper_sum)
-                    {
-                        std::cout << "row_sum\tproper_sum\tcalc_sum\tk\tt\te\tl\n";
-                        std::cout << row_sum << '\t' << proper_sum << '\t' << calc_sum << '\t' << std::to_string(k) << '\t' << t << '\t' << e << '\t' << l << '\n';
-                    }
-                    EXPECT_EQ(proper_sum, calc_sum);
+                    EXPECT_EQ(row_sum_conf_count, calc_conf_count);
                 }
             }
         }