diff --git a/include/builder/build.impl b/include/builder/build.impl index dab731c..a30e7c4 100644 --- a/include/builder/build.impl +++ b/include/builder/build.impl @@ -26,7 +26,7 @@ void dictionary::build(std::string const& filename, throw std::runtime_error("m must be less <= " + std::to_string(kmer_t::max_m) + " but got m = " + std::to_string(build_config.m)); } - if (build_config.m > build_config.k) throw std::runtime_error("m must be < k"); + if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k"); if (build_config.l >= constants::max_l) { throw std::runtime_error("l must be < " + std::to_string(constants::max_l)); } @@ -76,6 +76,11 @@ void dictionary::build(std::string const& filename, mm::file_source input(data.minimizers.get_minimizers_filename(), mm::advice::sequential); minimizers_tuples_iterator iterator(input.data(), input.data() + input.size()); + + if (build_config.verbose) { + std::cout << "num_minimizers " << data.minimizers.num_minimizers() << std::endl; + } + m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config); input.close(); } diff --git a/include/cover/parse_file.hpp b/include/cover/parse_file.hpp index 53581c7..f9f78f0 100644 --- a/include/cover/parse_file.hpp +++ b/include/cover/parse_file.hpp @@ -159,8 +159,7 @@ struct lines_iterator { std::string read_line() { uint8_t const* begin = m_begin; - while (m_begin != m_end and *m_begin++ != '\n') - ; + while (m_begin != m_end and *m_begin++ != '\n'); if (begin == m_begin) return std::string(""); return std::string(reinterpret_cast(begin), m_begin - begin - 1); } diff --git a/include/info.impl b/include/info.impl index 8658c6d..02c3d6c 100644 --- a/include/info.impl +++ b/include/info.impl @@ -1,5 +1,41 @@ namespace sshash { +double bits_per_kmer_formula(uint64_t k, /* kmer length */ + uint64_t m, /* minimizer length */ + uint64_t n, /* num. kmers */ + uint64_t M) /* num. strings in SPSS */ +{ + /* + Caveats: + 1. we assume an alphabet of size 4 + 2. this assumes a random minimizer scheme, so num. super-kmers is ~ 2n/(k-m+2) + 3. we neglect lower order terms and skew index space + 4. no canonical parsing + */ + + assert(k > 0); + assert(k >= m); + + const uint64_t N = n + M * (k - 1); // num. symbols in SPSS + + // double num_minimizers = (2.0 * n) / (k - m + 2); // not distinct, hence num. of super-kmers + // std::cout << "num_minimizers = " << num_minimizers << std::endl; + // std::cout << "minimizers: " << (3.0 * num_minimizers) / n << " [bits/kmer]" << std::endl; + // std::cout << "pieces: " << (M * (2.0 + std::ceil(std::log2(static_cast(N) / M)))) / n + // << " [bits/kmer]" << std::endl; + // std::cout << "num_super_kmers_before_bucket: " << (2.0 * num_minimizers) / n << " [bits/kmer] + // " + // << std::endl; + // std::cout << "offsets: " << (std::ceil(std::log2(N)) * num_minimizers) / n << " [bits/kmer]" + // << std::endl; + // std::cout << "strings: " << (2.0 * N) / n << " [bits/kmer]" << std::endl; + + double num_bits = 2 * n * (1.0 + (5.0 + std::ceil(std::log2(N))) / (k - m + 2)) + + M * (2 * k + std::ceil(std::log2(static_cast(n) / M + k - 1))); + + return num_bits / n; +} + template void dictionary::print_space_breakdown() const { const uint64_t num_bytes = (num_bits() + 7) / 8; @@ -7,7 +43,9 @@ void dictionary::print_space_breakdown() const { << essentials::convert(num_bytes, essentials::MB) << " [MB]" << '\n'; std::cout << "SPACE BREAKDOWN:\n"; std::cout << " minimizers: " << static_cast(m_minimizers.num_bits()) / size() - << " [bits/kmer]\n"; + << " [bits/kmer] (" + << static_cast(m_minimizers.num_bits()) / m_minimizers.size() + << " [bits/key])\n"; std::cout << " pieces: " << static_cast(m_buckets.pieces.num_bits()) / size() << " [bits/kmer]\n"; std::cout << " num_super_kmers_before_bucket: " @@ -25,6 +63,9 @@ void dictionary::print_space_breakdown() const { std::cout << " --------------\n"; std::cout << " total: " << static_cast(num_bits()) / size() << " [bits/kmer]" << std::endl; + + std::cout << " Close-form formula: " << bits_per_kmer_formula(k(), m(), size(), num_contigs()) + << " [bits/kmer]" << std::endl; } template