Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/jermp/sshash into HEAD
Browse files Browse the repository at this point in the history
  • Loading branch information
adamant-pwn committed Jul 15, 2024
2 parents 2b7eeb0 + 05d6127 commit ab3ba51
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 4 deletions.
7 changes: 6 additions & 1 deletion include/builder/build.impl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ void dictionary<kmer_t>::build(std::string const& filename,
throw std::runtime_error("m must be less <= " + std::to_string(kmer_t::max_m) +
" but got m = " + std::to_string(build_config.m));
}
if (build_config.m > build_config.k) throw std::runtime_error("m must be < k");
if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k");
if (build_config.l >= constants::max_l) {
throw std::runtime_error("l must be < " + std::to_string(constants::max_l));
}
Expand Down Expand Up @@ -76,6 +76,11 @@ void dictionary<kmer_t>::build(std::string const& filename,
mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
mm::advice::sequential);
minimizers_tuples_iterator iterator(input.data(), input.data() + input.size());

if (build_config.verbose) {
std::cout << "num_minimizers " << data.minimizers.num_minimizers() << std::endl;
}

m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config);
input.close();
}
Expand Down
3 changes: 1 addition & 2 deletions include/cover/parse_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,7 @@ struct lines_iterator {

std::string read_line() {
uint8_t const* begin = m_begin;
while (m_begin != m_end and *m_begin++ != '\n')
;
while (m_begin != m_end and *m_begin++ != '\n');
if (begin == m_begin) return std::string("");
return std::string(reinterpret_cast<const char*>(begin), m_begin - begin - 1);
}
Expand Down
43 changes: 42 additions & 1 deletion include/info.impl
Original file line number Diff line number Diff line change
@@ -1,13 +1,51 @@
namespace sshash {

double bits_per_kmer_formula(uint64_t k, /* kmer length */
uint64_t m, /* minimizer length */
uint64_t n, /* num. kmers */
uint64_t M) /* num. strings in SPSS */
{
/*
Caveats:
1. we assume an alphabet of size 4
2. this assumes a random minimizer scheme, so num. super-kmers is ~ 2n/(k-m+2)
3. we neglect lower order terms and skew index space
4. no canonical parsing
*/

assert(k > 0);
assert(k >= m);

const uint64_t N = n + M * (k - 1); // num. symbols in SPSS

// double num_minimizers = (2.0 * n) / (k - m + 2); // not distinct, hence num. of super-kmers
// std::cout << "num_minimizers = " << num_minimizers << std::endl;
// std::cout << "minimizers: " << (3.0 * num_minimizers) / n << " [bits/kmer]" << std::endl;
// std::cout << "pieces: " << (M * (2.0 + std::ceil(std::log2(static_cast<double>(N) / M)))) / n
// << " [bits/kmer]" << std::endl;
// std::cout << "num_super_kmers_before_bucket: " << (2.0 * num_minimizers) / n << " [bits/kmer]
// "
// << std::endl;
// std::cout << "offsets: " << (std::ceil(std::log2(N)) * num_minimizers) / n << " [bits/kmer]"
// << std::endl;
// std::cout << "strings: " << (2.0 * N) / n << " [bits/kmer]" << std::endl;

double num_bits = 2 * n * (1.0 + (5.0 + std::ceil(std::log2(N))) / (k - m + 2)) +
M * (2 * k + std::ceil(std::log2(static_cast<double>(n) / M + k - 1)));

return num_bits / n;
}

template <class kmer_t>
void dictionary<kmer_t>::print_space_breakdown() const {
const uint64_t num_bytes = (num_bits() + 7) / 8;
std::cout << "total index size: " << num_bytes << " [B] -- "
<< essentials::convert(num_bytes, essentials::MB) << " [MB]" << '\n';
std::cout << "SPACE BREAKDOWN:\n";
std::cout << " minimizers: " << static_cast<double>(m_minimizers.num_bits()) / size()
<< " [bits/kmer]\n";
<< " [bits/kmer] ("
<< static_cast<double>(m_minimizers.num_bits()) / m_minimizers.size()
<< " [bits/key])\n";
std::cout << " pieces: " << static_cast<double>(m_buckets.pieces.num_bits()) / size()
<< " [bits/kmer]\n";
std::cout << " num_super_kmers_before_bucket: "
Expand All @@ -25,6 +63,9 @@ void dictionary<kmer_t>::print_space_breakdown() const {
std::cout << " --------------\n";
std::cout << " total: " << static_cast<double>(num_bits()) / size() << " [bits/kmer]"
<< std::endl;

std::cout << " Close-form formula: " << bits_per_kmer_formula(k(), m(), size(), num_contigs())
<< " [bits/kmer]" << std::endl;
}

template <class kmer_t>
Expand Down

0 comments on commit ab3ba51

Please sign in to comment.