Skip to content

Commit

Permalink
broken build.cpp into separate files
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed May 25, 2022
1 parent c48948c commit 9bc46b0
Show file tree
Hide file tree
Showing 12 changed files with 519 additions and 504 deletions.
484 changes: 10 additions & 474 deletions include/builder/build.cpp

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions include/builder/build_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include <numeric> // for std::partial_sum

namespace sshash {

buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers,
buckets& m_buckets) {
uint64_t num_buckets = m_minimizers.size();
uint64_t num_kmers = data.num_kmers;
uint64_t num_super_kmers = data.strings.num_super_kmers();
std::vector<uint64_t> num_super_kmers_before_bucket(num_buckets + 1, 0);
pthash::compact_vector::builder offsets;
offsets.resize(num_super_kmers, std::ceil(std::log2(data.strings.num_bits() / 2)));

std::cout << "bits_per_offset = ceil(log2(" << data.strings.num_bits() / 2
<< ")) = " << std::ceil(std::log2(data.strings.num_bits() / 2)) << std::endl;

for (auto it = data.minimizers.begin(); it.has_next(); it.next()) {
assert(it.list().size() > 0);
if (it.list().size() != 1) {
uint64_t bucket_id = m_minimizers.lookup(it.minimizer());
num_super_kmers_before_bucket[bucket_id + 1] = it.list().size() - 1;
}
// else: it.list().size() = 1 and num_super_kmers_before_bucket[bucket_id + 1] is already 0
}
std::partial_sum(num_super_kmers_before_bucket.begin(), num_super_kmers_before_bucket.end(),
num_super_kmers_before_bucket.begin());

buckets_statistics buckets_stats(num_buckets, num_kmers, num_super_kmers);

uint64_t num_singletons = 0;
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) {
uint64_t bucket_id = m_minimizers.lookup(it.minimizer());
uint64_t base = num_super_kmers_before_bucket[bucket_id] + bucket_id;
uint64_t num_super_kmers_in_bucket =
(num_super_kmers_before_bucket[bucket_id + 1] + bucket_id + 1) - base;
assert(num_super_kmers_in_bucket > 0);
if (num_super_kmers_in_bucket == 1) ++num_singletons;
buckets_stats.add_num_super_kmers_in_bucket(num_super_kmers_in_bucket);
uint64_t offset_pos = 0;
auto list = it.list();
for (auto [offset, num_kmers_in_super_kmer] : list) {
offsets.set(base + offset_pos++, offset);
buckets_stats.add_num_kmers_in_super_kmer(num_super_kmers_in_bucket,
num_kmers_in_super_kmer);
}
assert(offset_pos == num_super_kmers_in_bucket);
}

std::cout << "num_singletons " << num_singletons << "/" << num_buckets << " ("
<< (num_singletons * 100.0) / num_buckets << "%)" << std::endl;

m_buckets.pieces.encode(data.strings.pieces.begin(), data.strings.pieces.size());
m_buckets.num_super_kmers_before_bucket.encode(num_super_kmers_before_bucket.begin(),
num_super_kmers_before_bucket.size());
offsets.build(m_buckets.offsets);
m_buckets.strings.swap(data.strings.strings);

return buckets_stats;
}

} // namespace sshash
200 changes: 200 additions & 0 deletions include/builder/build_skew_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
#include "../../external/pthash/include/pthash.hpp"

namespace sshash {

void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets,
build_configuration const& build_config,
buckets_statistics const& buckets_stats) {
uint64_t min_log2_size = m_skew_index.min_log2;
uint64_t max_log2_size = m_skew_index.max_log2;

uint64_t max_num_super_kmers_in_bucket = buckets_stats.max_num_super_kmers_in_bucket();
m_skew_index.log2_max_num_super_kmers_in_bucket =
std::ceil(std::log2(buckets_stats.max_num_super_kmers_in_bucket()));

std::cout << "max_num_super_kmers_in_bucket " << max_num_super_kmers_in_bucket << std::endl;
std::cout << "log2_max_num_super_kmers_in_bucket "
<< m_skew_index.log2_max_num_super_kmers_in_bucket << std::endl;

uint64_t num_buckets_in_skew_index = 0;
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) {
if (it.list().size() > (1ULL << min_log2_size)) ++num_buckets_in_skew_index;
}
std::cout << "num_buckets_in_skew_index " << num_buckets_in_skew_index << "/"
<< buckets_stats.num_buckets() << "("
<< (num_buckets_in_skew_index * 100.0) / buckets_stats.num_buckets() << "%)"
<< std::endl;

if (num_buckets_in_skew_index == 0) return;

std::vector<list_type> lists;
lists.reserve(num_buckets_in_skew_index);
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) {
auto list = it.list();
if (list.size() > (1ULL << min_log2_size)) lists.push_back(list);
}
assert(lists.size() == num_buckets_in_skew_index);
std::sort(lists.begin(), lists.end(),
[](list_type const& x, list_type const& y) { return x.size() < y.size(); });

uint64_t num_partitions = max_log2_size - min_log2_size + 1;
if (buckets_stats.max_num_super_kmers_in_bucket() < (1ULL << max_log2_size)) {
num_partitions = m_skew_index.log2_max_num_super_kmers_in_bucket - min_log2_size;
}
std::cout << "num_partitions " << num_partitions << std::endl;

std::vector<uint64_t> num_kmers_in_partition(num_partitions, 0);
m_skew_index.mphfs.resize(num_partitions);
m_skew_index.positions.resize(num_partitions);

{
std::cout << "computing partitions..." << std::endl;

uint64_t partition_id = 0;
uint64_t lower = 1ULL << min_log2_size;
uint64_t upper = 2 * lower;
uint64_t num_kmers_in_skew_index = 0;
for (uint64_t i = 0; i != lists.size() + 1; ++i) {
if (i == lists.size() or lists[i].size() > upper) {
std::cout << "num_kmers belonging to buckets of size > " << lower
<< " and <= " << upper << ": " << num_kmers_in_partition[partition_id]
<< std::endl;
if (num_kmers_in_partition[partition_id] == 0) {
std::cout << "==> Empty bucket detected:\n";
std::cout << "there is no k-mer that belongs to a list of size > " << lower
<< " and <= " << upper << std::endl;
throw empty_bucket_runtime_error();
}
util::check_hash_collision_probability(num_kmers_in_partition[partition_id]);
num_kmers_in_skew_index += num_kmers_in_partition[partition_id];
partition_id += 1;

if (i == lists.size()) break;

lower = upper;
upper = 2 * lower;
if (partition_id == num_partitions - 1) upper = max_num_super_kmers_in_bucket;

/*
If still larger than upper, then there is an empty bucket
and we should try different parameters.
*/
if (lists[i].size() > upper) {
std::cout << "==> Empty bucket detected:\n";
std::cout << "there is no list of size > " << lower << " and <= " << upper
<< std::endl;
throw empty_bucket_runtime_error();
}
}

assert(lists[i].size() > lower and lists[i].size() <= upper);
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) {
(void)offset; // unused
num_kmers_in_partition[partition_id] += num_kmers_in_super_kmer;
}
}
assert(partition_id == num_partitions);
std::cout << "num_kmers_in_skew_index " << num_kmers_in_skew_index << "("
<< (num_kmers_in_skew_index * 100.0) / buckets_stats.num_kmers() << "%)"
<< std::endl;
assert(num_kmers_in_skew_index == std::accumulate(num_kmers_in_partition.begin(),
num_kmers_in_partition.end(),
uint64_t(0)));
}

{
pthash::build_configuration mphf_config;
mphf_config.c = build_config.c;
mphf_config.alpha = 0.94;
mphf_config.seed = 1234567890; // my favourite seed
mphf_config.minimal_output = true;
mphf_config.verbose_output = false;
mphf_config.num_threads = std::thread::hardware_concurrency() >= 8 ? 8 : 1;

std::cout << "building PTHash mphfs (with " << mphf_config.num_threads
<< " threads) and positions..." << std::endl;

uint64_t partition_id = 0;
uint64_t lower = 1ULL << min_log2_size;
uint64_t upper = 2 * lower;
uint64_t num_bits_per_pos = min_log2_size + 1;

/* tmp storage for keys and super_kmer_ids ******/
std::vector<uint64_t> keys_in_partition;
std::vector<uint32_t> super_kmer_ids_in_partition;
keys_in_partition.reserve(num_kmers_in_partition[partition_id]);
super_kmer_ids_in_partition.reserve(num_kmers_in_partition[partition_id]);
pthash::compact_vector::builder cvb_positions;
cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos);
/*******/

for (uint64_t i = 0; i != lists.size() + 1; ++i) {
if (i == lists.size() or lists[i].size() > upper) {
std::cout << "lower " << lower << "; upper " << upper << "; num_bits_per_pos "
<< num_bits_per_pos << std::endl;

auto& mphf = m_skew_index.mphfs[partition_id];
assert(num_kmers_in_partition[partition_id] == keys_in_partition.size());
assert(num_kmers_in_partition[partition_id] == super_kmer_ids_in_partition.size());
mphf.build_in_internal_memory(keys_in_partition.begin(), keys_in_partition.size(),
mphf_config);

std::cout << " built mphs[" << partition_id << "] for " << keys_in_partition.size()
<< " keys; bits/key = "
<< static_cast<double>(mphf.num_bits()) / mphf.num_keys() << std::endl;

for (uint64_t i = 0; i != keys_in_partition.size(); ++i) {
uint64_t kmer = keys_in_partition[i];
uint64_t pos = mphf(kmer);
uint32_t super_kmer_id = super_kmer_ids_in_partition[i];
cvb_positions.set(pos, super_kmer_id);
}
auto& positions = m_skew_index.positions[partition_id];
cvb_positions.build(positions);

std::cout << " built positions[" << partition_id << "] for " << positions.size()
<< " keys; bits/key = " << (positions.bytes() * 8.0) / positions.size()
<< std::endl;

partition_id += 1;

if (i == lists.size()) break;

lower = upper;
upper = 2 * lower;
num_bits_per_pos += 1;
if (partition_id == num_partitions - 1) {
upper = max_num_super_kmers_in_bucket;
num_bits_per_pos = m_skew_index.log2_max_num_super_kmers_in_bucket;
}

keys_in_partition.clear();
super_kmer_ids_in_partition.clear();
keys_in_partition.reserve(num_kmers_in_partition[partition_id]);
super_kmer_ids_in_partition.reserve(num_kmers_in_partition[partition_id]);
cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos);
}

assert(lists[i].size() > lower and lists[i].size() <= upper);
uint64_t super_kmer_id = 0;
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) {
bit_vector_iterator bv_it(m_buckets.strings, 2 * offset);
for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) {
uint64_t kmer = bv_it.read(2 * build_config.k);
keys_in_partition.push_back(kmer);
super_kmer_ids_in_partition.push_back(super_kmer_id);
bv_it.eat(2);
}
assert(super_kmer_id < (1ULL << cvb_positions.width()));
++super_kmer_id;
}
}
assert(partition_id == num_partitions);
}

std::cout << "num_bits_for_skew_index " << m_skew_index.num_bits() << "("
<< static_cast<double>(m_skew_index.num_bits()) / buckets_stats.num_kmers()
<< " [bits/kmer])" << std::endl;
}

} // namespace sshash
Loading

0 comments on commit 9bc46b0

Please sign in to comment.