-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
broken build.cpp into separate files
- Loading branch information
Showing
12 changed files
with
519 additions
and
504 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#include <numeric> // for std::partial_sum | ||
|
||
namespace sshash { | ||
|
||
buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, | ||
buckets& m_buckets) { | ||
uint64_t num_buckets = m_minimizers.size(); | ||
uint64_t num_kmers = data.num_kmers; | ||
uint64_t num_super_kmers = data.strings.num_super_kmers(); | ||
std::vector<uint64_t> num_super_kmers_before_bucket(num_buckets + 1, 0); | ||
pthash::compact_vector::builder offsets; | ||
offsets.resize(num_super_kmers, std::ceil(std::log2(data.strings.num_bits() / 2))); | ||
|
||
std::cout << "bits_per_offset = ceil(log2(" << data.strings.num_bits() / 2 | ||
<< ")) = " << std::ceil(std::log2(data.strings.num_bits() / 2)) << std::endl; | ||
|
||
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) { | ||
assert(it.list().size() > 0); | ||
if (it.list().size() != 1) { | ||
uint64_t bucket_id = m_minimizers.lookup(it.minimizer()); | ||
num_super_kmers_before_bucket[bucket_id + 1] = it.list().size() - 1; | ||
} | ||
// else: it.list().size() = 1 and num_super_kmers_before_bucket[bucket_id + 1] is already 0 | ||
} | ||
std::partial_sum(num_super_kmers_before_bucket.begin(), num_super_kmers_before_bucket.end(), | ||
num_super_kmers_before_bucket.begin()); | ||
|
||
buckets_statistics buckets_stats(num_buckets, num_kmers, num_super_kmers); | ||
|
||
uint64_t num_singletons = 0; | ||
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) { | ||
uint64_t bucket_id = m_minimizers.lookup(it.minimizer()); | ||
uint64_t base = num_super_kmers_before_bucket[bucket_id] + bucket_id; | ||
uint64_t num_super_kmers_in_bucket = | ||
(num_super_kmers_before_bucket[bucket_id + 1] + bucket_id + 1) - base; | ||
assert(num_super_kmers_in_bucket > 0); | ||
if (num_super_kmers_in_bucket == 1) ++num_singletons; | ||
buckets_stats.add_num_super_kmers_in_bucket(num_super_kmers_in_bucket); | ||
uint64_t offset_pos = 0; | ||
auto list = it.list(); | ||
for (auto [offset, num_kmers_in_super_kmer] : list) { | ||
offsets.set(base + offset_pos++, offset); | ||
buckets_stats.add_num_kmers_in_super_kmer(num_super_kmers_in_bucket, | ||
num_kmers_in_super_kmer); | ||
} | ||
assert(offset_pos == num_super_kmers_in_bucket); | ||
} | ||
|
||
std::cout << "num_singletons " << num_singletons << "/" << num_buckets << " (" | ||
<< (num_singletons * 100.0) / num_buckets << "%)" << std::endl; | ||
|
||
m_buckets.pieces.encode(data.strings.pieces.begin(), data.strings.pieces.size()); | ||
m_buckets.num_super_kmers_before_bucket.encode(num_super_kmers_before_bucket.begin(), | ||
num_super_kmers_before_bucket.size()); | ||
offsets.build(m_buckets.offsets); | ||
m_buckets.strings.swap(data.strings.strings); | ||
|
||
return buckets_stats; | ||
} | ||
|
||
} // namespace sshash |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
#include "../../external/pthash/include/pthash.hpp" | ||
|
||
namespace sshash { | ||
|
||
void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets, | ||
build_configuration const& build_config, | ||
buckets_statistics const& buckets_stats) { | ||
uint64_t min_log2_size = m_skew_index.min_log2; | ||
uint64_t max_log2_size = m_skew_index.max_log2; | ||
|
||
uint64_t max_num_super_kmers_in_bucket = buckets_stats.max_num_super_kmers_in_bucket(); | ||
m_skew_index.log2_max_num_super_kmers_in_bucket = | ||
std::ceil(std::log2(buckets_stats.max_num_super_kmers_in_bucket())); | ||
|
||
std::cout << "max_num_super_kmers_in_bucket " << max_num_super_kmers_in_bucket << std::endl; | ||
std::cout << "log2_max_num_super_kmers_in_bucket " | ||
<< m_skew_index.log2_max_num_super_kmers_in_bucket << std::endl; | ||
|
||
uint64_t num_buckets_in_skew_index = 0; | ||
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) { | ||
if (it.list().size() > (1ULL << min_log2_size)) ++num_buckets_in_skew_index; | ||
} | ||
std::cout << "num_buckets_in_skew_index " << num_buckets_in_skew_index << "/" | ||
<< buckets_stats.num_buckets() << "(" | ||
<< (num_buckets_in_skew_index * 100.0) / buckets_stats.num_buckets() << "%)" | ||
<< std::endl; | ||
|
||
if (num_buckets_in_skew_index == 0) return; | ||
|
||
std::vector<list_type> lists; | ||
lists.reserve(num_buckets_in_skew_index); | ||
for (auto it = data.minimizers.begin(); it.has_next(); it.next()) { | ||
auto list = it.list(); | ||
if (list.size() > (1ULL << min_log2_size)) lists.push_back(list); | ||
} | ||
assert(lists.size() == num_buckets_in_skew_index); | ||
std::sort(lists.begin(), lists.end(), | ||
[](list_type const& x, list_type const& y) { return x.size() < y.size(); }); | ||
|
||
uint64_t num_partitions = max_log2_size - min_log2_size + 1; | ||
if (buckets_stats.max_num_super_kmers_in_bucket() < (1ULL << max_log2_size)) { | ||
num_partitions = m_skew_index.log2_max_num_super_kmers_in_bucket - min_log2_size; | ||
} | ||
std::cout << "num_partitions " << num_partitions << std::endl; | ||
|
||
std::vector<uint64_t> num_kmers_in_partition(num_partitions, 0); | ||
m_skew_index.mphfs.resize(num_partitions); | ||
m_skew_index.positions.resize(num_partitions); | ||
|
||
{ | ||
std::cout << "computing partitions..." << std::endl; | ||
|
||
uint64_t partition_id = 0; | ||
uint64_t lower = 1ULL << min_log2_size; | ||
uint64_t upper = 2 * lower; | ||
uint64_t num_kmers_in_skew_index = 0; | ||
for (uint64_t i = 0; i != lists.size() + 1; ++i) { | ||
if (i == lists.size() or lists[i].size() > upper) { | ||
std::cout << "num_kmers belonging to buckets of size > " << lower | ||
<< " and <= " << upper << ": " << num_kmers_in_partition[partition_id] | ||
<< std::endl; | ||
if (num_kmers_in_partition[partition_id] == 0) { | ||
std::cout << "==> Empty bucket detected:\n"; | ||
std::cout << "there is no k-mer that belongs to a list of size > " << lower | ||
<< " and <= " << upper << std::endl; | ||
throw empty_bucket_runtime_error(); | ||
} | ||
util::check_hash_collision_probability(num_kmers_in_partition[partition_id]); | ||
num_kmers_in_skew_index += num_kmers_in_partition[partition_id]; | ||
partition_id += 1; | ||
|
||
if (i == lists.size()) break; | ||
|
||
lower = upper; | ||
upper = 2 * lower; | ||
if (partition_id == num_partitions - 1) upper = max_num_super_kmers_in_bucket; | ||
|
||
/* | ||
If still larger than upper, then there is an empty bucket | ||
and we should try different parameters. | ||
*/ | ||
if (lists[i].size() > upper) { | ||
std::cout << "==> Empty bucket detected:\n"; | ||
std::cout << "there is no list of size > " << lower << " and <= " << upper | ||
<< std::endl; | ||
throw empty_bucket_runtime_error(); | ||
} | ||
} | ||
|
||
assert(lists[i].size() > lower and lists[i].size() <= upper); | ||
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { | ||
(void)offset; // unused | ||
num_kmers_in_partition[partition_id] += num_kmers_in_super_kmer; | ||
} | ||
} | ||
assert(partition_id == num_partitions); | ||
std::cout << "num_kmers_in_skew_index " << num_kmers_in_skew_index << "(" | ||
<< (num_kmers_in_skew_index * 100.0) / buckets_stats.num_kmers() << "%)" | ||
<< std::endl; | ||
assert(num_kmers_in_skew_index == std::accumulate(num_kmers_in_partition.begin(), | ||
num_kmers_in_partition.end(), | ||
uint64_t(0))); | ||
} | ||
|
||
{ | ||
pthash::build_configuration mphf_config; | ||
mphf_config.c = build_config.c; | ||
mphf_config.alpha = 0.94; | ||
mphf_config.seed = 1234567890; // my favourite seed | ||
mphf_config.minimal_output = true; | ||
mphf_config.verbose_output = false; | ||
mphf_config.num_threads = std::thread::hardware_concurrency() >= 8 ? 8 : 1; | ||
|
||
std::cout << "building PTHash mphfs (with " << mphf_config.num_threads | ||
<< " threads) and positions..." << std::endl; | ||
|
||
uint64_t partition_id = 0; | ||
uint64_t lower = 1ULL << min_log2_size; | ||
uint64_t upper = 2 * lower; | ||
uint64_t num_bits_per_pos = min_log2_size + 1; | ||
|
||
/* tmp storage for keys and super_kmer_ids ******/ | ||
std::vector<uint64_t> keys_in_partition; | ||
std::vector<uint32_t> super_kmer_ids_in_partition; | ||
keys_in_partition.reserve(num_kmers_in_partition[partition_id]); | ||
super_kmer_ids_in_partition.reserve(num_kmers_in_partition[partition_id]); | ||
pthash::compact_vector::builder cvb_positions; | ||
cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); | ||
/*******/ | ||
|
||
for (uint64_t i = 0; i != lists.size() + 1; ++i) { | ||
if (i == lists.size() or lists[i].size() > upper) { | ||
std::cout << "lower " << lower << "; upper " << upper << "; num_bits_per_pos " | ||
<< num_bits_per_pos << std::endl; | ||
|
||
auto& mphf = m_skew_index.mphfs[partition_id]; | ||
assert(num_kmers_in_partition[partition_id] == keys_in_partition.size()); | ||
assert(num_kmers_in_partition[partition_id] == super_kmer_ids_in_partition.size()); | ||
mphf.build_in_internal_memory(keys_in_partition.begin(), keys_in_partition.size(), | ||
mphf_config); | ||
|
||
std::cout << " built mphs[" << partition_id << "] for " << keys_in_partition.size() | ||
<< " keys; bits/key = " | ||
<< static_cast<double>(mphf.num_bits()) / mphf.num_keys() << std::endl; | ||
|
||
for (uint64_t i = 0; i != keys_in_partition.size(); ++i) { | ||
uint64_t kmer = keys_in_partition[i]; | ||
uint64_t pos = mphf(kmer); | ||
uint32_t super_kmer_id = super_kmer_ids_in_partition[i]; | ||
cvb_positions.set(pos, super_kmer_id); | ||
} | ||
auto& positions = m_skew_index.positions[partition_id]; | ||
cvb_positions.build(positions); | ||
|
||
std::cout << " built positions[" << partition_id << "] for " << positions.size() | ||
<< " keys; bits/key = " << (positions.bytes() * 8.0) / positions.size() | ||
<< std::endl; | ||
|
||
partition_id += 1; | ||
|
||
if (i == lists.size()) break; | ||
|
||
lower = upper; | ||
upper = 2 * lower; | ||
num_bits_per_pos += 1; | ||
if (partition_id == num_partitions - 1) { | ||
upper = max_num_super_kmers_in_bucket; | ||
num_bits_per_pos = m_skew_index.log2_max_num_super_kmers_in_bucket; | ||
} | ||
|
||
keys_in_partition.clear(); | ||
super_kmer_ids_in_partition.clear(); | ||
keys_in_partition.reserve(num_kmers_in_partition[partition_id]); | ||
super_kmer_ids_in_partition.reserve(num_kmers_in_partition[partition_id]); | ||
cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); | ||
} | ||
|
||
assert(lists[i].size() > lower and lists[i].size() <= upper); | ||
uint64_t super_kmer_id = 0; | ||
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { | ||
bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); | ||
for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) { | ||
uint64_t kmer = bv_it.read(2 * build_config.k); | ||
keys_in_partition.push_back(kmer); | ||
super_kmer_ids_in_partition.push_back(super_kmer_id); | ||
bv_it.eat(2); | ||
} | ||
assert(super_kmer_id < (1ULL << cvb_positions.width())); | ||
++super_kmer_id; | ||
} | ||
} | ||
assert(partition_id == num_partitions); | ||
} | ||
|
||
std::cout << "num_bits_for_skew_index " << m_skew_index.num_bits() << "(" | ||
<< static_cast<double>(m_skew_index.num_bits()) / buckets_stats.num_kmers() | ||
<< " [bits/kmer])" << std::endl; | ||
} | ||
|
||
} // namespace sshash |
Oops, something went wrong.