From c48948c0f5882063deeb6702f518d187ad7c7a07 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 24 May 2022 19:20:34 +0200 Subject: [PATCH] general renaming to match description in papers --- include/buckets.hpp | 24 +-- include/builder/build.cpp | 142 +++++++++--------- include/builder/util.hpp | 30 ++-- include/info.cpp | 8 +- include/lookup.cpp | 33 ++-- .../membership_query_canonical_parsing.cpp | 22 +-- .../membership_query_regular_parsing.cpp | 34 +++-- include/skew_index.hpp | 20 +-- include/util.hpp | 70 +++++---- 9 files changed, 195 insertions(+), 188 deletions(-) diff --git a/include/buckets.hpp b/include/buckets.hpp index dce8db3..b8b6b78 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -46,8 +46,8 @@ struct buckets { } std::pair locate_bucket(uint64_t bucket_id) const { - uint64_t begin = num_strings_before_bucket.access(bucket_id) + bucket_id; - uint64_t end = num_strings_before_bucket.access(bucket_id + 1) + bucket_id + 1; + uint64_t begin = num_super_kmers_before_bucket.access(bucket_id) + bucket_id; + uint64_t end = num_super_kmers_before_bucket.access(bucket_id + 1) + bucket_id + 1; assert(begin < end); return {begin, end}; } @@ -59,8 +59,8 @@ struct buckets { uint64_t lookup(uint64_t begin, uint64_t end, uint64_t target_kmer, uint64_t k, uint64_t m) const { - for (uint64_t string_id = begin; string_id != end; ++string_id) { - uint64_t offset = offsets.access(string_id); + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = offsets.access(super_kmer_id); auto [kmer_id, offset_end] = offset_to_id(offset, k); bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, offset_end - offset - k + 1); @@ -72,9 +72,9 @@ struct buckets { return constants::invalid; } - uint64_t lookup_in_string(uint64_t string_id, uint64_t target_kmer, uint64_t k, - uint64_t m) const { - uint64_t offset = offsets.access(string_id); + uint64_t lookup_in_super_kmer(uint64_t super_kmer_id, uint64_t target_kmer, uint64_t k, + uint64_t m) const { + uint64_t offset = offsets.access(super_kmer_id); auto [kmer_id, offset_end] = offset_to_id(offset, k); bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, offset_end - offset - k + 1); @@ -93,8 +93,8 @@ struct buckets { } uint64_t lookup_canonical(uint64_t begin, uint64_t end, uint64_t target_kmer, uint64_t target_kmer_rc, uint64_t k, uint64_t m) const { - for (uint64_t string_id = begin; string_id != end; ++string_id) { - uint64_t offset = offsets.access(string_id); + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = offsets.access(super_kmer_id); auto [kmer_id, offset_end] = offset_to_id(offset, k); bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, offset_end - offset - k + 1); @@ -183,20 +183,20 @@ struct buckets { } uint64_t num_bits() const { - return pieces.num_bits() + num_strings_before_bucket.num_bits() + + return pieces.num_bits() + num_super_kmers_before_bucket.num_bits() + 8 * (offsets.bytes() + strings.bytes()); } template void visit(Visitor& visitor) { visitor.visit(pieces); - visitor.visit(num_strings_before_bucket); + visitor.visit(num_super_kmers_before_bucket); visitor.visit(offsets); visitor.visit(strings); } ef_sequence pieces; - ef_sequence num_strings_before_bucket; + ef_sequence num_super_kmers_before_bucket; pthash::compact_vector offsets; pthash::bit_vector strings; }; diff --git a/include/builder/build.cpp b/include/builder/build.cpp index bf6d7f3..4b71213 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -25,55 +25,55 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b uint64_t k = build_config.k; uint64_t m = build_config.m; uint64_t seed = build_config.seed; - uint64_t max_num_kmers_in_string = k - m + 1; - uint64_t block_size = 2 * k - m; // max_num_kmers_in_string + k - 1 - - if (max_num_kmers_in_string >= (1ULL << (sizeof(num_kmers_in_string_uint_type) * 8))) { - throw std::runtime_error("max_num_kmers_in_string " + - std::to_string(max_num_kmers_in_string) + " does not fit into " + - std::to_string(sizeof(num_kmers_in_string_uint_type) * 8) + - " bits"); + uint64_t max_num_kmers_in_super_kmer = k - m + 1; + uint64_t block_size = 2 * k - m; // max_num_kmers_in_super_kmer + k - 1 + + if (max_num_kmers_in_super_kmer >= (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))) { + throw std::runtime_error( + "max_num_kmers_in_super_kmer " + std::to_string(max_num_kmers_in_super_kmer) + + " does not fit into " + std::to_string(sizeof(num_kmers_in_super_kmer_uint_type) * 8) + + " bits"); } /* fit into the wanted number of bits */ - assert(max_num_kmers_in_string < (1ULL << (sizeof(num_kmers_in_string_uint_type) * 8))); + assert(max_num_kmers_in_super_kmer < (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))); compact_string_pool::builder builder(k); std::string sequence; uint64_t prev_minimizer = constants::invalid; - uint64_t begin = 0; // begin of parsed string in sequence - uint64_t end = 0; // end of parsed string in sequence + uint64_t begin = 0; // begin of parsed super_kmer in sequence + uint64_t end = 0; // end of parsed super_kmer in sequence uint64_t num_sequences = 0; uint64_t num_bases = 0; bool glue = false; - auto append_string = [&]() { + auto append_super_kmer = [&]() { if (sequence.empty() or prev_minimizer == constants::invalid or begin == end) return; assert(end > begin); - char const* string = sequence.data() + begin; + char const* super_kmer = sequence.data() + begin; uint64_t size = (end - begin) + k - 1; - assert(util::is_valid(string, size)); + assert(util::is_valid(super_kmer, size)); - /* if num_kmers_in_string > k - m + 1, then split the string into blocks */ - uint64_t num_kmers_in_string = end - begin; - uint64_t num_blocks = num_kmers_in_string / max_num_kmers_in_string + - (num_kmers_in_string % max_num_kmers_in_string != 0); + /* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */ + uint64_t num_kmers_in_super_kmer = end - begin; + uint64_t num_blocks = num_kmers_in_super_kmer / max_num_kmers_in_super_kmer + + (num_kmers_in_super_kmer % max_num_kmers_in_super_kmer != 0); assert(num_blocks > 0); for (uint64_t i = 0; i != num_blocks; ++i) { uint64_t n = block_size; if (i == num_blocks - 1) n = size; uint64_t num_kmers_in_block = n - k + 1; - assert(num_kmers_in_block <= max_num_kmers_in_string); + assert(num_kmers_in_block <= max_num_kmers_in_super_kmer); data.minimizers.emplace_back(prev_minimizer, builder.offset, num_kmers_in_block); - builder.append(string + i * max_num_kmers_in_string, n, glue); + builder.append(super_kmer + i * max_num_kmers_in_super_kmer, n, glue); if (glue) { assert(data.minimizers.back().offset > k - 1); data.minimizers.back().offset -= k - 1; } - size -= max_num_kmers_in_string; + size -= max_num_kmers_in_super_kmer; glue = true; } }; @@ -179,7 +179,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b if (prev_minimizer == constants::invalid) prev_minimizer = minimizer; if (minimizer != prev_minimizer) { - append_string(); + append_super_kmer(); begin = end; prev_minimizer = minimizer; glue = true; @@ -189,7 +189,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b ++end; } - append_string(); + append_super_kmer(); } builder.finalize(); @@ -198,7 +198,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, " << data.num_kmers << " kmers" << std::endl; std::cout << "num_kmers " << data.num_kmers << std::endl; - std::cout << "num_strings " << data.strings.size() << std::endl; + std::cout << "num_super_kmers " << data.strings.num_super_kmers() << std::endl; std::cout << "num_pieces " << data.strings.pieces.size() << " (+" << (2.0 * data.strings.pieces.size() * (k - 1)) / data.num_kmers << " [bits/kmer])" << std::endl; @@ -230,12 +230,11 @@ buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets) { uint64_t num_buckets = m_minimizers.size(); uint64_t num_kmers = data.num_kmers; - uint64_t num_strings = data.strings.size(); - std::vector num_strings_before_bucket(num_buckets + 1, 0); + uint64_t num_super_kmers = data.strings.num_super_kmers(); + std::vector num_super_kmers_before_bucket(num_buckets + 1, 0); pthash::compact_vector::builder offsets; - offsets.resize(num_strings, std::ceil(std::log2(data.strings.num_bits() / 2))); + offsets.resize(num_super_kmers, std::ceil(std::log2(data.strings.num_bits() / 2))); - std::cout << "num_symbols_in_string " << data.strings.num_bits() / 2 << std::endl; std::cout << "bits_per_offset = ceil(log2(" << data.strings.num_bits() / 2 << ")) = " << std::ceil(std::log2(data.strings.num_bits() / 2)) << std::endl; @@ -243,39 +242,40 @@ buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, assert(it.list().size() > 0); if (it.list().size() != 1) { uint64_t bucket_id = m_minimizers.lookup(it.minimizer()); - num_strings_before_bucket[bucket_id + 1] = it.list().size() - 1; + num_super_kmers_before_bucket[bucket_id + 1] = it.list().size() - 1; } - // else: it.list().size() = 1 and num_strings_before_bucket[bucket_id + 1] is already 0 + // else: it.list().size() = 1 and num_super_kmers_before_bucket[bucket_id + 1] is already 0 } - std::partial_sum(num_strings_before_bucket.begin(), num_strings_before_bucket.end(), - num_strings_before_bucket.begin()); + std::partial_sum(num_super_kmers_before_bucket.begin(), num_super_kmers_before_bucket.end(), + num_super_kmers_before_bucket.begin()); - buckets_statistics buckets_stats(num_buckets, num_kmers, num_strings); + buckets_statistics buckets_stats(num_buckets, num_kmers, num_super_kmers); uint64_t num_singletons = 0; for (auto it = data.minimizers.begin(); it.has_next(); it.next()) { uint64_t bucket_id = m_minimizers.lookup(it.minimizer()); - uint64_t base = num_strings_before_bucket[bucket_id] + bucket_id; - uint64_t num_strings_in_bucket = - (num_strings_before_bucket[bucket_id + 1] + bucket_id + 1) - base; - assert(num_strings_in_bucket > 0); - if (num_strings_in_bucket == 1) ++num_singletons; - buckets_stats.add_num_strings_in_bucket(num_strings_in_bucket); + uint64_t base = num_super_kmers_before_bucket[bucket_id] + bucket_id; + uint64_t num_super_kmers_in_bucket = + (num_super_kmers_before_bucket[bucket_id + 1] + bucket_id + 1) - base; + assert(num_super_kmers_in_bucket > 0); + if (num_super_kmers_in_bucket == 1) ++num_singletons; + buckets_stats.add_num_super_kmers_in_bucket(num_super_kmers_in_bucket); uint64_t offset_pos = 0; auto list = it.list(); - for (auto [offset, num_kmers_in_string] : list) { + for (auto [offset, num_kmers_in_super_kmer] : list) { offsets.set(base + offset_pos++, offset); - buckets_stats.add_num_kmers_in_string(num_strings_in_bucket, num_kmers_in_string); + buckets_stats.add_num_kmers_in_super_kmer(num_super_kmers_in_bucket, + num_kmers_in_super_kmer); } - assert(offset_pos == num_strings_in_bucket); + assert(offset_pos == num_super_kmers_in_bucket); } std::cout << "num_singletons " << num_singletons << "/" << num_buckets << " (" << (num_singletons * 100.0) / num_buckets << "%)" << std::endl; m_buckets.pieces.encode(data.strings.pieces.begin(), data.strings.pieces.size()); - m_buckets.num_strings_before_bucket.encode(num_strings_before_bucket.begin(), - num_strings_before_bucket.size()); + m_buckets.num_super_kmers_before_bucket.encode(num_super_kmers_before_bucket.begin(), + num_super_kmers_before_bucket.size()); offsets.build(m_buckets.offsets); m_buckets.strings.swap(data.strings.strings); @@ -288,13 +288,13 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& uint64_t min_log2_size = m_skew_index.min_log2; uint64_t max_log2_size = m_skew_index.max_log2; - uint64_t max_num_strings_in_bucket = buckets_stats.max_num_strings_in_bucket(); - m_skew_index.log2_max_num_strings_in_bucket = - std::ceil(std::log2(buckets_stats.max_num_strings_in_bucket())); + uint64_t max_num_super_kmers_in_bucket = buckets_stats.max_num_super_kmers_in_bucket(); + m_skew_index.log2_max_num_super_kmers_in_bucket = + std::ceil(std::log2(buckets_stats.max_num_super_kmers_in_bucket())); - std::cout << "max_num_strings_in_bucket " << max_num_strings_in_bucket << std::endl; - std::cout << "log2_max_num_strings_in_bucket " << m_skew_index.log2_max_num_strings_in_bucket - << std::endl; + std::cout << "max_num_super_kmers_in_bucket " << max_num_super_kmers_in_bucket << std::endl; + std::cout << "log2_max_num_super_kmers_in_bucket " + << m_skew_index.log2_max_num_super_kmers_in_bucket << std::endl; uint64_t num_buckets_in_skew_index = 0; for (auto it = data.minimizers.begin(); it.has_next(); it.next()) { @@ -318,8 +318,8 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& [](list_type const& x, list_type const& y) { return x.size() < y.size(); }); uint64_t num_partitions = max_log2_size - min_log2_size + 1; - if (buckets_stats.max_num_strings_in_bucket() < (1ULL << max_log2_size)) { - num_partitions = m_skew_index.log2_max_num_strings_in_bucket - min_log2_size; + if (buckets_stats.max_num_super_kmers_in_bucket() < (1ULL << max_log2_size)) { + num_partitions = m_skew_index.log2_max_num_super_kmers_in_bucket - min_log2_size; } std::cout << "num_partitions " << num_partitions << std::endl; @@ -353,7 +353,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& lower = upper; upper = 2 * lower; - if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket; + if (partition_id == num_partitions - 1) upper = max_num_super_kmers_in_bucket; /* If still larger than upper, then there is an empty bucket @@ -368,9 +368,9 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& } assert(lists[i].size() > lower and lists[i].size() <= upper); - for (auto [offset, num_kmers_in_string] : lists[i]) { + for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { (void)offset; // unused - num_kmers_in_partition[partition_id] += num_kmers_in_string; + num_kmers_in_partition[partition_id] += num_kmers_in_super_kmer; } } assert(partition_id == num_partitions); @@ -399,11 +399,11 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& uint64_t upper = 2 * lower; uint64_t num_bits_per_pos = min_log2_size + 1; - /* tmp storage for keys and string_ids ******/ + /* tmp storage for keys and super_kmer_ids ******/ std::vector keys_in_partition; - std::vector string_ids_in_partition; + std::vector super_kmer_ids_in_partition; keys_in_partition.reserve(num_kmers_in_partition[partition_id]); - string_ids_in_partition.reserve(num_kmers_in_partition[partition_id]); + super_kmer_ids_in_partition.reserve(num_kmers_in_partition[partition_id]); pthash::compact_vector::builder cvb_positions; cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); /*******/ @@ -415,7 +415,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& auto& mphf = m_skew_index.mphfs[partition_id]; assert(num_kmers_in_partition[partition_id] == keys_in_partition.size()); - assert(num_kmers_in_partition[partition_id] == string_ids_in_partition.size()); + assert(num_kmers_in_partition[partition_id] == super_kmer_ids_in_partition.size()); mphf.build_in_internal_memory(keys_in_partition.begin(), keys_in_partition.size(), mphf_config); @@ -426,8 +426,8 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& for (uint64_t i = 0; i != keys_in_partition.size(); ++i) { uint64_t kmer = keys_in_partition[i]; uint64_t pos = mphf(kmer); - uint32_t string_id = string_ids_in_partition[i]; - cvb_positions.set(pos, string_id); + uint32_t super_kmer_id = super_kmer_ids_in_partition[i]; + cvb_positions.set(pos, super_kmer_id); } auto& positions = m_skew_index.positions[partition_id]; cvb_positions.build(positions); @@ -444,29 +444,29 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& upper = 2 * lower; num_bits_per_pos += 1; if (partition_id == num_partitions - 1) { - upper = max_num_strings_in_bucket; - num_bits_per_pos = m_skew_index.log2_max_num_strings_in_bucket; + upper = max_num_super_kmers_in_bucket; + num_bits_per_pos = m_skew_index.log2_max_num_super_kmers_in_bucket; } keys_in_partition.clear(); - string_ids_in_partition.clear(); + super_kmer_ids_in_partition.clear(); keys_in_partition.reserve(num_kmers_in_partition[partition_id]); - string_ids_in_partition.reserve(num_kmers_in_partition[partition_id]); + super_kmer_ids_in_partition.reserve(num_kmers_in_partition[partition_id]); cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos); } assert(lists[i].size() > lower and lists[i].size() <= upper); - uint64_t string_id = 0; - for (auto [offset, num_kmers_in_string] : lists[i]) { + uint64_t super_kmer_id = 0; + for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); - for (uint64_t i = 0; i != num_kmers_in_string; ++i) { + for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) { uint64_t kmer = bv_it.read(2 * build_config.k); keys_in_partition.push_back(kmer); - string_ids_in_partition.push_back(string_id); + super_kmer_ids_in_partition.push_back(super_kmer_id); bv_it.eat(2); } - assert(string_id < (1ULL << cvb_positions.width())); - ++string_id; + assert(super_kmer_id < (1ULL << cvb_positions.width())); + ++super_kmer_id; } } assert(partition_id == num_partitions); diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 864cf52..dd1d95d 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -22,11 +22,10 @@ struct compact_string_pool { compact_string_pool() {} struct builder { - builder(uint64_t k) : k(k), offset(0), num_strings(0) {} + builder(uint64_t k) : k(k), offset(0), num_super_kmers(0) {} void build(compact_string_pool& pool) { - pool.k = k; - pool.num_strings = num_strings; + pool.m_num_super_kmers = num_super_kmers; pool.pieces.swap(pieces); pool.strings.build(&bvb_strings); } @@ -43,7 +42,7 @@ struct compact_string_pool { for (uint64_t i = prefix; i != size; ++i) { bvb_strings.append_bits(util::char_to_uint64(string[i]), 2); } - num_strings += 1; + num_super_kmers += 1; offset = bvb_strings.size() / 2; } @@ -59,29 +58,30 @@ struct compact_string_pool { uint64_t k; uint64_t offset; - uint64_t num_strings; + uint64_t num_super_kmers; std::vector pieces; pthash::bit_vector_builder bvb_strings; }; uint64_t num_bits() const { return strings.size(); } - uint64_t size() const { return num_strings; } + uint64_t num_super_kmers() const { return m_num_super_kmers; } - uint64_t k; - uint64_t num_strings; std::vector pieces; pthash::bit_vector strings; + +private: + uint64_t m_num_super_kmers; }; -typedef uint8_t num_kmers_in_string_uint_type; +typedef uint8_t num_kmers_in_super_kmer_uint_type; #pragma pack(push, 1) struct minimizer_tuple { - minimizer_tuple(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_string) - : minimizer(minimizer), offset(offset), num_kmers_in_string(num_kmers_in_string) {} + minimizer_tuple(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_super_kmer) + : minimizer(minimizer), offset(offset), num_kmers_in_super_kmer(num_kmers_in_super_kmer) {} uint64_t minimizer; uint64_t offset; - num_kmers_in_string_uint_type num_kmers_in_string; + num_kmers_in_super_kmer_uint_type num_kmers_in_super_kmer; }; #pragma pack(pop) @@ -94,7 +94,7 @@ struct list_type { iterator(std::vector::iterator begin) : m_begin(begin) {} inline std::pair operator*() const { - return {(*m_begin).offset, (*m_begin).num_kmers_in_string}; + return {(*m_begin).offset, (*m_begin).num_kmers_in_super_kmer}; } inline void operator++() { ++m_begin; } @@ -121,8 +121,8 @@ struct minimizers_tuples { // tuples.reserve(n); // } - void emplace_back(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_string) { - tuples.emplace_back(minimizer, offset, num_kmers_in_string); + void emplace_back(uint64_t minimizer, uint64_t offset, uint64_t num_kmers_in_super_kmer) { + tuples.emplace_back(minimizer, offset, num_kmers_in_super_kmer); } minimizer_tuple& back() { return tuples.back(); } diff --git a/include/info.cpp b/include/info.cpp index a00bb7d..3f58921 100644 --- a/include/info.cpp +++ b/include/info.cpp @@ -18,7 +18,6 @@ uint64_t skew_index::print_info() const { num_kmers_in_skew_index += n; lower = upper; upper = 2 * lower; - // if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket; } return num_kmers_in_skew_index; } @@ -31,8 +30,8 @@ void dictionary::print_space_breakdown() const { << " [bits/kmer]\n"; std::cout << " pieces: " << static_cast(m_buckets.pieces.num_bits()) / size() << " [bits/kmer]\n"; - std::cout << " num_strings_before_bucket: " - << static_cast(m_buckets.num_strings_before_bucket.num_bits()) / size() + std::cout << " num_super_kmers_before_bucket: " + << static_cast(m_buckets.num_super_kmers_before_bucket.num_bits()) / size() << " [bits/kmer]\n"; std::cout << " offsets: " << static_cast(8 * m_buckets.offsets.bytes()) / size() << " [bits/kmer]\n"; @@ -57,10 +56,9 @@ void dictionary::print_info() const { std::cout << "canonicalized = " << (canonicalized() ? "true" : "false") << '\n'; std::cout << "weighted = " << (weighted() ? "true" : "false") << '\n'; - std::cout << "num_strings = " << m_buckets.offsets.size() << '\n'; + std::cout << "num_super_kmers = " << m_buckets.offsets.size() << '\n'; std::cout << "num_pieces = " << m_buckets.pieces.size() << " (+" << (2.0 * m_buckets.pieces.size() * (k() - 1)) / size() << " [bits/kmer])" << '\n'; - std::cout << "num_symbols_in_string = " << m_buckets.strings.size() / 2 << '\n'; std::cout << "bits_per_offset = ceil(log2(" << m_buckets.strings.size() / 2 << ")) = " << std::ceil(std::log2(m_buckets.strings.size() / 2)) << '\n'; uint64_t num_kmers_in_skew_index = m_skew_index.print_info(); diff --git a/include/lookup.cpp b/include/lookup.cpp index 6fdf3d7..45435a2 100644 --- a/include/lookup.cpp +++ b/include/lookup.cpp @@ -9,13 +9,13 @@ uint64_t dictionary::lookup_uint64_regular_parsing(uint64_t uint64_kmer) const { if (m_skew_index.empty()) return m_buckets.lookup(bucket_id, uint64_kmer, m_k, m_m); auto [begin, end] = m_buckets.locate_bucket(bucket_id); - uint64_t num_strings_in_bucket = end - begin; - uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket); - if (log2_num_strings_in_bucket > m_skew_index.min_log2) { - uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_strings_in_bucket); - /* It must hold pos < num_strings_in_bucket for the kmer to exist. */ - if (pos < num_strings_in_bucket) { - return m_buckets.lookup_in_string(begin + pos, uint64_kmer, m_k, m_m); + uint64_t num_super_kmers_in_bucket = end - begin; + uint64_t log2_num_super_kmers_in_bucket = util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_num_super_kmers_in_bucket > m_skew_index.min_log2) { + uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_super_kmers_in_bucket); + /* It must hold pos < num_super_kmers_in_bucket for the kmer to exist. */ + if (pos < num_super_kmers_in_bucket) { + return m_buckets.lookup_in_super_kmer(begin + pos, uint64_kmer, m_k, m_m); } return constants::invalid; } @@ -34,17 +34,18 @@ uint64_t dictionary::lookup_uint64_canonical_parsing(uint64_t uint64_kmer) const } auto [begin, end] = m_buckets.locate_bucket(bucket_id); - uint64_t num_strings_in_bucket = end - begin; - uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket); - if (log2_num_strings_in_bucket > m_skew_index.min_log2) { - uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_strings_in_bucket); - if (pos < num_strings_in_bucket) { - uint64_t kmer_id = m_buckets.lookup_in_string(begin + pos, uint64_kmer, m_k, m_m); + uint64_t num_super_kmers_in_bucket = end - begin; + uint64_t log2_num_super_kmers_in_bucket = util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_num_super_kmers_in_bucket > m_skew_index.min_log2) { + uint64_t pos = m_skew_index.lookup(uint64_kmer, log2_num_super_kmers_in_bucket); + if (pos < num_super_kmers_in_bucket) { + uint64_t kmer_id = m_buckets.lookup_in_super_kmer(begin + pos, uint64_kmer, m_k, m_m); if (kmer_id != constants::invalid) return kmer_id; } - uint64_t pos_rc = m_skew_index.lookup(uint64_kmer_rc, log2_num_strings_in_bucket); - if (pos_rc < num_strings_in_bucket) { - uint64_t kmer_id = m_buckets.lookup_in_string(begin + pos_rc, uint64_kmer_rc, m_k, m_m); + uint64_t pos_rc = m_skew_index.lookup(uint64_kmer_rc, log2_num_super_kmers_in_bucket); + if (pos_rc < num_super_kmers_in_bucket) { + uint64_t kmer_id = + m_buckets.lookup_in_super_kmer(begin + pos_rc, uint64_kmer_rc, m_k, m_m); return kmer_id; } return constants::invalid; diff --git a/include/query/membership_query_canonical_parsing.cpp b/include/query/membership_query_canonical_parsing.cpp index f224807..6f96873 100644 --- a/include/query/membership_query_canonical_parsing.cpp +++ b/include/query/membership_query_canonical_parsing.cpp @@ -146,17 +146,19 @@ struct membership_query_canonical_parsing { int is_member() { bool check_minimizer = !same_minimizer(); if (!m_dict->m_skew_index.empty()) { - uint64_t num_strings_in_bucket = m_end - m_begin; - uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket); - if (log2_num_strings_in_bucket > (m_dict->m_skew_index).min_log2) { - uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_strings_in_bucket); - if (p < num_strings_in_bucket) { + uint64_t num_super_kmers_in_bucket = m_end - m_begin; + uint64_t log2_num_super_kmers_in_bucket = + util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_num_super_kmers_in_bucket > (m_dict->m_skew_index).min_log2) { + uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_super_kmers_in_bucket); + if (p < num_super_kmers_in_bucket) { int ret = is_member(m_begin + p, m_begin + p + 1, check_minimizer); if (ret != return_value::KMER_NOT_FOUND) return ret; check_minimizer = false; } - uint64_t p_rc = m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_strings_in_bucket); - if (p_rc < num_strings_in_bucket) { + uint64_t p_rc = + m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_super_kmers_in_bucket); + if (p_rc < num_super_kmers_in_bucket) { int ret = is_member(m_begin + p_rc, m_begin + p_rc + 1, check_minimizer); if (ret != return_value::KMER_NOT_FOUND) return ret; } @@ -167,8 +169,8 @@ struct membership_query_canonical_parsing { } int is_member(uint64_t begin, uint64_t end, bool check_minimizer) { - for (uint64_t string_id = begin; string_id != end; ++string_id) { - uint64_t offset = (m_dict->m_buckets).offsets.access(string_id); + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = (m_dict->m_buckets).offsets.access(super_kmer_id); uint64_t pos_in_string = 2 * offset; m_reverse = false; m_string_iterator.at(pos_in_string); @@ -180,7 +182,7 @@ struct membership_query_canonical_parsing { while (m_pos_in_window != m_window_size) { uint64_t val = m_string_iterator.read(2 * m_k); - if (check_minimizer and string_id == begin and m_pos_in_window == 0) { + if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { uint64_t val_rc = util::compute_reverse_complement(val, m_k); uint64_t minimizer = std::min(util::compute_minimizer(val, m_k, m_m, m_seed), diff --git a/include/query/membership_query_regular_parsing.cpp b/include/query/membership_query_regular_parsing.cpp index 80e2ba4..877e050 100644 --- a/include/query/membership_query_regular_parsing.cpp +++ b/include/query/membership_query_regular_parsing.cpp @@ -181,11 +181,12 @@ struct membership_query_regular_parsing { int is_member() { bool check_minimizer = !same_minimizer(); if (!m_dict->m_skew_index.empty()) { - uint64_t num_strings_in_bucket = m_end - m_begin; - uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket); - if (log2_num_strings_in_bucket > (m_dict->m_skew_index).min_log2) { - uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_strings_in_bucket); - if (p < num_strings_in_bucket) { + uint64_t num_super_kmers_in_bucket = m_end - m_begin; + uint64_t log2_num_super_kmers_in_bucket = + util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_num_super_kmers_in_bucket > (m_dict->m_skew_index).min_log2) { + uint64_t p = m_dict->m_skew_index.lookup(m_kmer, log2_num_super_kmers_in_bucket); + if (p < num_super_kmers_in_bucket) { int ret = is_member(m_begin + p, m_begin + p + 1, check_minimizer); if (ret != return_value::KMER_NOT_FOUND) return ret; } @@ -198,11 +199,12 @@ struct membership_query_regular_parsing { int is_member_rc() { bool check_minimizer = !same_minimizer_rc(); if (!m_dict->m_skew_index.empty()) { - uint64_t num_strings_in_bucket = m_end - m_begin; - uint64_t log2_num_strings_in_bucket = util::ceil_log2_uint32(num_strings_in_bucket); - if (log2_num_strings_in_bucket > (m_dict->m_skew_index).min_log2) { - uint64_t p = m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_strings_in_bucket); - if (p < num_strings_in_bucket) { + uint64_t num_super_kmers_in_bucket = m_end - m_begin; + uint64_t log2_num_super_kmers_in_bucket = + util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_num_super_kmers_in_bucket > (m_dict->m_skew_index).min_log2) { + uint64_t p = m_dict->m_skew_index.lookup(m_kmer_rc, log2_num_super_kmers_in_bucket); + if (p < num_super_kmers_in_bucket) { int ret = is_member_rc(m_begin + p, m_begin + p + 1, check_minimizer); if (ret != return_value::KMER_NOT_FOUND) return ret; } @@ -213,8 +215,8 @@ struct membership_query_regular_parsing { } int is_member(uint64_t begin, uint64_t end, bool check_minimizer) { - for (uint64_t string_id = begin; string_id != end; ++string_id) { - uint64_t offset = (m_dict->m_buckets).offsets.access(string_id); + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = (m_dict->m_buckets).offsets.access(super_kmer_id); uint64_t pos_in_string = 2 * offset; m_reverse = false; m_string_iterator.at(pos_in_string); @@ -226,7 +228,7 @@ struct membership_query_regular_parsing { while (m_pos_in_window != m_window_size) { uint64_t val = m_string_iterator.read(2 * m_k); - if (check_minimizer and string_id == begin and m_pos_in_window == 0) { + if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); if (minimizer != m_curr_minimizer) return return_value::MINIMIZER_NOT_FOUND; } @@ -247,8 +249,8 @@ struct membership_query_regular_parsing { } int is_member_rc(uint64_t begin, uint64_t end, bool check_minimizer) { - for (uint64_t string_id = begin; string_id != end; ++string_id) { - uint64_t offset = (m_dict->m_buckets).offsets.access(string_id); + for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { + uint64_t offset = (m_dict->m_buckets).offsets.access(super_kmer_id); uint64_t pos_in_string = 2 * offset; m_reverse = false; m_string_iterator.at(pos_in_string); @@ -260,7 +262,7 @@ struct membership_query_regular_parsing { while (m_pos_in_window != m_window_size) { uint64_t val = m_string_iterator.read(2 * m_k); - if (check_minimizer and string_id == begin and m_pos_in_window == 0) { + if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); if (minimizer != m_curr_minimizer_rc) { return return_value::MINIMIZER_NOT_FOUND; diff --git a/include/skew_index.hpp b/include/skew_index.hpp index d8afa0c..1664299 100644 --- a/include/skew_index.hpp +++ b/include/skew_index.hpp @@ -8,7 +8,7 @@ struct skew_index { skew_index() : min_log2(constants::min_l) , max_log2(constants::max_l) - , log2_max_num_strings_in_bucket(0) { + , log2_max_num_super_kmers_in_bucket(0) { mphfs.resize(0); positions.resize(0); } @@ -18,12 +18,12 @@ struct skew_index { bool empty() const { return mphfs.empty(); } - uint64_t lookup(uint64_t uint64_kmer, uint64_t log2_num_strings_in_bucket) const { - assert(log2_num_strings_in_bucket >= uint64_t(min_log2 + 1)); - assert(log2_num_strings_in_bucket <= log2_max_num_strings_in_bucket); - uint64_t partition_id = log2_num_strings_in_bucket - (min_log2 + 1); - if (log2_num_strings_in_bucket == log2_max_num_strings_in_bucket or - log2_num_strings_in_bucket > max_log2) { + uint64_t lookup(uint64_t uint64_kmer, uint64_t log2_num_super_kmers_in_bucket) const { + assert(log2_num_super_kmers_in_bucket >= uint64_t(min_log2 + 1)); + assert(log2_num_super_kmers_in_bucket <= log2_max_num_super_kmers_in_bucket); + uint64_t partition_id = log2_num_super_kmers_in_bucket - (min_log2 + 1); + if (log2_num_super_kmers_in_bucket == log2_max_num_super_kmers_in_bucket or + log2_num_super_kmers_in_bucket > max_log2) { partition_id = positions.size() - 1; } auto const& mphf = mphfs[partition_id]; @@ -34,7 +34,7 @@ struct skew_index { uint64_t num_bits() const { uint64_t n = - (sizeof(min_log2) + sizeof(max_log2) + sizeof(log2_max_num_strings_in_bucket)) * 8; + (sizeof(min_log2) + sizeof(max_log2) + sizeof(log2_max_num_super_kmers_in_bucket)) * 8; for (uint64_t partition_id = 0; partition_id != mphfs.size(); ++partition_id) { auto const& mphf = mphfs[partition_id]; auto const& P = positions[partition_id]; @@ -47,14 +47,14 @@ struct skew_index { void visit(Visitor& visitor) { visitor.visit(min_log2); visitor.visit(max_log2); - visitor.visit(log2_max_num_strings_in_bucket); + visitor.visit(log2_max_num_super_kmers_in_bucket); visitor.visit(mphfs); visitor.visit(positions); } uint16_t min_log2; uint16_t max_log2; - uint32_t log2_max_num_strings_in_bucket; + uint32_t log2_max_num_super_kmers_in_bucket; std::vector mphfs; std::vector positions; }; diff --git a/include/util.hpp b/include/util.hpp index f0fe172..4dadba4 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -92,40 +92,42 @@ struct buckets_statistics { static const uint64_t max_bucket_size = 4 * 1024; static const uint64_t max_string_size = 256; - buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_strings = 0) + buckets_statistics(uint64_t num_buckets, uint64_t num_kmers, uint64_t num_super_kmers = 0) : m_num_buckets(num_buckets) , m_num_kmers(num_kmers) - // , m_num_strings(num_strings) - , m_max_num_kmers_in_string(0) - , m_max_num_strings_in_bucket(0) { - (void)num_strings; + // , m_num_super_kmers(num_super_kmers) + , m_max_num_kmers_in_super_kmer(0) + , m_max_num_super_kmers_in_bucket(0) { + (void)num_super_kmers; m_bucket_sizes.resize(max_bucket_size + 1, 0); m_total_kmers.resize(max_bucket_size + 1, 0); m_string_sizes.resize(max_string_size + 1, 0); } - void add_num_strings_in_bucket(uint64_t num_strings_in_bucket) { - if (num_strings_in_bucket < max_bucket_size + 1) { - m_bucket_sizes[num_strings_in_bucket] += 1; + void add_num_super_kmers_in_bucket(uint64_t num_super_kmers_in_bucket) { + if (num_super_kmers_in_bucket < max_bucket_size + 1) { + m_bucket_sizes[num_super_kmers_in_bucket] += 1; } } - void add_num_kmers_in_string(uint64_t num_strings_in_bucket, uint64_t num_kmers_in_string) { - if (num_strings_in_bucket < max_bucket_size + 1) { - m_total_kmers[num_strings_in_bucket] += num_kmers_in_string; + void add_num_kmers_in_super_kmer(uint64_t num_super_kmers_in_bucket, + uint64_t num_kmers_in_super_kmer) { + if (num_super_kmers_in_bucket < max_bucket_size + 1) { + m_total_kmers[num_super_kmers_in_bucket] += num_kmers_in_super_kmer; } - if (num_kmers_in_string > m_max_num_kmers_in_string) { - m_max_num_kmers_in_string = num_kmers_in_string; + if (num_kmers_in_super_kmer > m_max_num_kmers_in_super_kmer) { + m_max_num_kmers_in_super_kmer = num_kmers_in_super_kmer; } - if (num_strings_in_bucket > m_max_num_strings_in_bucket) { - m_max_num_strings_in_bucket = num_strings_in_bucket; + if (num_super_kmers_in_bucket > m_max_num_super_kmers_in_bucket) { + m_max_num_super_kmers_in_bucket = num_super_kmers_in_bucket; } - if (num_kmers_in_string < max_string_size + 1) m_string_sizes[num_kmers_in_string] += 1; + if (num_kmers_in_super_kmer < max_string_size + 1) + m_string_sizes[num_kmers_in_super_kmer] += 1; } uint64_t num_kmers() const { return m_num_kmers; } uint64_t num_buckets() const { return m_num_buckets; } - uint64_t max_num_strings_in_bucket() const { return m_max_num_strings_in_bucket; } + uint64_t max_num_super_kmers_in_bucket() const { return m_max_num_super_kmers_in_bucket; } void print() const { // full statistics @@ -135,7 +137,7 @@ struct buckets_statistics { // bucket_size != max_bucket_size + 1; ++bucket_size) { // if (m_bucket_sizes[bucket_size] > 0) { // std::cout << "buckets with " << bucket_size - // << " strings=" << m_bucket_sizes[bucket_size] << "(" + // << " super_kmers=" << m_bucket_sizes[bucket_size] << "(" // << (m_bucket_sizes[bucket_size] * 100.0) / m_num_buckets // << "%)|total_kmers=" << m_total_kmers[bucket_size] << "(" // << (m_total_kmers[bucket_size] * 100.0) / m_num_kmers << "%)" @@ -170,42 +172,44 @@ struct buckets_statistics { std::cout << " === bucket statistics (less) === \n"; for (uint64_t bucket_size = 1; bucket_size != 16 + 1; ++bucket_size) { if (m_bucket_sizes[bucket_size] > 0) { - std::cout << "buckets with " << bucket_size - << " strings = " << (m_bucket_sizes[bucket_size] * 100.0) / m_num_buckets - << "%" << std::endl; + std::cout << "buckets with " << bucket_size << " super_kmers = " + << (m_bucket_sizes[bucket_size] * 100.0) / m_num_buckets << "%" + << std::endl; } } - std::cout << "max_num_strings_in_bucket " << m_max_num_strings_in_bucket << std::endl; + std::cout << "max_num_super_kmers_in_bucket " << m_max_num_super_kmers_in_bucket + << std::endl; - // std::cout << " === string statistics === \n"; - // uint64_t total_strings = 0; + // std::cout << " === super_kmer statistics === \n"; + // uint64_t total_super_kmers = 0; // uint64_t total_kmers = 0; // for (uint64_t string_size = 1; string_size != max_string_size + 1; ++string_size) { // if (m_string_sizes[string_size] > 0) { - // std::cout << "strings with " << string_size + // std::cout << "super_kmers with " << string_size // << " kmer=" << m_string_sizes[string_size] << "(" - // << (m_string_sizes[string_size] * 100.0) / m_num_strings + // << (m_string_sizes[string_size] * 100.0) / m_num_super_kmers // << "%)|total_kmers=" << (string_size * m_string_sizes[string_size]) << // "(" // << (string_size * m_string_sizes[string_size] * 100.0) / m_num_kmers // << "%)" << std::endl; - // total_strings += m_string_sizes[string_size]; + // total_super_kmers += m_string_sizes[string_size]; // total_kmers += string_size * m_string_sizes[string_size]; // } // } - // std::cout << "total_strings " << total_strings << "/" << m_num_strings << " (" - // << (total_strings * 100.0) / m_num_strings << "%)" << std::endl; + // std::cout << "total_super_kmers " << total_super_kmers << "/" << m_num_super_kmers << "(" + // << (total_super_kmers * 100.0) / m_num_super_kmers << "%)" << std::endl; // std::cout << "total_kmers " << total_kmers << "/" << m_num_kmers << " (" // << (total_kmers * 100.0) / m_num_kmers << "%)" << std::endl; - // std::cout << "max_num_kmers_in_string " << m_max_num_kmers_in_string << std::endl; + // std::cout << "max_num_kmers_in_super_kmer " << m_max_num_kmers_in_super_kmer << + // std::endl; } private: uint64_t m_num_buckets; uint64_t m_num_kmers; - // uint64_t m_num_strings; - uint64_t m_max_num_kmers_in_string; - uint64_t m_max_num_strings_in_bucket; + // uint64_t m_num_super_kmers; + uint64_t m_max_num_kmers_in_super_kmer; + uint64_t m_max_num_super_kmers_in_bucket; std::vector m_bucket_sizes; std::vector m_total_kmers; std::vector m_string_sizes;