diff --git a/CMakeLists.txt b/CMakeLists.txt index b60e570..c6e1641 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,18 +64,10 @@ set(Z_LIB_SOURCES include/gz/zip_stream.cpp ) -set(SSHASH_SOURCES - include/dictionary.cpp - include/info.cpp - include/dump.cpp - include/statistics.cpp - include/builder/build.cpp -) # Create a static lib add_library(sshash_static STATIC ${Z_LIB_SOURCES} - ${SSHASH_SOURCES} ) add_executable(sshash src/sshash.cpp) diff --git a/external/pthash b/external/pthash index 9466359..3042587 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 94663599ab7d092a9301bf8c8709fbf558a04601 +Subproject commit 3042587a9a8d21032e06be263d1628158967d9dd diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp index 669d63b..35b4c92 100644 --- a/include/bit_vector_iterator.hpp +++ b/include/bit_vector_iterator.hpp @@ -5,6 +5,7 @@ namespace sshash { +template struct bit_vector_iterator { bit_vector_iterator() : m_bv(nullptr) {} @@ -17,80 +18,61 @@ struct bit_vector_iterator { } inline kmer_t read(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - val = m_buf & ((kmer_t(1) << l) - 1); - } else { - val = m_buf; - } + kmer_t val = m_buf; + val.take(l); return val; } inline kmer_t read_reverse(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf_reverse(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - uint64_t shift = (l >= 64) ? (constants::uint_kmer_bits - l) : 64; - val = m_buf >> shift; - } else { - val = m_buf; - } + kmer_t val = m_buf; + val.drop(kmer_t::uint_kmer_bits - l); return val; } inline void eat(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - if (l != constants::uint_kmer_bits) m_buf >>= l; + m_buf.drop(l); m_avail -= l; m_pos += l; } inline void eat_reverse(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf_reverse(); - if (l != constants::uint_kmer_bits) m_buf <<= l; + m_buf.pad(l); m_avail -= l; m_pos -= l; } - inline kmer_t read_and_advance_by_two(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + inline kmer_t read_and_advance_by_char(uint64_t l) { + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - val = m_buf & ((kmer_t(1) << l) - 1); - m_buf >>= 2; - } else { - val = m_buf; - } - m_avail -= 2; - m_pos += 2; + kmer_t val = m_buf; + val.take(l); + m_buf.drop_char(); + m_avail -= kmer_t::bits_per_char; + m_pos += kmer_t::bits_per_char; return val; } - inline kmer_t get_next_two_bits() { - if (m_avail < 2) fill_buf(); - kmer_t val = m_buf & 3; - m_buf >>= 2; - m_avail -= 2; - m_pos += 2; - return val; + inline uint64_t get_next_char() { + if (m_avail < kmer_t::bits_per_char) fill_buf(); + m_avail -= kmer_t::bits_per_char; + m_pos += kmer_t::bits_per_char; + return m_buf.pop_char(); } inline kmer_t take(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - val = m_buf & ((kmer_t(1) << l) - 1); - m_buf >>= l; - } else { - val = m_buf; - } + kmer_t val = m_buf; + val.take(l); + m_buf.drop(l); m_avail -= l; m_pos += l; return val; @@ -100,38 +82,20 @@ struct bit_vector_iterator { private: inline void fill_buf() { - if constexpr (constants::uint_kmer_bits == 64) { - m_buf = m_bv->get_word64(m_pos); - } else { - assert(constants::uint_kmer_bits == 128); - m_buf = static_cast(m_bv->get_word64(m_pos)); - m_buf += static_cast(m_bv->get_word64(m_pos + 64)) << 64; + static_assert(kmer_t::uint_kmer_bits % 64 == 0); + for (int i = kmer_t::uint_kmer_bits - 64; i >= 0; i -= 64) { + if (m_pos + i < m_bv->size()) { m_buf.append64(m_bv->get_word64(m_pos + i)); } } - m_avail = constants::uint_kmer_bits; + m_avail = kmer_t::uint_kmer_bits; } inline void fill_buf_reverse() { - if constexpr (constants::uint_kmer_bits == 64) { - if (m_pos < 64) { - m_buf = m_bv->get_word64(0); - m_avail = m_pos; - m_buf <<= (64 - m_pos); - return; - } - m_buf = m_bv->get_word64(m_pos - 64); - } else { - assert(constants::uint_kmer_bits == 128); - if (m_pos < 128) { - m_buf = static_cast(m_bv->get_word64(0)) << 64; - m_buf += static_cast(m_bv->get_word64(64)); - m_avail = m_pos; - m_buf <<= (128 - m_pos); - return; - } - m_buf = static_cast(m_bv->get_word64(m_pos - 128)) << 64; - m_buf += static_cast(m_bv->get_word64(m_pos - 64)); + static_assert(kmer_t::uint_kmer_bits % 64 == 0); + for (int i = kmer_t::uint_kmer_bits; i > 0; i -= 64) { + m_buf.append64(m_bv->get_word64(std::max(m_pos, kmer_t::uint_kmer_bits) - i)); } - m_avail = constants::uint_kmer_bits; + m_avail = std::min(m_pos, kmer_t::uint_kmer_bits); + m_buf.pad(kmer_t::uint_kmer_bits - m_avail); } pthash::bit_vector const* m_bv; diff --git a/include/bitpack.hpp b/include/bitpack.hpp new file mode 100644 index 0000000..95d8084 --- /dev/null +++ b/include/bitpack.hpp @@ -0,0 +1,61 @@ +#pragma once + +namespace sshash { +// full binary tree of given height +// with Int type in its leafs +template +struct bitpack { + static_assert(height > 0); + using halfpack = std::conditional_t>; + static constexpr uint16_t hsize = 8 * sizeof(halfpack); + halfpack a, b; + + bitpack() {} + bitpack(uint64_t x) : a(x), b(0) {} + bitpack(halfpack a, halfpack b) : a(a), b(b) {} + explicit operator uint64_t() const { return (uint64_t)a; } + + bool operator==(bitpack const& t) const { return std::pair{a, b} == std::pair{t.a, t.b}; } + bool operator!=(bitpack const& t) const { return std::pair{a, b} != std::pair{t.a, t.b}; } + bool operator<(bitpack const& t) const { return std::pair{a, b} < std::pair{t.a, t.b}; } + + // shift in [0, size) + bitpack& operator>>=(uint16_t shift) { + if (shift < hsize) { + a = (a >> shift) | (b << (hsize - shift)); + b >>= shift; + } else { + a = b >> (shift - hsize); + b = 0; + } + return *this; + } + bitpack& operator<<=(uint16_t shift) { + if (shift < hsize) { + b = (b << shift) | (a >> (hsize - shift)); + a <<= shift; + } else { + b = a << (shift - hsize); + a = 0; + } + return *this; + } + bitpack operator<<(uint16_t shift) const { return bitpack(*this) <<= shift; } + bitpack operator>>(uint16_t shift) const { return bitpack(*this) >>= shift; } + + bitpack& operator|=(bitpack const& t) { + a |= t.a; + b |= t.b; + return *this; + } + bitpack& operator&=(bitpack const& t) { + a &= t.a; + b &= t.b; + return *this; + } + bitpack operator|(bitpack const& t) const { return bitpack(*this) |= t; } + bitpack operator&(bitpack const& t) const { return bitpack(*this) &= t; } + + bitpack operator~() const { return {~a, ~b}; } +}; +} // namespace sshash diff --git a/include/buckets.hpp b/include/buckets.hpp index 880c7e4..743c0d0 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -6,6 +6,7 @@ namespace sshash { +template struct buckets { std::pair offset_to_id(uint64_t offset, uint64_t k) const { auto [pos, contig_begin, contig_end] = pieces.locate(offset); @@ -56,14 +57,14 @@ struct buckets { kmer_t contig_prefix(uint64_t contig_id, uint64_t k) const { uint64_t contig_begin = pieces.access(contig_id); - bit_vector_iterator bv_it(strings, 2 * contig_begin); - return bv_it.read(2 * (k - 1)); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * contig_begin); + return bv_it.read(kmer_t::bits_per_char * (k - 1)); } kmer_t contig_suffix(uint64_t contig_id, uint64_t k) const { uint64_t contig_end = pieces.access(contig_id + 1); - bit_vector_iterator bv_it(strings, 2 * (contig_end - k + 1)); - return bv_it.read(2 * (k - 1)); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * (contig_end - k + 1)); + return bv_it.read(kmer_t::bits_per_char * (k - 1)); } std::pair locate_bucket(uint64_t bucket_id) const { @@ -94,10 +95,10 @@ struct buckets { uint64_t m) const { uint64_t offset = offsets.access(super_kmer_id); auto [res, contig_end] = offset_to_id(offset, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { - kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k); + kmer_t read_kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * k); if (read_kmer == target_kmer) { res.kmer_id += w; res.kmer_id_in_contig += w; @@ -119,10 +120,10 @@ struct buckets { for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { uint64_t offset = offsets.access(super_kmer_id); auto [res, contig_end] = offset_to_id(offset, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { - kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k); + kmer_t read_kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * k); if (read_kmer == target_kmer) { res.kmer_id += w; res.kmer_id_in_contig += w; @@ -170,8 +171,8 @@ struct buckets { void access(uint64_t kmer_id, char* string_kmer, uint64_t k) const { uint64_t offset = id_to_offset(kmer_id, k); - bit_vector_iterator bv_it(strings, 2 * offset); - kmer_t read_kmer = bv_it.read(2 * k); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * offset); + kmer_t read_kmer = bv_it.read(kmer_t::bits_per_char * k); util::uint_kmer_to_string(read_kmer, string_kmer, k); } @@ -186,7 +187,7 @@ struct buckets { , m_end_kmer_id(end_kmer_id) , m_k(k) // { - m_bv_it = bit_vector_iterator(m_buckets->strings, -1); + m_bv_it = bit_vector_iterator(m_buckets->strings, -1); m_offset = m_buckets->id_to_offset(m_begin_kmer_id, k); auto [pos, piece_end] = m_buckets->pieces.next_geq(m_offset); if (piece_end == m_offset) pos += 1; @@ -209,12 +210,12 @@ struct buckets { util::uint_kmer_to_string(m_read_kmer, m_ret.second.data(), m_k); } else { memmove(m_ret.second.data(), m_ret.second.data() + 1, m_k - 1); - m_ret.second[m_k - 1] = util::uint64_to_char(m_last_two_bits); + m_ret.second[m_k - 1] = kmer_t::uint64_to_char(m_last_char); } m_clear = false; - m_read_kmer >>= 2; - m_last_two_bits = m_bv_it.get_next_two_bits(); - m_read_kmer += m_last_two_bits << (2 * (m_k - 1)); + m_read_kmer.drop_char(); + m_last_char = m_bv_it.get_next_char(); + m_read_kmer.kth_char_or(m_k - 1, m_last_char); ++m_begin_kmer_id; ++m_offset; return m_ret; @@ -230,18 +231,18 @@ struct buckets { uint64_t m_k; uint64_t m_offset; uint64_t m_next_offset; - bit_vector_iterator m_bv_it; + bit_vector_iterator m_bv_it; ef_sequence::iterator m_pieces_it; kmer_t m_read_kmer; - uint64_t m_last_two_bits; + uint64_t m_last_char; bool m_clear; void next_piece() { - m_bv_it.at(2 * m_offset); + m_bv_it.at(kmer_t::bits_per_char * m_offset); m_next_offset = m_pieces_it.next(); assert(m_next_offset > m_offset); - m_read_kmer = m_bv_it.take(2 * m_k); + m_read_kmer = m_bv_it.take(kmer_t::bits_per_char * m_k); m_clear = true; } }; diff --git a/include/builder/build.cpp b/include/builder/build.impl similarity index 91% rename from include/builder/build.cpp rename to include/builder/build.impl index 0cec24b..a30e7c4 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.impl @@ -12,16 +12,18 @@ namespace sshash { -void dictionary::build(std::string const& filename, build_configuration const& build_config) { +template +void dictionary::build(std::string const& filename, + build_configuration const& build_config) { /* Validate the build configuration. */ if (build_config.k == 0) throw std::runtime_error("k must be > 0"); - if (build_config.k > constants::max_k) { - throw std::runtime_error("k must be less <= " + std::to_string(constants::max_k) + + if (build_config.k > kmer_t::max_k) { + throw std::runtime_error("k must be less <= " + std::to_string(kmer_t::max_k) + " but got k = " + std::to_string(build_config.k)); } if (build_config.m == 0) throw std::runtime_error("m must be > 0"); - if (build_config.m > constants::max_m) { - throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) + + if (build_config.m > kmer_t::max_m) { + throw std::runtime_error("m must be less <= " + std::to_string(kmer_t::max_m) + " but got m = " + std::to_string(build_config.m)); } if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k"); @@ -41,7 +43,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b /* step 1: parse the input file and build compact string pool ***/ timer.start(); - parse_data data = parse_file(filename, build_config); + parse_data data = parse_file(filename, build_config); m_size = data.num_kmers; timer.stop(); timings.push_back(timer.elapsed()); diff --git a/include/builder/build_index.hpp b/include/builder/build_index.hpp index 4fa7548..4f6ff66 100644 --- a/include/builder/build_index.hpp +++ b/include/builder/build_index.hpp @@ -155,7 +155,9 @@ struct bucket_pairs { } }; -buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets, +template +buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, + buckets& m_buckets, build_configuration const& build_config) { uint64_t num_buckets = m_minimizers.size(); uint64_t num_kmers = data.num_kmers; diff --git a/include/builder/build_skew_index.hpp b/include/builder/build_skew_index.hpp index 75f75ae..4baf087 100644 --- a/include/builder/build_skew_index.hpp +++ b/include/builder/build_skew_index.hpp @@ -4,8 +4,9 @@ namespace sshash { -void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets, - build_configuration const& build_config, +template +void build_skew_index(skew_index& m_skew_index, parse_data& data, + buckets const& m_buckets, build_configuration const& build_config, buckets_statistics const& buckets_stats) { const uint64_t min_log2_size = m_skew_index.min_log2; const uint64_t max_log2_size = m_skew_index.max_log2; @@ -212,12 +213,13 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& assert(lists[i].size() > lower and lists[i].size() <= upper); uint64_t super_kmer_id = 0; for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, + kmer_t::bits_per_char * offset); for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) { - kmer_t kmer = bv_it.read(2 * build_config.k); + kmer_t kmer = bv_it.read(kmer_t::bits_per_char * build_config.k); keys_in_partition.push_back(kmer); super_kmer_ids_in_partition.push_back(super_kmer_id); - bv_it.eat(2); + bv_it.eat(kmer_t::bits_per_char); } assert(super_kmer_id < (1ULL << cvb_positions.width())); ++super_kmer_id; diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 3b7aa2c..9ba9b95 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -4,15 +4,18 @@ namespace sshash { +template struct parse_data { parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {} uint64_t num_kmers; minimizers_tuples minimizers; - compact_string_pool strings; + compact_string_pool strings; weights::builder weights_builder; }; -void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { +template +void parse_file(std::istream& is, parse_data& data, + build_configuration const& build_config) { uint64_t k = build_config.k; uint64_t m = build_config.m; uint64_t seed = build_config.seed; @@ -29,7 +32,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b /* fit into the wanted number of bits */ assert(max_num_kmers_in_super_kmer < (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))); - compact_string_pool::builder builder(k); + typename compact_string_pool::builder builder(k); std::string sequence; uint64_t prev_minimizer = constants::invalid_uint64; @@ -48,7 +51,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b assert(end > begin); char const* super_kmer = sequence.data() + begin; uint64_t size = (end - begin) + k - 1; - assert(util::is_valid(super_kmer, size)); + assert(util::is_valid(super_kmer, size)); /* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */ uint64_t num_kmers_in_super_kmer = end - begin; @@ -60,7 +63,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b if (i == num_blocks - 1) n = size; uint64_t num_kmers_in_block = n - k + 1; assert(num_kmers_in_block <= max_num_kmers_in_super_kmer); - data.minimizers.emplace_back(prev_minimizer, builder.offset, num_kmers_in_block); + data.minimizers.emplace_back(uint64_t(prev_minimizer), builder.offset, + num_kmers_in_block); builder.append(super_kmer + i * max_num_kmers_in_super_kmer, n, glue); if (glue) { assert(data.minimizers.back().offset > k - 1); @@ -160,14 +164,15 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b while (end != sequence.size() - k + 1) { char const* kmer = sequence.data() + end; - assert(util::is_valid(kmer, k)); - kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); - uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); + assert(util::is_valid(kmer, k)); + kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); + uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k); - uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); - minimizer = std::min(minimizer, minimizer_rc); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(k); + uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); + minimizer = std::min(minimizer, minimizer_rc); } if (prev_minimizer == constants::invalid_uint64) prev_minimizer = minimizer; @@ -205,16 +210,18 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } } -parse_data parse_file(std::string const& filename, build_configuration const& build_config) { +template +parse_data parse_file(std::string const& filename, + build_configuration const& build_config) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); std::cout << "reading file '" << filename << "'..." << std::endl; - parse_data data(build_config.tmp_dirname); + parse_data data(build_config.tmp_dirname); if (util::ends_with(filename, ".gz")) { zip_istream zis(is); - parse_file(zis, data, build_config); + parse_file(zis, data, build_config); } else { - parse_file(is, data, build_config); + parse_file(is, data, build_config); } is.close(); return data; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 09770a6..59cedd3 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -21,6 +21,7 @@ struct parse_runtime_error : public std::runtime_error { } } +template struct compact_string_pool { compact_string_pool() {} @@ -39,24 +40,24 @@ struct compact_string_pool { if (glue) { prefix = k - 1; } else { /* otherwise, start a new piece */ - pieces.push_back(bvb_strings.size() / 2); + pieces.push_back(bvb_strings.size() / kmer_t::bits_per_char); } for (uint64_t i = prefix; i != size; ++i) { - bvb_strings.append_bits(util::char_to_uint(string[i]), 2); + bvb_strings.append_bits(kmer_t::char_to_uint(string[i]), kmer_t::bits_per_char); } num_super_kmers += 1; - offset = bvb_strings.size() / 2; + offset = bvb_strings.size() / kmer_t::bits_per_char; } void finalize() { /* So pieces will be of size p+1, where p is the number of DNA sequences in the input file. */ - pieces.push_back(bvb_strings.size() / 2); + pieces.push_back(bvb_strings.size() / kmer_t::bits_per_char); assert(pieces.front() == 0); /* Push a final sentinel (dummy) kmer to avoid bounds' checking in bit_vector_iterator::fill_buf(). */ - bvb_strings.append_bits(0, 2 * k); + bvb_strings.append_bits(0, k * kmer_t::bits_per_char); } uint64_t k; diff --git a/include/constants.hpp b/include/constants.hpp index a5c191d..aa5ccf2 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -3,15 +3,6 @@ #include "kmer.hpp" namespace sshash::constants { - -constexpr uint64_t uint_kmer_bits = sizeof(kmer_t) * 8; - -/* max *odd* size that can be packed into sizeof(kmer_t)*8 bits */ -constexpr uint64_t max_k = sizeof(kmer_t) * 4 - 1; - -/* max *odd* size that can be packed into 64 bits */ -constexpr uint64_t max_m = 31; - constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t seed = 1; diff --git a/include/cover/parse_file.hpp b/include/cover/parse_file.hpp index 2b1c4c2..f9f78f0 100644 --- a/include/cover/parse_file.hpp +++ b/include/cover/parse_file.hpp @@ -159,8 +159,7 @@ struct lines_iterator { std::string read_line() { uint8_t const* begin = m_begin; - while (m_begin != m_end and *m_begin++ != '\n') - ; + while (m_begin != m_end and *m_begin++ != '\n'); if (begin == m_begin) return std::string(""); return std::string(reinterpret_cast(begin), m_begin - begin - 1); } @@ -198,6 +197,7 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) { for (auto weight : weights) output.append(std::to_string(weight) + " "); } +template void permute_and_write(std::istream& is, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation, pthash::bit_vector const& signs, uint64_t k) { @@ -255,8 +255,8 @@ void permute_and_write(std::istream& is, std::string const& output_filename, and reverse the weights in header_sequence */ dna_sequence_rc.resize(dna_sequence.size()); header_sequence_r.clear(); - util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), - dna_sequence.size()); + kmer_t::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), + dna_sequence.size()); reverse_header(header_sequence, header_sequence_r, k); dna_sequence.swap(dna_sequence_rc); header_sequence.swap(header_sequence_r); @@ -352,6 +352,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename, } } +template void permute_and_write(std::string const& input_filename, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation, pthash::bit_vector const& signs, uint64_t k) { @@ -360,9 +361,9 @@ void permute_and_write(std::string const& input_filename, std::string const& out std::cout << "reading file '" << input_filename << "'..." << std::endl; if (util::ends_with(input_filename, ".gz")) { zip_istream zis(is); - permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); } else { - permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); } is.close(); } diff --git a/include/dictionary.cpp b/include/dictionary.cpp deleted file mode 100644 index 011233d..0000000 --- a/include/dictionary.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "dictionary.hpp" - -namespace sshash { - -lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { - uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); - uint64_t bucket_id = m_minimizers.lookup(minimizer); - - if (m_skew_index.empty()) return m_buckets.lookup(bucket_id, uint_kmer, m_k, m_m); - - auto [begin, end] = m_buckets.locate_bucket(bucket_id); - uint64_t num_super_kmers_in_bucket = end - begin; - uint64_t log2_bucket_size = util::ceil_log2_uint32(num_super_kmers_in_bucket); - if (log2_bucket_size > m_skew_index.min_log2) { - uint64_t pos = m_skew_index.lookup(uint_kmer, log2_bucket_size); - /* It must hold pos < num_super_kmers_in_bucket for the kmer to exist. */ - if (pos < num_super_kmers_in_bucket) { - return m_buckets.lookup_in_super_kmer(begin + pos, uint_kmer, m_k, m_m); - } - return lookup_result(); - } - - return m_buckets.lookup(begin, end, uint_kmer, m_k, m_m); -} - -lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, m_k); - uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); - uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); - uint64_t bucket_id = m_minimizers.lookup(std::min(minimizer, minimizer_rc)); - - if (m_skew_index.empty()) { - return m_buckets.lookup_canonical(bucket_id, uint_kmer, uint_kmer_rc, m_k, m_m); - } - - auto [begin, end] = m_buckets.locate_bucket(bucket_id); - uint64_t num_super_kmers_in_bucket = end - begin; - uint64_t log2_bucket_size = util::ceil_log2_uint32(num_super_kmers_in_bucket); - if (log2_bucket_size > m_skew_index.min_log2) { - uint64_t pos = m_skew_index.lookup(uint_kmer, log2_bucket_size); - if (pos < num_super_kmers_in_bucket) { - auto res = m_buckets.lookup_in_super_kmer(begin + pos, uint_kmer, m_k, m_m); - assert(res.kmer_orientation == constants::forward_orientation); - if (res.kmer_id != constants::invalid_uint64) return res; - } - uint64_t pos_rc = m_skew_index.lookup(uint_kmer_rc, log2_bucket_size); - if (pos_rc < num_super_kmers_in_bucket) { - auto res = m_buckets.lookup_in_super_kmer(begin + pos_rc, uint_kmer_rc, m_k, m_m); - res.kmer_orientation = constants::backward_orientation; - return res; - } - return lookup_result(); - } - - return m_buckets.lookup_canonical(begin, end, uint_kmer, uint_kmer_rc, m_k, m_m); -} - -uint64_t dictionary::lookup(char const* string_kmer, bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); - return lookup_uint(uint_kmer, check_reverse_complement); -} -uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement) const { - auto res = lookup_advanced_uint(uint_kmer, check_reverse_complement); - return res.kmer_id; -} - -lookup_result dictionary::lookup_advanced(char const* string_kmer, - bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); - return lookup_advanced_uint(uint_kmer, check_reverse_complement); -} -lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, - bool check_reverse_complement) const { - if (m_canonical_parsing) return lookup_uint_canonical_parsing(uint_kmer); - auto res = lookup_uint_regular_parsing(uint_kmer); - assert(res.kmer_orientation == constants::forward_orientation); - if (check_reverse_complement and res.kmer_id == constants::invalid_uint64) { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, m_k); - res = lookup_uint_regular_parsing(uint_kmer_rc); - res.kmer_orientation = constants::backward_orientation; - } - return res; -} - -bool dictionary::is_member(char const* string_kmer, bool check_reverse_complement) const { - return lookup(string_kmer, check_reverse_complement) != constants::invalid_uint64; -} -bool dictionary::is_member_uint(kmer_t uint_kmer, bool check_reverse_complement) const { - return lookup_uint(uint_kmer, check_reverse_complement) != constants::invalid_uint64; -} - -void dictionary::access(uint64_t kmer_id, char* string_kmer) const { - assert(kmer_id < size()); - m_buckets.access(kmer_id, string_kmer, m_k); -} - -uint64_t dictionary::weight(uint64_t kmer_id) const { - assert(kmer_id < size()); - return m_weights.weight(kmer_id); -} - -uint64_t dictionary::contig_size(uint64_t contig_id) const { - assert(contig_id < num_contigs()); - auto [begin, end] = m_buckets.contig_offsets(contig_id); - uint64_t contig_length = end - begin; - assert(contig_length >= m_k); - return contig_length - m_k + 1; -} - -void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res, - bool check_reverse_complement) const { - res.forward_A = lookup_advanced_uint(suffix + (util::char_to_uint('A') << (2 * (m_k - 1))), - check_reverse_complement); - res.forward_C = lookup_advanced_uint(suffix + (util::char_to_uint('C') << (2 * (m_k - 1))), - check_reverse_complement); - res.forward_G = lookup_advanced_uint(suffix + (util::char_to_uint('G') << (2 * (m_k - 1))), - check_reverse_complement); - res.forward_T = lookup_advanced_uint(suffix + (util::char_to_uint('T') << (2 * (m_k - 1))), - check_reverse_complement); -} -void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res, - bool check_reverse_complement) const { - res.backward_A = - lookup_advanced_uint(prefix + util::char_to_uint('A'), check_reverse_complement); - res.backward_C = - lookup_advanced_uint(prefix + util::char_to_uint('C'), check_reverse_complement); - res.backward_G = - lookup_advanced_uint(prefix + util::char_to_uint('G'), check_reverse_complement); - res.backward_T = - lookup_advanced_uint(prefix + util::char_to_uint('T'), check_reverse_complement); -} - -neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer, - bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); - return kmer_forward_neighbours(uint_kmer, check_reverse_complement); -} -neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer, - bool check_reverse_complement) const { - neighbourhood res; - kmer_t suffix = uint_kmer >> 2; - forward_neighbours(suffix, res, check_reverse_complement); - return res; -} - -neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer, - bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); - return kmer_backward_neighbours(uint_kmer, check_reverse_complement); -} -neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer, - bool check_reverse_complement) const { - neighbourhood res; - kmer_t prefix = (uint_kmer << 2) & ((kmer_t(1) << (2 * m_k)) - 1); - backward_neighbours(prefix, res, check_reverse_complement); - return res; -} - -neighbourhood dictionary::kmer_neighbours(char const* string_kmer, - bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); - return kmer_neighbours(uint_kmer, check_reverse_complement); -} -neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer, bool check_reverse_complement) const { - neighbourhood res; - kmer_t suffix = uint_kmer >> 2; - forward_neighbours(suffix, res, check_reverse_complement); - kmer_t prefix = (uint_kmer << 2) & ((kmer_t(1) << (2 * m_k)) - 1); - backward_neighbours(prefix, res, check_reverse_complement); - return res; -} - -neighbourhood dictionary::contig_neighbours(uint64_t contig_id, - bool check_reverse_complement) const { - assert(contig_id < num_contigs()); - neighbourhood res; - kmer_t suffix = m_buckets.contig_suffix(contig_id, m_k); - forward_neighbours(suffix, res, check_reverse_complement); - kmer_t prefix = m_buckets.contig_prefix(contig_id, m_k) << 2; - backward_neighbours(prefix, res, check_reverse_complement); - return res; -} - -uint64_t dictionary::num_bits() const { - return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) + - sizeof(m_canonical_parsing)) + - m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits() + - m_weights.num_bits(); -} - -} // namespace sshash \ No newline at end of file diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 1c97701..82a68cd 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -8,6 +8,14 @@ namespace sshash { +// Forward declarations of the friend template classes +template +struct streaming_query_canonical_parsing; + +template +struct streaming_query_regular_parsing; + +template struct dictionary { dictionary() : m_size(0), m_seed(0), m_k(0), m_m(0), m_canonical_parsing(0) {} @@ -40,20 +48,22 @@ struct dictionary { uint64_t contig_size(uint64_t contig_id) const; /* Navigational queries. */ - neighbourhood kmer_forward_neighbours(char const* string_kmer, - bool check_reverse_complement = true) const; - neighbourhood kmer_forward_neighbours(kmer_t uint_kmer, - bool check_reverse_complement = true) const; - neighbourhood kmer_backward_neighbours(char const* string_kmer, - bool check_reverse_complement = true) const; - neighbourhood kmer_backward_neighbours(kmer_t uint_kmer, - bool check_reverse_complement = true) const; + neighbourhood kmer_forward_neighbours(char const* string_kmer, + bool check_reverse_complement = true) const; + neighbourhood kmer_forward_neighbours(kmer_t uint_kmer, + bool check_reverse_complement = true) const; + neighbourhood kmer_backward_neighbours(char const* string_kmer, + bool check_reverse_complement = true) const; + neighbourhood kmer_backward_neighbours(kmer_t uint_kmer, + bool check_reverse_complement = true) const; /* forward and backward */ - neighbourhood kmer_neighbours(char const* string_kmer, - bool check_reverse_complement = true) const; - neighbourhood kmer_neighbours(kmer_t uint_kmer, bool check_reverse_complement = true) const; - neighbourhood contig_neighbours(uint64_t contig_id, bool check_reverse_complement = true) const; + neighbourhood kmer_neighbours(char const* string_kmer, + bool check_reverse_complement = true) const; + neighbourhood kmer_neighbours(kmer_t uint_kmer, + bool check_reverse_complement = true) const; + neighbourhood contig_neighbours(uint64_t contig_id, + bool check_reverse_complement = true) const; /* Return the weight of the kmer given its id. */ uint64_t weight(uint64_t kmer_id) const; @@ -66,8 +76,8 @@ struct dictionary { bool is_member_uint(kmer_t uint_kmer, bool check_reverse_complement = true) const; /* Streaming queries. */ - friend struct streaming_query_canonical_parsing; - friend struct streaming_query_regular_parsing; + friend struct streaming_query_canonical_parsing; + friend struct streaming_query_regular_parsing; streaming_query_report streaming_query_from_file(std::string const& filename, bool multiline) const; @@ -84,7 +94,7 @@ struct dictionary { } private: - typename buckets::iterator it; + typename buckets::iterator it; }; iterator begin() const { return iterator(this, 0, size()); } @@ -147,15 +157,24 @@ struct dictionary { uint16_t m_m; uint16_t m_canonical_parsing; minimizers m_minimizers; - buckets m_buckets; - skew_index m_skew_index; + buckets m_buckets; + skew_index m_skew_index; weights m_weights; lookup_result lookup_uint_regular_parsing(kmer_t uint_kmer) const; lookup_result lookup_uint_canonical_parsing(kmer_t uint_kmer) const; - void forward_neighbours(kmer_t suffix, neighbourhood& res, bool check_reverse_complement) const; - void backward_neighbours(kmer_t prefix, neighbourhood& res, + void forward_neighbours(kmer_t suffix, neighbourhood& res, + bool check_reverse_complement) const; + void backward_neighbours(kmer_t prefix, neighbourhood& res, bool check_reverse_complement) const; + kmer_t get_prefix(kmer_t kmer) const; + kmer_t get_suffix(kmer_t kmer) const; }; } // namespace sshash + +#include "builder/build.impl" +#include "dictionary.impl" +#include "dump.impl" +#include "info.impl" +#include "statistics.impl" diff --git a/include/dictionary.impl b/include/dictionary.impl new file mode 100644 index 0000000..72d41a9 --- /dev/null +++ b/include/dictionary.impl @@ -0,0 +1,219 @@ +namespace sshash { + +template +lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { + auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); + uint64_t bucket_id = m_minimizers.lookup(uint64_t(minimizer)); + + if (m_skew_index.empty()) return m_buckets.lookup(bucket_id, uint_kmer, m_k, m_m); + + auto [begin, end] = m_buckets.locate_bucket(bucket_id); + uint64_t num_super_kmers_in_bucket = end - begin; + uint64_t log2_bucket_size = util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_bucket_size > m_skew_index.min_log2) { + uint64_t pos = m_skew_index.lookup(uint_kmer, log2_bucket_size); + /* It must hold pos < num_super_kmers_in_bucket for the kmer to exist. */ + if (pos < num_super_kmers_in_bucket) { + return m_buckets.lookup_in_super_kmer(begin + pos, uint_kmer, m_k, m_m); + } + return lookup_result(); + } + + return m_buckets.lookup(begin, end, uint_kmer, m_k, m_m); +} + +template +lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); + auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); + auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); + uint64_t bucket_id = m_minimizers.lookup(uint64_t(std::min(minimizer, minimizer_rc))); + + if (m_skew_index.empty()) { + return m_buckets.lookup_canonical(bucket_id, uint_kmer, uint_kmer_rc, m_k, m_m); + } + + auto [begin, end] = m_buckets.locate_bucket(bucket_id); + uint64_t num_super_kmers_in_bucket = end - begin; + uint64_t log2_bucket_size = util::ceil_log2_uint32(num_super_kmers_in_bucket); + if (log2_bucket_size > m_skew_index.min_log2) { + uint64_t pos = m_skew_index.lookup(uint_kmer, log2_bucket_size); + if (pos < num_super_kmers_in_bucket) { + auto res = m_buckets.lookup_in_super_kmer(begin + pos, uint_kmer, m_k, m_m); + assert(res.kmer_orientation == constants::forward_orientation); + if (res.kmer_id != constants::invalid_uint64) return res; + } + uint64_t pos_rc = m_skew_index.lookup(uint_kmer_rc, log2_bucket_size); + if (pos_rc < num_super_kmers_in_bucket) { + auto res = m_buckets.lookup_in_super_kmer(begin + pos_rc, uint_kmer_rc, m_k, m_m); + res.kmer_orientation = constants::backward_orientation; + return res; + } + return lookup_result(); + } + + return m_buckets.lookup_canonical(begin, end, uint_kmer, uint_kmer_rc, m_k, m_m); +} + +template +uint64_t dictionary::lookup(char const* string_kmer, bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); + return lookup_uint(uint_kmer, check_reverse_complement); +} +template +uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement) const { + auto res = lookup_advanced_uint(uint_kmer, check_reverse_complement); + return res.kmer_id; +} + +template +lookup_result dictionary::lookup_advanced(char const* string_kmer, + bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); + return lookup_advanced_uint(uint_kmer, check_reverse_complement); +} +template +lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, + bool check_reverse_complement) const { + if (m_canonical_parsing) return lookup_uint_canonical_parsing(uint_kmer); + auto res = lookup_uint_regular_parsing(uint_kmer); + assert(res.kmer_orientation == constants::forward_orientation); + if (check_reverse_complement and res.kmer_id == constants::invalid_uint64) { + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); + res = lookup_uint_regular_parsing(uint_kmer_rc); + res.kmer_orientation = constants::backward_orientation; + } + return res; +} + +template +bool dictionary::is_member(char const* string_kmer, bool check_reverse_complement) const { + return lookup(string_kmer, check_reverse_complement) != constants::invalid_uint64; +} +template +bool dictionary::is_member_uint(kmer_t uint_kmer, bool check_reverse_complement) const { + return lookup_uint(uint_kmer, check_reverse_complement) != constants::invalid_uint64; +} + +template +void dictionary::access(uint64_t kmer_id, char* string_kmer) const { + assert(kmer_id < size()); + m_buckets.access(kmer_id, string_kmer, m_k); +} + +template +uint64_t dictionary::weight(uint64_t kmer_id) const { + assert(kmer_id < size()); + return m_weights.weight(kmer_id); +} + +template +uint64_t dictionary::contig_size(uint64_t contig_id) const { + assert(contig_id < num_contigs()); + auto [begin, end] = m_buckets.contig_offsets(contig_id); + uint64_t contig_length = end - begin; + assert(contig_length >= m_k); + return contig_length - m_k + 1; +} + +template +void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res, + bool check_reverse_complement) const { + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + kmer_t new_kmer = suffix; + new_kmer.kth_char_or(m_k - 1, kmer_t::char_to_uint(kmer_t::alphabet[i])); + res.forward[i] = lookup_advanced_uint(new_kmer, check_reverse_complement); + } +} +template +void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res, + bool check_reverse_complement) const { + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + kmer_t new_kmer = prefix; + new_kmer.kth_char_or(0, kmer_t::char_to_uint(kmer_t::alphabet[i])); + res.backward[i] = lookup_advanced_uint(new_kmer, check_reverse_complement); + } +} + +template +neighbourhood dictionary::kmer_forward_neighbours( + char const* string_kmer, bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); + return kmer_forward_neighbours(uint_kmer, check_reverse_complement); +} + +template +kmer_t dictionary::get_suffix(kmer_t kmer) const { + kmer_t suffix = kmer; + suffix.drop_char(); + return suffix; +} +template +neighbourhood dictionary::kmer_forward_neighbours( + kmer_t uint_kmer, bool check_reverse_complement) const { + neighbourhood res; + forward_neighbours(get_suffix(uint_kmer), res, check_reverse_complement); + return res; +} + +template +neighbourhood dictionary::kmer_backward_neighbours( + char const* string_kmer, bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); + return kmer_backward_neighbours(uint_kmer, check_reverse_complement); +} + +template +kmer_t dictionary::get_prefix(kmer_t kmer) const { + kmer_t prefix = kmer; + prefix.pad_char(); + prefix.take_chars(m_k); + return prefix; +} +template +neighbourhood dictionary::kmer_backward_neighbours( + kmer_t uint_kmer, bool check_reverse_complement) const { + neighbourhood res; + backward_neighbours(get_prefix(uint_kmer), res, check_reverse_complement); + return res; +} + +template +neighbourhood dictionary::kmer_neighbours(char const* string_kmer, + bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); + return kmer_neighbours(uint_kmer, check_reverse_complement); +} +template +neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer, + bool check_reverse_complement) const { + neighbourhood res; + forward_neighbours(get_suffix(uint_kmer), res, check_reverse_complement); + backward_neighbours(get_prefix(uint_kmer), res, check_reverse_complement); + return res; +} + +template +neighbourhood dictionary::contig_neighbours(uint64_t contig_id, + bool check_reverse_complement) const { + assert(contig_id < num_contigs()); + neighbourhood res; + kmer_t suffix = m_buckets.contig_suffix(contig_id, m_k); + forward_neighbours(suffix, res, check_reverse_complement); + kmer_t prefix = m_buckets.contig_prefix(contig_id, m_k); + prefix.pad_char(); + backward_neighbours(prefix, res, check_reverse_complement); + return res; +} + +template +uint64_t dictionary::num_bits() const { + return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) + + sizeof(m_canonical_parsing)) + + m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits() + + m_weights.num_bits(); +} + +} // namespace sshash diff --git a/include/dump.cpp b/include/dump.impl similarity index 80% rename from include/dump.cpp rename to include/dump.impl index 4ead9b7..a2373fa 100644 --- a/include/dump.cpp +++ b/include/dump.impl @@ -1,8 +1,7 @@ -#include "dictionary.hpp" - namespace sshash { -void dictionary::dump(std::string const& filename) const { +template +void dictionary::dump(std::string const& filename) const { uint64_t num_kmers = size(); uint64_t num_minimizers = m_minimizers.size(); uint64_t num_super_kmers = m_buckets.offsets.size(); @@ -24,15 +23,16 @@ void dictionary::dump(std::string const& filename) const { uint64_t offset = m_buckets.offsets.access(super_kmer_id); auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k); (void)_; - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); - uint64_t prev_minimizer = constants::invalid_uint64; + kmer_t prev_minimizer = constants::invalid_uint64; bool super_kmer_header_written = false; for (uint64_t w = 0; w != window_size; ++w) { - kmer_t kmer = bv_it.read_and_advance_by_two(2 * m_k); + kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - kmer_t kmer_rc = util::compute_reverse_complement(kmer, m_k); + kmer_t kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { @@ -53,7 +53,9 @@ void dictionary::dump(std::string const& filename) const { if (minimizer != prev_minimizer) { break; } else { - out << util::uint64_to_char(kmer >> (2 * (m_k - 1))); + kmer_t last_char = kmer; + last_char.drop_chars(m_k - 1); + out << kmer_t::uint64_to_char(last_char.pop_char()); } } prev_minimizer = minimizer; diff --git a/include/hash_util.hpp b/include/hash_util.hpp index 48a568d..9f729ac 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -5,53 +5,46 @@ namespace sshash { +template struct kmers_pthash_hasher_64 { typedef pthash::hash64 hash_type; /* specialization for kmer_t */ static inline pthash::hash64 hash(kmer_t x, uint64_t seed) { - if constexpr (constants::uint_kmer_bits == 64) { - return pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), seed); - } else { - assert(constants::uint_kmer_bits == 128); - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - uint64_t hash = - pthash::MurmurHash2_64(reinterpret_cast(&low), sizeof(low), seed) ^ - pthash::MurmurHash2_64(reinterpret_cast(&high), sizeof(high), ~seed); - return hash; + uint64_t hash = 0; + for (int i = 0; i < kmer_t::uint_kmer_bits; i += 64) { + uint64_t block = x.pop64(); + hash ^= pthash::MurmurHash2_64(reinterpret_cast(&block), sizeof(block), + seed + i); } + return hash; } }; +template struct kmers_pthash_hasher_128 { typedef pthash::hash128 hash_type; /* specialization for kmer_t */ static inline pthash::hash128 hash(kmer_t x, uint64_t seed) { - if constexpr (constants::uint_kmer_bits == 64) { - return {pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), seed), - pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), ~seed)}; - } else { - assert(constants::uint_kmer_bits == 128); - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - return { - pthash::MurmurHash2_64(reinterpret_cast(&low), sizeof(low), seed) ^ - pthash::MurmurHash2_64(reinterpret_cast(&high), sizeof(high), - ~seed), - pthash::MurmurHash2_64(reinterpret_cast(&low), sizeof(low), seed + 1) ^ - pthash::MurmurHash2_64(reinterpret_cast(&high), sizeof(high), - ~(seed + 1))}; + uint64_t hash_first = 0; + uint64_t hash_second = 0; + for (int i = 0; i < kmer_t::uint_kmer_bits; i += 64) { + uint64_t block = x.pop64(); + hash_first ^= pthash::MurmurHash2_64(reinterpret_cast(&block), + sizeof(block), seed + i); + hash_second ^= pthash::MurmurHash2_64(reinterpret_cast(&block), + sizeof(block), ~seed + i); } + return {hash_first, hash_second}; } }; // typedef pthash::murmurhash2_64 minimizers_base_hasher_type; typedef pthash::murmurhash2_128 minimizers_base_hasher_type; -// typedef kmers_pthash_hasher_64 kmers_base_hasher_type; -typedef kmers_pthash_hasher_128 kmers_base_hasher_type; +template +using kmers_base_hasher_type = kmers_pthash_hasher_128; // typedef pthash::single_phf // kmers_pthash_type; -typedef pthash::partitioned_phf - kmers_pthash_type; +template +using kmers_pthash_type = pthash::partitioned_phf, // base hasher + pthash::dictionary_dictionary, // encoder type + true>; // minimal output /* used to hash m-mers and determine the minimizer of a k-mer */ struct murmurhash2_64 { diff --git a/include/info.cpp b/include/info.impl similarity index 79% rename from include/info.cpp rename to include/info.impl index 963bcf3..a62ad30 100644 --- a/include/info.cpp +++ b/include/info.impl @@ -1,29 +1,6 @@ -#include "dictionary.hpp" -#include "skew_index.hpp" - namespace sshash { -uint64_t skew_index::print_info() const { - uint64_t num_partitions = mphfs.size(); - uint64_t lower = 1ULL << min_log2; - uint64_t upper = 2 * lower; - uint64_t num_kmers_in_skew_index = 0; - for (uint64_t partition_id = 0; partition_id != num_partitions; ++partition_id) { - uint64_t n = mphfs[partition_id].num_keys(); - assert(n == positions[partition_id].size()); - std::cout << "num_kmers belonging to buckets of size > " << lower << " and <= " << upper - << ": " << n << "; "; - std::cout << "bits/kmer = " << static_cast(mphfs[partition_id].num_bits()) / n - << " (mphf) + " << (positions[partition_id].bytes() * 8.0) / n - << " (positions)\n"; - num_kmers_in_skew_index += n; - lower = upper; - upper = 2 * lower; - } - return num_kmers_in_skew_index; -} - -double bits_per_kmer_formula(uint64_t k, /* kmer length */ +static double bits_per_kmer_formula(uint64_t k, /* kmer length */ uint64_t m, /* minimizer length */ uint64_t n, /* num. kmers */ uint64_t M) /* num. strings in SPSS */ @@ -58,7 +35,8 @@ double bits_per_kmer_formula(uint64_t k, /* kmer length */ double perc(uint64_t amount, uint64_t total) { return (amount * 100.0) / total; } -void dictionary::print_space_breakdown() const { +template +void dictionary::print_space_breakdown() const { const uint64_t num_bytes = (num_bits() + 7) / 8; std::cout << "total index size: " << num_bytes << " [B] -- " << essentials::convert(num_bytes, essentials::MB) << " [MB]" << '\n'; @@ -90,7 +68,8 @@ void dictionary::print_space_breakdown() const { << " [bits/kmer]" << std::endl; } -void dictionary::print_info() const { +template +void dictionary::print_info() const { std::cout << "=== dictionary info:\n"; std::cout << "num_kmers = " << size() << '\n'; std::cout << "k = " << k() << '\n'; @@ -111,4 +90,4 @@ void dictionary::print_info() const { print_space_breakdown(); } -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/kmer.hpp b/include/kmer.hpp index f22f7b7..69affbe 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -1,11 +1,291 @@ #pragma once +#include "bitpack.hpp" +#include +#include + +template +bool operator<(std::bitset const& a, std::bitset const& b) { + return a.to_string() < b.to_string(); +} + namespace sshash { +template +struct uint_kmer_t { + using uint_t = Kmer; + Kmer kmer; + + uint_kmer_t() {} + uint_kmer_t(uint64_t kmer) : kmer(kmer) {} + + explicit operator uint64_t() const { + if constexpr (std::is_constructible_v) { + return static_cast(kmer); + } else { // std::bitset? + return (kmer & Kmer(uint64_t(-1))).to_ulong(); + } + } + + // TODO: change to <=> when switching to C++20 + bool operator==(uint_kmer_t const& t) const { return kmer == t.kmer; } + bool operator!=(uint_kmer_t const& t) const { return kmer != t.kmer; } + bool operator<(uint_kmer_t const& t) const { return kmer < t.kmer; } + + void pad(uint16_t b) { + assert(b < uint_kmer_bits); + kmer <<= b; + } + void pad_char() { pad(bits_per_char); } + + void drop(uint16_t b) { + assert(b < uint_kmer_bits); + kmer >>= b; + } + void drop64() { + if constexpr (uint_kmer_bits == 64) { + kmer = 0; + } else { + drop(64); + } + } + void drop_char() { drop(bits_per_char); } + void drop_chars(uint16_t k) { drop(k * bits_per_char); } + + void take(uint16_t b) { kmer &= ~(~Kmer(0) << b); } + void take64() { kmer &= Kmer(uint64_t(-1)); } + void take_char() { take(bits_per_char); } + void take_chars(uint16_t k) { take(k * bits_per_char); } + + uint64_t pop64() { + uint64_t res(*this); + drop64(); + return res; + } + uint64_t pop_char() { + uint64_t res(*this); + res &= (uint64_t(1) << bits_per_char) - 1; + drop_char(); + return res; + } + + void append(uint16_t b, uint64_t n) { + assert(b < uint_kmer_bits); + kmer = (kmer << b) | Kmer(n); + } + void append64(uint64_t n) { + if constexpr (uint_kmer_bits == 64) { + kmer = n; + } else { + append(64, n); + } + } + void append_char(uint64_t c) { append(bits_per_char, c); } + + // assigns a character at k-th position + // assuming that the position is empty + void kth_char_or(uint16_t k, uint64_t c) { kmer |= Kmer(c) << (k * bits_per_char); } + + static constexpr uint16_t uint_kmer_bits = 8 * sizeof(Kmer); + static constexpr uint8_t bits_per_char = BitsPerChar; + + static_assert(uint_kmer_bits % 64 == 0, "Kmer must use 64*k bits"); + static_assert(bits_per_char < 64, "BitsPerChar must be less than 64"); + + static constexpr uint16_t max_k = uint_kmer_bits / bits_per_char; + static constexpr uint16_t max_m = 64 / bits_per_char; +}; + +template +struct alpha_kmer_t : uint_kmer_t { + using base = uint_kmer_t; + using base::base; + static constexpr char const* alphabet = Alphabet; + static constexpr uint8_t alphabet_size = std::char_traits::length(Alphabet); + + static bool is_valid(char c); + static uint64_t char_to_uint(char c); + static char uint64_to_char(uint64_t x) { return alphabet[x]; } + + // Revcompl only makes sense for DNA, fallback to noop otherwise + [[maybe_unused]] virtual void reverse_complement_inplace(uint64_t) {} + [[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, + uint64_t size) { + for (uint64_t i = 0; i != size; ++i) { output[i] = input[i]; } + } +}; + +#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING +inline constexpr char nucleotides[] = "ACGT"; +#else +inline constexpr char nucleotides[] = "ACTG"; +#endif + +template +struct dna_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; + using base::uint_kmer_bits; + using base::bits_per_char; + using base::base; + // Use odd lengths for DNA to avoid dealing with self-complements + static constexpr uint16_t max_k = base::max_k - !(base::max_k % 2); + static constexpr uint16_t max_m = base::max_m - !(base::max_m % 2); + /* + This works with the map: + A -> 00; C -> 01; G -> 11; T -> 10. + + Example. + reverse_complement("ACTCACG") = CGTGAGT, in binary: + reverse_complement("00.01.10.01.00.01.11") = 01.11.10.11.00.11.10. + */ + [[maybe_unused]] static uint64_t crc64(uint64_t x) { + /* complement */ +#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING + uint64_t c = ~x; +#else + uint64_t c = x ^ 0xaaaaaaaaaaaaaaaa; // ...1010.1010.1010.1010 +#endif + /* swap byte order */ + uint64_t res = __builtin_bswap64(c); + + /* Swap nuc order in bytes */ + const uint64_t c1 = 0x0f0f0f0f0f0f0f0f; + const uint64_t c2 = 0x3333333333333333; + res = ((res & c1) << 4) | ((res & (c1 << 4)) >> 4); // swap 2-nuc order in bytes + res = ((res & c2) << 2) | ((res & (c2 << 2)) >> 2); // swap nuc order in 2-nuc + return res; + } + + [[maybe_unused]] void reverse_complement_inplace(uint64_t k) override { + assert(k <= max_k); + dna_uint_kmer_t rev(0); + for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { rev.append64(crc64(base::pop64())); } + // res is full reverse-complement to x + rev.drop(uint_kmer_bits - k * bits_per_char); + *this = rev; + } + +#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING + /* + char decimal binary + A 65 01000001 -> 00 + C 67 01000011 -> 01 + G 71 01000111 -> 10 + T 84 01010100 -> 11 + + a 97 01100001 -> 00 + c 99 01100011 -> 01 + g 103 01100111 -> 10 + t 116 01110100 -> 11 + */ + static uint64_t char_to_uint(char c) { return (((c >> 1) ^ (c >> 2)) & 3); } +#else + /* + char decimal binary + A 65 01000.00.1 -> 00 + C 67 01000.01.1 -> 01 + G 71 01000.11.1 -> 11 + T 84 01010.10.0 -> 10 + + a 97 01100.00.1 -> 00 + c 99 01100.01.1 -> 01 + g 103 01100.11.1 -> 11 + t 116 01110.10.0 -> 10 + */ + static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } +#endif + + /* + Forward character map: + A -> A 65 + C -> C 67 + G -> G 71 + T -> T 84 + a -> a 97 + c -> c 99 + g -> g 103 + t -> t 116 + All other chars map to zero. + */ + static constexpr char canonicalize_basepair_forward_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + /* + Reverse character map: + 65 A -> T 84 + 67 C -> G 71 + 71 G -> C 67 + 84 T -> A 65 + 97 a -> t 116 + 99 c -> g 103 + 103 g -> c 99 + 116 t -> a 97 + All other chars map to zero. + */ + static constexpr char canonicalize_basepair_reverse_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + [[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, + uint64_t size) { + for (uint64_t i = 0; i != size; ++i) { + int c = input[i]; + output[size - i - 1] = canonicalize_basepair_reverse_map[c]; + } + } + + static inline bool is_valid(char c) { + return canonicalize_basepair_forward_map[static_cast(c)]; + } +}; + +inline constexpr char amino_acids[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX"; +template +struct aa_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; + using base::uint_kmer_bits; + using base::bits_per_char; + using base::base; + + static constexpr int8_t char_to_aa[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 25, 23, 24, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + + static bool is_valid(char c) { return ~char_to_aa[static_cast(c)]; } + static uint64_t char_to_uint(char c) { return char_to_aa[static_cast(c)]; } +}; +// also supports bitpack<__uint128_t, 1>, std::bitset<256>, etc #ifdef SSHASH_USE_MAX_KMER_LENGTH_63 -typedef __uint128_t kmer_t; +using default_kmer_t = dna_uint_kmer_t<__uint128_t>; #else -typedef uint64_t kmer_t; +using default_kmer_t = dna_uint_kmer_t; #endif -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/minimizer_enumerator.hpp b/include/minimizer_enumerator.hpp index aec73f8..e002a27 100644 --- a/include/minimizer_enumerator.hpp +++ b/include/minimizer_enumerator.hpp @@ -48,7 +48,7 @@ struct fixed_size_deque { std::vector m_buffer; }; -template +template struct minimizer_enumerator { minimizer_enumerator() {} @@ -57,33 +57,29 @@ struct minimizer_enumerator { , m_m(m) , m_seed(seed) , m_position(0) - , m_mask((kmer_t(1) << (2 * m_m)) - 1) , m_q(k - m + 1) /* deque cannot contain more than k - m + 1 elements */ {} - template - uint64_t next(kmer_t kmer, bool clear) { + uint64_t next(kmer_t kmer, bool clear, bool reverse = false) { if (clear) { - if constexpr (reverse) { - for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { - uint64_t mmer = static_cast((kmer >> (2 * (m_k - m_m - i))) & m_mask); - eat(mmer); - } - } else { - for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { - uint64_t mmer = static_cast(kmer & m_mask); - kmer >>= 2; - eat(mmer); + for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { + kmer_t mmer = kmer; + if (reverse) { + mmer.drop_chars(m_k - m_m - i); + } else { + kmer.drop_char(); } + mmer.take_chars(m_m); + eat(uint64_t(mmer)); } } else { - if constexpr (reverse) { - uint64_t mmer = static_cast(kmer & m_mask); - eat(mmer); + kmer_t mmer = kmer; + if (reverse) { + mmer.take_chars(m_m); } else { - uint64_t mmer = static_cast(kmer >> (2 * (m_k - m_m))); - eat(mmer); + mmer.drop_chars(m_k - m_m); } + eat(uint64_t(mmer)); } return m_q.front().value; } @@ -99,7 +95,8 @@ struct minimizer_enumerator { mmer_t() {} mmer_t(uint64_t hash, uint64_t position, uint64_t value) : hash(hash), position(position), value(value) {} - uint64_t hash, position, value; + uint64_t hash, position; + uint64_t value; }; /* NOTE: we could use a std::deque here, diff --git a/include/minimizers.hpp b/include/minimizers.hpp index 452c7e2..d9be4ce 100644 --- a/include/minimizers.hpp +++ b/include/minimizers.hpp @@ -13,7 +13,7 @@ struct minimizers { mphf_config.seed = util::get_seed_for_hash_function(build_config); mphf_config.minimal_output = true; mphf_config.verbose_output = false; - mphf_config.num_threads = std::thread::hardware_concurrency(); + mphf_config.num_threads = build_config.num_threads; mphf_config.num_partitions = 4 * mphf_config.num_threads; if (size / mphf_config.num_partitions < pthash::constants::min_partition_size) { @@ -54,4 +54,4 @@ struct minimizers { minimizers_pthash_type m_mphf; }; -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/query/streaming_query.hpp b/include/query/streaming_query.hpp index b9ee0cb..f7ef41c 100644 --- a/include/query/streaming_query.hpp +++ b/include/query/streaming_query.hpp @@ -9,8 +9,8 @@ namespace sshash { -template -streaming_query_report streaming_query_from_fasta_file_multiline(dictionary const* dict, +template +streaming_query_report streaming_query_from_fasta_file_multiline(dictionary const* dict, std::istream& is) { streaming_query_report report; buffered_lines_iterator it(is); @@ -42,8 +42,9 @@ streaming_query_report streaming_query_from_fasta_file_multiline(dictionary cons return report; } -template -streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is) { +template +streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, + std::istream& is) { streaming_query_report report; std::string line; uint64_t k = dict->k(); @@ -65,8 +66,9 @@ streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, s return report; } -template -streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, std::istream& is) { +template +streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, + std::istream& is) { streaming_query_report report; std::string line; uint64_t k = dict->k(); @@ -92,15 +94,16 @@ streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, s return report; } -template -streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is, - bool multiline) { - if (multiline) return streaming_query_from_fasta_file_multiline(dict, is); - return streaming_query_from_fasta_file(dict, is); +template +streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, + std::istream& is, bool multiline) { + if (multiline) return streaming_query_from_fasta_file_multiline(dict, is); + return streaming_query_from_fasta_file(dict, is); } -streaming_query_report dictionary::streaming_query_from_file(std::string const& filename, - bool multiline) const { +template +streaming_query_report dictionary::streaming_query_from_file(std::string const& filename, + bool multiline) const { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); streaming_query_report report; @@ -109,11 +112,13 @@ streaming_query_report dictionary::streaming_query_from_file(std::string const& zip_istream zis(is); if (canonicalized()) { - report = streaming_query_from_fasta_file(this, zis, - multiline); + report = + streaming_query_from_fasta_file>( + this, zis, multiline); } else { - report = streaming_query_from_fasta_file(this, zis, - multiline); + report = + streaming_query_from_fasta_file>( + this, zis, multiline); } } else if (util::ends_with(filename, ".fq.gz") or util::ends_with(filename, ".fastq.gz")) { @@ -124,18 +129,24 @@ streaming_query_report dictionary::streaming_query_from_file(std::string const& zip_istream zis(is); if (canonicalized()) { - report = streaming_query_from_fastq_file(this, zis); + report = + streaming_query_from_fastq_file>( + this, zis); } else { - report = streaming_query_from_fastq_file(this, zis); + report = + streaming_query_from_fastq_file>( + this, zis); } } else if (util::ends_with(filename, ".fa") or util::ends_with(filename, ".fasta")) { if (canonicalized()) { - report = streaming_query_from_fasta_file(this, is, - multiline); + report = + streaming_query_from_fasta_file>( + this, is, multiline); } else { - report = streaming_query_from_fasta_file(this, is, - multiline); + report = + streaming_query_from_fasta_file>( + this, is, multiline); } } else if (util::ends_with(filename, ".fq") or util::ends_with(filename, ".fastq")) { @@ -144,9 +155,13 @@ streaming_query_report dictionary::streaming_query_from_file(std::string const& << std::endl; } if (canonicalized()) { - report = streaming_query_from_fastq_file(this, is); + report = + streaming_query_from_fastq_file>( + this, is); } else { - report = streaming_query_from_fastq_file(this, is); + report = + streaming_query_from_fastq_file>( + this, is); } } else { diff --git a/include/query/streaming_query_canonical_parsing.hpp b/include/query/streaming_query_canonical_parsing.hpp index 3afeafb..a5756e6 100644 --- a/include/query/streaming_query_canonical_parsing.hpp +++ b/include/query/streaming_query_canonical_parsing.hpp @@ -6,8 +6,9 @@ namespace sshash { +template struct streaming_query_canonical_parsing { - streaming_query_canonical_parsing(dictionary const* dict) + streaming_query_canonical_parsing(dictionary const* dict) : m_dict(dict) @@ -17,7 +18,7 @@ struct streaming_query_canonical_parsing { , m_prev_minimizer(constants::invalid_uint64) , m_kmer(constants::invalid_uint64) - , m_shift(2 * (dict->m_k - 1)) + , m_shift(dict->m_k - 1) , m_k(dict->m_k) , m_m(dict->m_m) , m_seed(dict->m_seed) @@ -45,7 +46,8 @@ struct streaming_query_canonical_parsing { lookup_result lookup_advanced(const char* kmer) { /* 1. validation */ - bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]); + bool is_valid = + m_start ? util::is_valid(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]); if (!is_valid) { start(); return lookup_result(); @@ -53,19 +55,20 @@ struct streaming_query_canonical_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { - m_kmer >>= 2; - m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; - assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); + m_kmer.drop_char(); + m_kmer.kth_char_or(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); + assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { - m_kmer = util::string_to_uint_kmer(kmer, m_k); + m_kmer = util::string_to_uint_kmer(kmer, m_k); } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); - assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); + assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); + m_kmer_rc = m_kmer; + m_kmer_rc.reverse_complement_inplace(m_k); constexpr bool reverse = true; - uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start); - assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); - m_curr_minimizer = std::min(m_curr_minimizer, minimizer_rc); + uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); + assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); + m_curr_minimizer = std::min(m_curr_minimizer, minimizer_rc); /* 3. compute result */ if (m_start) { @@ -101,14 +104,14 @@ struct streaming_query_canonical_parsing { uint64_t num_extensions() const { return m_num_extensions; } private: - dictionary const* m_dict; + dictionary const* m_dict; /* result */ lookup_result m_res; /* (kmer,minimizer) state */ - minimizer_enumerator<> m_minimizer_enum; - minimizer_enumerator<> m_minimizer_enum_rc; + minimizer_enumerator m_minimizer_enum; + minimizer_enumerator m_minimizer_enum_rc; bool m_minimizer_not_found; bool m_start; uint64_t m_curr_minimizer, m_prev_minimizer; @@ -118,7 +121,7 @@ struct streaming_query_canonical_parsing { uint64_t m_shift, m_k, m_m, m_seed; /* string state */ - bit_vector_iterator m_string_iterator; + bit_vector_iterator m_string_iterator; uint64_t m_begin, m_end; uint64_t m_pos_in_window, m_window_size; bool m_reverse; @@ -173,10 +176,11 @@ struct streaming_query_canonical_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - kmer_t val_rc = util::compute_reverse_complement(val, m_k); + kmer_t val_rc = val; + val_rc.reverse_complement_inplace(m_k); uint64_t minimizer = - std::min(util::compute_minimizer(val, m_k, m_m, m_seed), - util::compute_minimizer(val_rc, m_k, m_m, m_seed)); + std::min(util::compute_minimizer(val, m_k, m_m, m_seed), + util::compute_minimizer(val_rc, m_k, m_m, m_seed)); if (minimizer != m_curr_minimizer) { m_minimizer_not_found = true; m_res = lookup_result(); diff --git a/include/query/streaming_query_regular_parsing.hpp b/include/query/streaming_query_regular_parsing.hpp index c3f34f1..ad58989 100644 --- a/include/query/streaming_query_regular_parsing.hpp +++ b/include/query/streaming_query_regular_parsing.hpp @@ -6,8 +6,9 @@ namespace sshash { +template struct streaming_query_regular_parsing { - streaming_query_regular_parsing(dictionary const* dict) + streaming_query_regular_parsing(dictionary const* dict) : m_dict(dict) @@ -25,7 +26,7 @@ struct streaming_query_regular_parsing { , m_kmer(constants::invalid_uint64) - , m_shift(2 * (dict->m_k - 1)) + , m_shift(dict->m_k - 1) , m_k(dict->m_k) , m_m(dict->m_m) , m_seed(dict->m_seed) @@ -49,7 +50,8 @@ struct streaming_query_regular_parsing { lookup_result lookup_advanced(const char* kmer) { /* 1. validation */ - bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]); + bool is_valid = + m_start ? util::is_valid(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]); if (!is_valid) { m_start = true; return lookup_result(); @@ -57,18 +59,19 @@ struct streaming_query_regular_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { - m_kmer >>= 2; - m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; - assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); + m_kmer.drop_char(); + m_kmer.kth_char_or(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); + assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { - m_kmer = util::string_to_uint_kmer(kmer, m_k); + m_kmer = util::string_to_uint_kmer(kmer, m_k); } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); - assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); + assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); + m_kmer_rc = m_kmer; + m_kmer_rc.reverse_complement_inplace(m_k); constexpr bool reverse = true; - m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start); - assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); + m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); + assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); bool both_minimizers_not_found = (same_minimizer() and m_minimizer_not_found) and (same_minimizer_rc() and m_minimizer_rc_not_found); @@ -129,14 +132,14 @@ struct streaming_query_regular_parsing { uint64_t num_extensions() const { return m_num_extensions; } private: - dictionary const* m_dict; + dictionary const* m_dict; /* result */ lookup_result m_res; /* (kmer,minimizer) state */ - minimizer_enumerator<> m_minimizer_enum; - minimizer_enumerator<> m_minimizer_enum_rc; + minimizer_enumerator m_minimizer_enum; + minimizer_enumerator m_minimizer_enum_rc; bool m_minimizer_not_found, m_minimizer_rc_not_found; bool m_start; uint64_t m_curr_minimizer, m_curr_minimizer_rc; @@ -147,7 +150,7 @@ struct streaming_query_regular_parsing { uint64_t m_shift, m_k, m_m, m_seed; /* string state */ - bit_vector_iterator m_string_iterator; + bit_vector_iterator m_string_iterator; uint64_t m_begin, m_end; uint64_t m_pos_in_window, m_window_size; bool m_reverse; @@ -226,7 +229,7 @@ struct streaming_query_regular_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); + auto minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); if (minimizer != m_curr_minimizer) { m_minimizer_not_found = true; m_res = lookup_result(); @@ -268,7 +271,7 @@ struct streaming_query_regular_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); + auto minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); if (minimizer != m_curr_minimizer_rc) { m_minimizer_rc_not_found = true; m_res = lookup_result(); diff --git a/include/skew_index.hpp b/include/skew_index.hpp index f49c33e..05da4ab 100644 --- a/include/skew_index.hpp +++ b/include/skew_index.hpp @@ -4,6 +4,7 @@ namespace sshash { +template struct skew_index { skew_index() : min_log2(constants::min_l) @@ -14,7 +15,25 @@ struct skew_index { } /* Returns the number of k-mers in the index. */ - uint64_t print_info() const; + uint64_t print_info() const { + uint64_t num_partitions = mphfs.size(); + uint64_t lower = 1ULL << min_log2; + uint64_t upper = 2 * lower; + uint64_t num_kmers_in_skew_index = 0; + for (uint64_t partition_id = 0; partition_id != num_partitions; ++partition_id) { + uint64_t n = mphfs[partition_id].num_keys(); + assert(n == positions[partition_id].size()); + std::cout << "num_kmers belonging to buckets of size > " << lower << " and <= " << upper + << ": " << n << "; "; + std::cout << "bits/kmer = " << static_cast(mphfs[partition_id].num_bits()) / n + << " (mphf) + " << (positions[partition_id].bytes() * 8.0) / n + << " (positions)\n"; + num_kmers_in_skew_index += n; + lower = upper; + upper = 2 * lower; + } + return num_kmers_in_skew_index; + } bool empty() const { return mphfs.empty(); } @@ -58,7 +77,7 @@ struct skew_index { uint16_t min_log2; uint16_t max_log2; uint32_t log2_max_num_super_kmers_in_bucket; - std::vector mphfs; + std::vector> mphfs; std::vector positions; private: @@ -72,4 +91,4 @@ struct skew_index { } }; -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/statistics.cpp b/include/statistics.impl similarity index 82% rename from include/statistics.cpp rename to include/statistics.impl index 3b77d67..c9fef8e 100644 --- a/include/statistics.cpp +++ b/include/statistics.impl @@ -1,9 +1,9 @@ -#include "dictionary.hpp" #include "buckets_statistics.hpp" namespace sshash { -void dictionary::compute_statistics() const { +template +void dictionary::compute_statistics() const { uint64_t num_kmers = size(); uint64_t num_minimizers = m_minimizers.size(); uint64_t num_super_kmers = m_buckets.offsets.size(); @@ -20,15 +20,16 @@ void dictionary::compute_statistics() const { uint64_t offset = m_buckets.offsets.access(super_kmer_id); auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k); (void)_; - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); - uint64_t prev_minimizer = constants::invalid_uint64; + kmer_t prev_minimizer = constants::invalid_uint64; uint64_t w = 0; for (; w != window_size; ++w) { - uint64_t kmer = bv_it.read_and_advance_by_two(2 * m_k); + kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - uint64_t kmer_rc = util::compute_reverse_complement(kmer, m_k); + kmer_t kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { diff --git a/include/util.hpp b/include/util.hpp index 19fb0af..3cc1218 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -1,8 +1,9 @@ #pragma once -#include +#include #include #include +#include #include // for std::ceil on linux #include "hash_util.hpp" @@ -32,17 +33,10 @@ struct lookup_result { uint64_t contig_size; }; +template struct neighbourhood { - /* forward */ - lookup_result forward_A; - lookup_result forward_C; - lookup_result forward_G; - lookup_result forward_T; - /* backward */ - lookup_result backward_A; - lookup_result backward_C; - lookup_result backward_G; - lookup_result backward_T; + std::array forward; + std::array backward; }; [[maybe_unused]] static bool equal_lookup_result(lookup_result expected, lookup_result got) { @@ -81,6 +75,7 @@ struct build_configuration { : k(31) , m(17) , seed(constants::seed) + , num_threads(std::thread::hardware_concurrency()) , l(constants::min_l) , c(constants::c) @@ -94,6 +89,7 @@ struct build_configuration { uint64_t k; // kmer size uint64_t m; // minimizer size uint64_t seed; + uint64_t num_threads; uint64_t l; // drive dictionary trade-off double c; // drive PTHash trade-off @@ -132,203 +128,33 @@ static inline uint32_t ceil_log2_uint32(uint32_t x) { return (x > 1) ? msb(x - 1 return std::equal(pattern.begin(), pattern.end(), str.end() - pattern.size()); } -#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING -/* -char decimal binary - A 65 01000001 -> 00 - C 67 01000011 -> 01 - G 71 01000111 -> 10 - T 84 01010100 -> 11 - - a 97 01100001 -> 00 - c 99 01100011 -> 01 - g 103 01100111 -> 10 - t 116 01110100 -> 11 -*/ -static inline kmer_t char_to_uint(char c) { - switch (c) { - case 'A': - return 0; - case 'a': - return 0; - case 'C': - return 1; - case 'c': - return 1; - case 'G': - return 2; - case 'g': - return 2; - case 'T': - return 3; - case 't': - return 3; - } - assert(false); - return -1; -} -#else -/* -char decimal binary - A 65 01000.00.1 -> 00 - C 67 01000.01.1 -> 01 - G 71 01000.11.1 -> 11 - T 84 01010.10.0 -> 10 - - a 97 01100.00.1 -> 00 - c 99 01100.01.1 -> 01 - g 103 01100.11.1 -> 11 - t 116 01110.10.0 -> 10 -*/ -static kmer_t char_to_uint(char c) { return (c >> 1) & 3; } -#endif - -static char uint64_to_char(uint64_t x) { - assert(x <= 3); -#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING - static char nucleotides[4] = {'A', 'C', 'G', 'T'}; -#else - static char nucleotides[4] = {'A', 'C', 'T', 'G'}; -#endif - return nucleotides[x]; -} - +template [[maybe_unused]] static kmer_t string_to_uint_kmer(char const* str, uint64_t k) { - assert(k <= constants::max_k); + assert(k <= kmer_t::max_k); kmer_t x = 0; - for (uint64_t i = 0; i != k; ++i) x += char_to_uint(str[i]) << (2 * i); + for (int i = k - 1; i >= 0; i--) { x.append_char(kmer_t::char_to_uint(str[i])); } return x; } +template static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { - assert(k <= constants::max_k); - for (uint64_t i = 0; i != k; ++i) { - str[i] = uint64_to_char(x & 3); - x >>= 2; - } + assert(k <= kmer_t::max_k); + for (uint64_t i = 0; i != k; ++i) { str[i] = kmer_t::uint64_to_char(x.pop_char()); } } +template [[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) { - assert(k <= constants::max_k); + assert(k <= kmer_t::max_k); std::string str; str.resize(k); uint_kmer_to_string(x, str.data(), k); return str; } -template -[[maybe_unused]] static uint64_t crc(uint64_t x, uint64_t k) { - assert(k <= 32); - - /* complement */ -#ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING - uint64_t c = ~x; -#else - uint64_t c = x ^ 0xaaaaaaaaaaaaaaaa; // ...1010.1010.1010.1010 -#endif - - /* swap byte order */ - uint64_t res = __builtin_bswap64(c); - - /* Swap nuc order in bytes */ - const uint64_t c1 = 0x0f0f0f0f0f0f0f0f; // ...0000.1111.0000.1111 - const uint64_t c2 = 0x3333333333333333; // ...0011.0011.0011.0011 - res = ((res & c1) << 4) | ((res & (c1 << 4)) >> 4); // swap 2-nuc order in bytes - res = ((res & c2) << 2) | ((res & (c2 << 2)) >> 2); // swap nuc order in 2-nuc - - /* Realign to the right */ - if constexpr (align) res >>= 64 - 2 * k; - - return res; -} - -[[maybe_unused]] static kmer_t compute_reverse_complement(kmer_t x, uint64_t k) { - assert(k <= constants::max_k); - if constexpr (constants::uint_kmer_bits == 64) { - return crc(x, k); - } else { - assert(constants::uint_kmer_bits == 128); - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - uint64_t k_low = 32; - uint64_t k_high = k - 32; - uint64_t shift = 128 - 2 * k; - if (k < 32) { - k_low = k; - k_high = 0; - shift = 64; - } - kmer_t low_rc = crc(low, k_low); - kmer_t high_rc = crc(high, k_high); - kmer_t res = (low_rc << 64) + high_rc; - - /* Realign to the right */ - res >>= shift; - - return res; - } -} - -/* - Forward character map: - A -> A 65 - C -> C 67 - G -> G 71 - T -> T 84 - a -> a 97 - c -> c 99 - g -> g 103 - t -> t 116 - All other chars map to zero. -*/ -static const char canonicalize_basepair_forward_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -/* - Reverse character map: - 65 A -> T 84 - 67 C -> G 71 - 71 G -> C 67 - 84 T -> A 65 - 97 a -> t 116 - 99 c -> g 103 - 103 g -> c 99 - 116 t -> a 97 - All other chars map to zero. -*/ -static const char canonicalize_basepair_reverse_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -[[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, - uint64_t size) { - for (uint64_t i = 0; i != size; ++i) { - int c = input[i]; - output[size - i - 1] = canonicalize_basepair_reverse_map[c]; - } -} - -static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c]; } - +template [[maybe_unused]] static bool is_valid(char const* str, uint64_t size) { for (uint64_t i = 0; i != size; ++i) { - int c = str[i]; - if (canonicalize_basepair_forward_map[c] == 0) return false; + if (!kmer_t::is_valid(str[i])) { return false; } } return true; } @@ -336,44 +162,44 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c] /* This implements the random minimizer. */ -template +template uint64_t compute_minimizer(kmer_t kmer, const uint64_t k, const uint64_t m, const uint64_t seed) { - assert(m <= constants::max_m); + assert(m <= kmer_t::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); - uint64_t minimizer = uint64_t(-1); - kmer_t mask = (kmer_t(1) << (2 * m)) - 1; + kmer_t minimizer = kmer_t(-1); for (uint64_t i = 0; i != k - m + 1; ++i) { - uint64_t mmer = static_cast(kmer & mask); - uint64_t hash = Hasher::hash(mmer, seed); + kmer_t mmer = kmer; + mmer.take_chars(m); + uint64_t hash = Hasher::hash(uint64_t(mmer), seed); if (hash < min_hash) { min_hash = hash; minimizer = mmer; } - kmer >>= 2; + kmer.drop_char(); } - return minimizer; + return uint64_t(minimizer); } /* used in dump.cpp */ -template -std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, - uint64_t seed) { - assert(m <= constants::max_m); +template +std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, + uint64_t seed) { + assert(m <= kmer_t::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); - uint64_t minimizer = uint64_t(-1); - kmer_t mask = (kmer_t(1) << (2 * m)) - 1; + kmer_t minimizer = kmer_t(-1); uint64_t pos = 0; for (uint64_t i = 0; i != k - m + 1; ++i) { - uint64_t mmer = static_cast(kmer & mask); - uint64_t hash = Hasher::hash(mmer, seed); + kmer_t mmer = kmer; + mmer.take_chars(m); + uint64_t hash = Hasher::hash(uint64_t(mmer), seed); if (hash < min_hash) { min_hash = hash; minimizer = mmer; pos = i; } - kmer >>= 2; + kmer.drop_char(); } return {minimizer, pos}; } @@ -479,4 +305,4 @@ struct buffered_lines_iterator { uint64_t m_read_chars; }; -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp index fa6dc59..9062b60 100644 --- a/src/bench_utils.hpp +++ b/src/bench_utils.hpp @@ -2,7 +2,8 @@ namespace sshash { -void perf_test_iterator(dictionary const& dict) { +template +void perf_test_iterator(dictionary const& dict) { essentials::timer t; t.start(); auto it = dict.begin(); @@ -16,7 +17,8 @@ void perf_test_iterator(dictionary const& dict) { std::cout << "iterator: avg_nanosec_per_kmer " << avg_nanosec << std::endl; } -void perf_test_lookup_access(dictionary const& dict) { +template +void perf_test_lookup_access(dictionary const& dict) { constexpr uint64_t num_queries = 1000000; constexpr uint64_t runs = 5; essentials::uniform_int_rng distr(0, dict.size() - 1, essentials::get_random_seed()); @@ -33,7 +35,7 @@ void perf_test_lookup_access(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); @@ -80,7 +82,7 @@ void perf_test_lookup_access(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); @@ -138,7 +140,8 @@ void perf_test_lookup_access(dictionary const& dict) { } } -void perf_test_lookup_weight(dictionary const& dict) { +template +void perf_test_lookup_weight(dictionary const& dict) { if (!dict.weighted()) { std::cerr << "ERROR: the dictionary does not store weights" << std::endl; return; @@ -158,7 +161,7 @@ void perf_test_lookup_weight(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); diff --git a/src/build.cpp b/src/build.cpp index 9006558..96386b0 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -1,5 +1,6 @@ using namespace sshash; +template int build(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); @@ -10,8 +11,7 @@ int build(int argc, char** argv) { "\t- one DNA sequence per line.\n" "\tFor example, it could be the de Bruijn graph topology output by BCALM.", "-i", true); - parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k", - true); + parser.add("k", "K-mer length (must be <= " + std::to_string(kmer_t::max_k) + ").", "-k", true); parser.add("m", "Minimizer length (must be < k).", "-m", true); /* Optional arguments. */ @@ -53,7 +53,7 @@ int build(int argc, char** argv) { auto k = parser.get("k"); auto m = parser.get("m"); - dictionary dict; + dictionary dict; build_configuration build_config; build_config.k = k; diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 54fcafb..498bbc4 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -6,7 +6,8 @@ namespace sshash { -bool check_correctness_negative_lookup(dictionary const& dict) { +template +bool check_correctness_negative_lookup(dictionary const& dict) { std::cout << "checking correctness of negative lookup with random kmers..." << std::endl; const uint64_t num_lookups = std::min(1000000, dict.size()); std::string kmer(dict.k(), 0); @@ -25,7 +26,8 @@ bool check_correctness_negative_lookup(dictionary const& dict) { return true; } -bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { +template +bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { const uint64_t k = dict.k(); std::string line; uint64_t pos = 0; @@ -56,9 +58,9 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { ++num_lines; for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); + assert(util::is_valid(line.data() + i, k)); - kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); + kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); bool orientation = constants::forward_orientation; if (num_kmers != 0 and num_kmers % 5000000 == 0) { @@ -67,7 +69,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { /* transform 50% of the kmers into their reverse complements */ if ((num_kmers & 1) == 0) { - uint_kmer = util::compute_reverse_complement(uint_kmer, k); + uint_kmer.reverse_complement_inplace(k); orientation = constants::backward_orientation; } @@ -155,8 +157,9 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { // check access dict.access(id, got_kmer_str.data()); - kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); - kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k); + kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); + kmer_t got_uint_kmer_rc = got_uint_kmer; + got_uint_kmer_rc.reverse_complement_inplace(k); if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" << expected_kmer_str << "'" << std::endl; @@ -177,7 +180,8 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { return check_correctness_negative_lookup(dict); } -bool check_correctness_navigational_kmer_query(std::istream& is, dictionary const& dict) { +template +bool check_correctness_navigational_kmer_query(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; uint64_t pos = 0; @@ -191,69 +195,29 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary cons continue; } for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); + assert(util::is_valid(line.data() + i, k)); if (num_kmers != 0 and num_kmers % 5000000 == 0) { std::cout << "checked " << num_kmers << " kmers" << std::endl; } - - neighbourhood curr = dict.kmer_neighbours(line.data() + i); - - char next_nuc = line[i + k]; - switch (next_nuc) { - case 'A': - if (curr.forward_A.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_A" << std::endl; - } - assert(curr.forward_A.kmer_id != constants::invalid_uint64); - break; - case 'C': - if (curr.forward_C.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_C" << std::endl; - } - assert(curr.forward_C.kmer_id != constants::invalid_uint64); - break; - case 'G': - if (curr.forward_G.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_G" << std::endl; - } - assert(curr.forward_G.kmer_id != constants::invalid_uint64); - break; - case 'T': - if (curr.forward_T.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_T" << std::endl; - } - assert(curr.forward_T.kmer_id != constants::invalid_uint64); - break; + neighbourhood curr = dict.kmer_neighbours(line.data() + i); + if (i + k < line.size()) { + char next_nuc = line[i + k]; + bool next_nuc_not_found = curr.forward[kmer_t::char_to_uint(next_nuc)].kmer_id == + constants::invalid_uint64; + if (next_nuc_not_found) { + std::cout << "expected forward[" << next_nuc << "]" << std::endl; + } + assert(!next_nuc_not_found); } if (i != 0) { char prev_nuc = line[i - 1]; - switch (prev_nuc) { - case 'A': - if (curr.backward_A.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_A" << std::endl; - } - assert(curr.backward_A.kmer_id != constants::invalid_uint64); - break; - case 'C': - if (curr.backward_C.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_C" << std::endl; - } - assert(curr.backward_C.kmer_id != constants::invalid_uint64); - break; - case 'G': - if (curr.backward_G.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_G" << std::endl; - } - assert(curr.backward_G.kmer_id != constants::invalid_uint64); - break; - case 'T': - if (curr.backward_T.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_T" << std::endl; - } - assert(curr.backward_T.kmer_id != constants::invalid_uint64); - break; + bool prev_nuc_not_found = curr.backward[kmer_t::char_to_uint(prev_nuc)].kmer_id == + constants::invalid_uint64; + if (prev_nuc_not_found) { + std::cout << "expected backward[" << prev_nuc << "]" << std::endl; } + assert(!prev_nuc_not_found); } ++num_kmers; @@ -272,7 +236,8 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary cons return true; } -bool check_correctness_navigational_contig_query(dictionary const& dict) { +template +bool check_correctness_navigational_contig_query(dictionary const& dict) { std::cout << "checking correctness of navigational queries for contigs..." << std::endl; uint64_t num_contigs = dict.num_contigs(); uint64_t k = dict.k(); @@ -290,19 +255,16 @@ bool check_correctness_navigational_contig_query(dictionary const& dict) { uint64_t begin_kmer_id = kmer_id; dict.access(begin_kmer_id, kmer.data()); auto backward = dict.kmer_backward_neighbours(kmer.data()); - equal_lookup_result(backward.backward_A, res.backward_A); - equal_lookup_result(backward.backward_C, res.backward_C); - equal_lookup_result(backward.backward_G, res.backward_G); - equal_lookup_result(backward.backward_T, res.backward_T); + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + equal_lookup_result(backward.backward[i], res.backward[i]); + } uint64_t end_kmer_id = kmer_id + contig_size - 1; dict.access(end_kmer_id, kmer.data()); auto forward = dict.kmer_forward_neighbours(kmer.data()); - equal_lookup_result(forward.forward_A, res.forward_A); - equal_lookup_result(forward.forward_C, res.forward_C); - equal_lookup_result(forward.forward_G, res.forward_G); - equal_lookup_result(forward.forward_T, res.forward_T); - + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + equal_lookup_result(forward.forward[i], res.forward[i]); + } kmer_id += contig_size; } std::cout << "checked " << contig_id << " contigs" << std::endl; @@ -310,7 +272,8 @@ bool check_correctness_navigational_contig_query(dictionary const& dict) { return true; } -bool check_correctness_weights(std::istream& is, dictionary const& dict) { +template +bool check_correctness_weights(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; uint64_t kmer_id = 0; @@ -365,7 +328,8 @@ bool check_correctness_weights(std::istream& is, dictionary const& dict) { The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename) { +template +bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; @@ -383,7 +347,8 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string const& The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -bool check_correctness_navigational_kmer_query(dictionary const& dict, +template +bool check_correctness_navigational_kmer_query(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); @@ -401,7 +366,8 @@ bool check_correctness_navigational_kmer_query(dictionary const& dict, /* The input file must be the one the index was built from. */ -bool check_correctness_weights(dictionary const& dict, std::string const& filename) { +template +bool check_correctness_weights(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; @@ -415,7 +381,8 @@ bool check_correctness_weights(dictionary const& dict, std::string const& filena return good; } -bool check_dictionary(dictionary const& dict) { +template +bool check_dictionary(dictionary const& dict) { uint64_t k = dict.k(); uint64_t n = dict.size(); std::cout << "checking correctness of access and positive lookup..." << std::endl; @@ -444,7 +411,8 @@ bool check_dictionary(dictionary const& dict) { return check_correctness_negative_lookup(dict); } -bool check_correctness_kmer_iterator(dictionary const& dict) { +template +bool check_correctness_kmer_iterator(dictionary const& dict) { std::cout << "checking correctness of kmer iterator..." << std::endl; std::string expected_kmer(dict.k(), 0); constexpr uint64_t runs = 3; @@ -470,7 +438,8 @@ bool check_correctness_kmer_iterator(dictionary const& dict) { return true; } -bool check_correctness_contig_iterator(dictionary const& dict) { +template +bool check_correctness_contig_iterator(dictionary const& dict) { std::cout << "checking correctness of contig iterator..." << std::endl; std::string expected_kmer(dict.k(), 0); for (uint64_t contig_id = 0; contig_id != dict.num_contigs(); ++contig_id) { diff --git a/src/common.hpp b/src/common.hpp index b8d7ba3..6ee652d 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -11,7 +11,8 @@ void random_kmer(char* kmer, uint64_t k) { for (uint64_t i = 0; i != k; ++i) kmer[i] = "ACGT"[rand() % 4]; } -void load_dictionary(dictionary& dict, std::string const& index_filename, bool verbose) { +template +void load_dictionary(dictionary& dict, std::string const& index_filename, bool verbose) { const uint64_t num_bytes_read = essentials::load(dict, index_filename.c_str()); if (verbose) { std::cout << "total index size: " << num_bytes_read << " [B] -- " diff --git a/src/permute.cpp b/src/permute.cpp index 3df127a..3cb0d5c 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -3,6 +3,7 @@ using namespace sshash; +template int permute(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); @@ -16,8 +17,7 @@ int permute(int argc, char** argv) { "\tFor example, it could be the de Bruijn graph topology output " "by BCALM.", "-i", true); - parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k", - true); + parser.add("k", "K-mer length (must be <= " + std::to_string(kmer_t::max_k) + ").", "-k", true); /* Optional arguments. */ parser.add("output_filename", "Output file where the permuted collection will be written.", @@ -38,8 +38,8 @@ int permute(int argc, char** argv) { std::cerr << "k must be > 0" << std::endl; return 1; } - if (k > constants::max_k) { - std::cerr << "k must be less <= " + std::to_string(constants::max_k) + + if (k > kmer_t::max_k) { + std::cerr << "k must be less <= " + std::to_string(kmer_t::max_k) + " but got k = " + std::to_string(k) << std::endl; return 1; @@ -94,7 +94,7 @@ int permute(int argc, char** argv) { } /* permute and save to output file */ - permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); std::remove(permutation_filename.c_str()); return 0; diff --git a/src/query.cpp b/src/query.cpp index 67f31b1..9ae3028 100644 --- a/src/query.cpp +++ b/src/query.cpp @@ -3,6 +3,7 @@ using namespace sshash; +template int query(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -22,7 +23,7 @@ int query(int argc, char** argv) { bool verbose = parser.get("verbose"); bool multiline = parser.get("multiline"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); essentials::logger("performing queries from file '" + query_filename + "'..."); diff --git a/src/sshash.cpp b/src/sshash.cpp index f6e8ac3..9767cee 100644 --- a/src/sshash.cpp +++ b/src/sshash.cpp @@ -9,6 +9,7 @@ using namespace sshash; +template int check(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -16,13 +17,14 @@ int check(int argc, char** argv) { if (!parser.parse()) return 1; auto index_filename = parser.get("index_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); check_dictionary(dict); check_correctness_navigational_contig_query(dict); return 0; } +template int bench(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -30,7 +32,7 @@ int bench(int argc, char** argv) { if (!parser.parse()) return 1; auto index_filename = parser.get("index_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); perf_test_lookup_access(dict); if (dict.weighted()) perf_test_lookup_weight(dict); @@ -38,6 +40,7 @@ int bench(int argc, char** argv) { return 0; } +template int dump(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -47,12 +50,13 @@ int dump(int argc, char** argv) { auto index_filename = parser.get("index_filename"); auto output_filename = parser.get("output_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); dict.dump(output_filename); return 0; } +template int compute_statistics(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -60,7 +64,7 @@ int compute_statistics(int argc, char** argv) { if (!parser.parse()) return 1; auto index_filename = parser.get("index_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); dict.compute_statistics(); return 0; @@ -86,19 +90,19 @@ int main(int argc, char** argv) { if (argc < 2) return help(argv[0]); auto tool = std::string(argv[1]); if (tool == "build") { - return build(argc - 1, argv + 1); + return build(argc - 1, argv + 1); } else if (tool == "query") { - return query(argc - 1, argv + 1); + return query(argc - 1, argv + 1); } else if (tool == "check") { - return check(argc - 1, argv + 1); + return check(argc - 1, argv + 1); } else if (tool == "bench") { - return bench(argc - 1, argv + 1); + return bench(argc - 1, argv + 1); } else if (tool == "dump") { - return dump(argc - 1, argv + 1); + return dump(argc - 1, argv + 1); } else if (tool == "permute") { - return permute(argc - 1, argv + 1); + return permute(argc - 1, argv + 1); } else if (tool == "compute-statistics") { - return compute_statistics(argc - 1, argv + 1); + return compute_statistics(argc - 1, argv + 1); } std::cout << "Unsupported tool '" << tool << "'.\n" << std::endl; return help(argv[0]); diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index f3a433f..788df29 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -1,11 +1,11 @@ #include "include/util.hpp" #include "src/common.hpp" // for random_kmer +#include using namespace sshash; std::ostream& operator<<(std::ostream& os, __uint128_t x) { - os << *(reinterpret_cast(&x) + 0); - os << *(reinterpret_cast(&x) + 1); + os << static_cast(x) << static_cast(x >> 64); return os; } @@ -16,14 +16,16 @@ void expect(T got, T expected) { } } +using kmer_t = default_kmer_t; + int main(int argc, char** argv) { if (argc < 2) { std::cout << "Usage " << argv[0] << " k" << std::endl; return 1; } uint64_t k = std::stoull(argv[1]); - if (k > constants::max_k) { - std::cerr << "k must be less <= " << constants::max_k << " but got k = " << k << '\n'; + if (k > kmer_t::max_k) { + std::cerr << "k must be less <= " << kmer_t::max_k << " but got k = " << k << '\n'; return 1; } std::string read( @@ -36,37 +38,38 @@ int main(int argc, char** argv) { "c"); for (uint64_t i = 0; i != read.length() - k + 1; ++i) { - bool is_valid = util::is_valid(read.data() + i, k); + bool is_valid = util::is_valid(read.data() + i, k); if (!is_valid) { std::cerr << "ERROR: '" << std::string(read.data() + i, k) << "' is NOT valid!" << std::endl; } - std::cout << "read: '" << std::string(read.data() + i, k) << "'; "; - kmer_t x = util::string_to_uint_kmer(read.data() + i, k); - std::string kmer = util::uint_kmer_to_string(x, k); - std::cout << "capitalized: '" << kmer << "'" << std::endl; + kmer_t x = util::string_to_uint_kmer(read.data() + i, k); + std::string kmer = util::uint_kmer_to_string(x, k); + std::transform(kmer.begin(), kmer.end(), kmer.begin(), + [](unsigned char c) { return std::tolower(c); }); + expect(std::string(read.data() + i, k), kmer); } /****/ #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING - expect(util::char_to_uint('A'), kmer_t(0)); - expect(util::char_to_uint('a'), kmer_t(0)); - expect(util::char_to_uint('C'), kmer_t(1)); - expect(util::char_to_uint('c'), kmer_t(1)); - expect(util::char_to_uint('G'), kmer_t(2)); - expect(util::char_to_uint('g'), kmer_t(2)); - expect(util::char_to_uint('T'), kmer_t(3)); - expect(util::char_to_uint('t'), kmer_t(3)); + expect(kmer_t::char_to_uint('A'), 0); + expect(kmer_t::char_to_uint('a'), 0); + expect(kmer_t::char_to_uint('C'), 1); + expect(kmer_t::char_to_uint('c'), 1); + expect(kmer_t::char_to_uint('G'), 2); + expect(kmer_t::char_to_uint('g'), 2); + expect(kmer_t::char_to_uint('T'), 3); + expect(kmer_t::char_to_uint('t'), 3); #else - expect(util::char_to_uint('A'), kmer_t(0)); - expect(util::char_to_uint('a'), kmer_t(0)); - expect(util::char_to_uint('C'), kmer_t(1)); - expect(util::char_to_uint('c'), kmer_t(1)); - expect(util::char_to_uint('T'), kmer_t(2)); - expect(util::char_to_uint('t'), kmer_t(2)); - expect(util::char_to_uint('G'), kmer_t(3)); - expect(util::char_to_uint('g'), kmer_t(3)); + expect(kmer_t::char_to_uint('A'), 0); + expect(kmer_t::char_to_uint('a'), 0); + expect(kmer_t::char_to_uint('C'), 1); + expect(kmer_t::char_to_uint('c'), 1); + expect(kmer_t::char_to_uint('T'), 2); + expect(kmer_t::char_to_uint('t'), 2); + expect(kmer_t::char_to_uint('G'), 3); + expect(kmer_t::char_to_uint('g'), 3); #endif for (uint64_t kmer_len = 1; kmer_len <= k; ++kmer_len) { @@ -76,40 +79,40 @@ int main(int argc, char** argv) { for (uint64_t i = 0; i != 1000; ++i) { // generate a random kmer of length kmer_len random_kmer(kmer.data(), kmer_len); - util::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); - kmer_t uint_kmer = util::string_to_uint_kmer(kmer.data(), kmer_len); - uint_kmer = util::compute_reverse_complement(uint_kmer, kmer_len); + kmer_t::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); + kmer_t uint_kmer = util::string_to_uint_kmer(kmer.data(), kmer_len); + uint_kmer.reverse_complement_inplace(kmer_len); expect(util::uint_kmer_to_string(uint_kmer, kmer_len), rc); } } /****/ - expect(util::canonicalize_basepair_forward_map[int('A')], 'A'); - expect(util::canonicalize_basepair_forward_map[int('a')], 'a'); + expect(kmer_t::canonicalize_basepair_forward_map[int('A')], 'A'); + expect(kmer_t::canonicalize_basepair_forward_map[int('a')], 'a'); - expect(util::canonicalize_basepair_forward_map[int('C')], 'C'); - expect(util::canonicalize_basepair_forward_map[int('c')], 'c'); + expect(kmer_t::canonicalize_basepair_forward_map[int('C')], 'C'); + expect(kmer_t::canonicalize_basepair_forward_map[int('c')], 'c'); - expect(util::canonicalize_basepair_forward_map[int('T')], 'T'); - expect(util::canonicalize_basepair_forward_map[int('t')], 't'); + expect(kmer_t::canonicalize_basepair_forward_map[int('T')], 'T'); + expect(kmer_t::canonicalize_basepair_forward_map[int('t')], 't'); - expect(util::canonicalize_basepair_forward_map[int('G')], 'G'); - expect(util::canonicalize_basepair_forward_map[int('g')], 'g'); + expect(kmer_t::canonicalize_basepair_forward_map[int('G')], 'G'); + expect(kmer_t::canonicalize_basepair_forward_map[int('g')], 'g'); /****/ - expect(util::canonicalize_basepair_reverse_map[int('A')], 'T'); - expect(util::canonicalize_basepair_reverse_map[int('a')], 't'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('A')], 'T'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('a')], 't'); - expect(util::canonicalize_basepair_reverse_map[int('C')], 'G'); - expect(util::canonicalize_basepair_reverse_map[int('c')], 'g'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('C')], 'G'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('c')], 'g'); - expect(util::canonicalize_basepair_reverse_map[int('T')], 'A'); - expect(util::canonicalize_basepair_reverse_map[int('t')], 'a'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('T')], 'A'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('t')], 'a'); - expect(util::canonicalize_basepair_reverse_map[int('G')], 'C'); - expect(util::canonicalize_basepair_reverse_map[int('g')], 'c'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('G')], 'C'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('g')], 'c'); return 0; -} \ No newline at end of file +}