diff --git a/README.md b/README.md index cc38a16..b6a7092 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,10 @@ two basic queries are supported: - i = Lookup(g), where i is in [0,n) if the k-mer g is found in the dictionary or i = -1 otherwise; - g = Access(i), where g is the k-mer associated to the identifier i. +If also the abundances of the k-mers (their frequency counts) are stored in the dictionary, then the dictionary is said to be *weighted* and it also supports: + +- c = Abundance(i), where i is a given k-mer identifier. + A membership query (determine if a given k-mer is present in the dictionary or not) is, therefore, supported by means of the lookup query. The dictionary can also stream through all k-mers of a given DNA file (.fasta or .fastq formats) to determine their membership to the dictionary. @@ -91,44 +95,47 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the to show the usage of the driver program (reported below for convenience). - Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [-o output_filename] [--check] [--bench] [--verbose] - + Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--abundances] [-o output_filename] [--check] [--bench] [--verbose] + input_filename Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not: - without duplicate nor invalid kmers - one DNA sequence per line. For example, it could be the de Bruijn graph topology output by BCALM. - + k K-mer length (must be <= 31). - + m Minimizer length (must be < k). - + [-s seed] Seed for construction (default is 1). - + [-l l] A (integer) constant that controls the space/time trade-off of the dictionary. A reasonable values lies between 2 and 12 (default is 6). - + [-c c] A (floating point) constant that trades construction speed for space effectiveness of minimal perfect hashing. A reasonable value lies between 3.0 and 10.0 (default is 3.000000). - + [--canonical-parsing] Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time. - + + [--abundances] + Also store the abundances in compressed format. + [-o output_filename] Output file name where the data structure will be serialized. - + [--check] Check correctness after construction. - + [--bench] Run benchmark after construction. - + [--verbose] Verbose output during construction. - + [-h,--help] Print this help text and silently exits. @@ -154,6 +161,10 @@ To run a performance benchmark after construction of the index, use: ./bench salmonella_enterica.index + +To also store the abundances, use the option `--abundances`: + + ./build ../data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz 31 13 --abundances --check --verbose ### Example 2 diff --git a/data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz new file mode 100644 index 0000000..e5e0b03 Binary files /dev/null and b/data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz differ diff --git a/data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz new file mode 100644 index 0000000..1199845 Binary files /dev/null and b/data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz differ diff --git a/include/abundances.hpp b/include/abundances.hpp new file mode 100644 index 0000000..f4a1a58 --- /dev/null +++ b/include/abundances.hpp @@ -0,0 +1,242 @@ +#pragma once + +#include +#include // count the distinct abundances +#include "ef_sequence.hpp" + +namespace sshash { + +struct abundances { + struct builder { + builder() : m_most_frequent_abundance(0) {} + + void init(uint64_t most_frequent_abundance) { + m_most_frequent_abundance = most_frequent_abundance; + m_kmer_id_interval_lengths.push_back(0); + m_abundance_interval_lengths.push_back(0); + } + + void eat(uint64_t abundance) { + assert(abundance > 0); + auto it = m_abundances_map.find(abundance); + if (it != m_abundances_map.cend()) { // found + (*it).second += 1; + } else { + m_abundances_map[abundance] = 1; + } + } + + void push_kmer_id_interval(uint64_t value, uint64_t length) { + m_kmer_id_interval_values.push_back(value); + m_kmer_id_interval_lengths.push_back(m_kmer_id_interval_lengths.back() + length); + } + + void push_abundance_interval(uint64_t value, uint64_t length) { + m_abundance_interval_values.push_back(value); + m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length); + } + + uint64_t num_kmer_id_intervals() const { return m_kmer_id_interval_values.size(); } + uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); } + + void finalize(uint64_t num_kmers) { + assert( + std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end())); + assert(std::is_sorted(m_kmer_id_interval_lengths.begin(), + m_kmer_id_interval_lengths.end())); + assert(std::is_sorted(m_abundance_interval_lengths.begin(), + m_abundance_interval_lengths.end())); + + std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl; + std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl; + + uint64_t num_distinct_abundances = m_abundances_map.size(); + + std::cout << "found " << num_distinct_abundances << " distint abundances (ceil(log2(" + << num_distinct_abundances + << ")) = " << std::ceil(std::log2(num_distinct_abundances)) << ")" + << std::endl; + + m_abundances.reserve(num_distinct_abundances); + uint64_t n = 0; + uint64_t largest_ab = 0; + for (auto p : m_abundances_map) { + if (p.first > largest_ab) largest_ab = p.first; + n += p.second; + m_abundances.push_back(p); + } + assert(largest_ab > 0); + + std::cout << "largest_ab+1 = " << largest_ab + 1 << " (ceil(log2(" << largest_ab + 1 + << ")) = " << std::ceil(std::log2(largest_ab + 1)) << ")" << std::endl; + + if (n != num_kmers) { + std::cout << "ERROR: expected " << num_kmers << " kmers but got " << n << std::endl; + throw std::runtime_error("file is malformed"); + } + + std::sort(m_abundances.begin(), m_abundances.end(), [](auto const& x, auto const& y) { + if (x.second != y.second) return x.second > y.second; + return x.first < y.first; + }); + + /* If this tests fails, then we need to change the value of + constants::most_frequent_abundance */ + if (m_most_frequent_abundance != m_abundances.front().first) { + throw std::runtime_error("the most frequent abundance is not " + + std::to_string(constants::most_frequent_abundance)); + } + + uint64_t rest = num_kmers - m_abundances.front().second; + std::cout << "kmers that do not have the most frequent ab: " << rest << " (" + << (rest * 100.0) / num_kmers << "%)" << std::endl; + std::cout << "cumulative_kmer_id_interval_lengths " << m_kmer_id_interval_lengths.back() + << '/' << rest << std::endl; + std::cout << "cumulative_abundance_interval_lengths " + << m_abundance_interval_lengths.back() << '/' << rest << std::endl; + + m_abundance_dictionary_builder.resize(num_distinct_abundances, + std::ceil(std::log2(largest_ab + 1))); + for (uint64_t id = 0; id != num_distinct_abundances; ++id) { + uint64_t ab_value = m_abundances[id].first; + m_abundance_dictionary_builder.set(id, ab_value); + m_abundances_map[ab_value] = id; + } + } + + void build(abundances& index) { + std::swap(index.m_most_frequent_abundance, m_most_frequent_abundance); + + index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(), + m_kmer_id_interval_values.size()); + index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(), + m_kmer_id_interval_lengths.size()); + + uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size(); + pthash::compact_vector::builder abundance_interval_values; + abundance_interval_values.resize( + m_abundance_interval_values.size(), + num_distinct_abundances == 1 ? 1 : std::ceil(std::log2(num_distinct_abundances))); + for (uint64_t i = 0; i != m_abundance_interval_values.size(); ++i) { + uint64_t abundance = m_abundance_interval_values[i]; + uint64_t id = m_abundances_map[abundance]; + assert(id < num_distinct_abundances); + abundance_interval_values.set(i, id); + } + abundance_interval_values.build(index.m_abundance_interval_values); + index.m_abundance_interval_lengths.encode(m_abundance_interval_lengths.begin(), + m_abundance_interval_lengths.size()); + + m_abundance_dictionary_builder.build(index.m_abundance_dictionary); + } + + // return the average empirical entropy per abundance + double print_info(uint64_t num_kmers) { + assert(!m_abundances.empty()); + double expected_ab_value = 0.0; + double entropy_ab = 0.0; + uint64_t print = 0; + for (auto p : m_abundances) { + double prob = static_cast(p.second) / num_kmers; + expected_ab_value += p.first * prob; + entropy_ab += prob * std::log2(1.0 / prob); + print += 1; + if (print <= 10) { + std::cout << "ab:" << p.first << " freq:" << p.second << " (" + << (p.second * 100.0) / num_kmers << "%)" << std::endl; + } + } + std::cout << "expected_ab_value " << expected_ab_value << std::endl; + std::cout << "entropy_ab " << entropy_ab << " [bits/kmer]" << std::endl; + return entropy_ab; + } + + private: + uint64_t m_most_frequent_abundance; + + /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */ + std::unordered_map m_abundances_map; + std::vector> m_abundances; // (abundance,frequency) + + std::vector m_kmer_id_interval_values; + std::vector m_kmer_id_interval_lengths; + + std::vector m_abundance_interval_values; + std::vector m_abundance_interval_lengths; + + pthash::compact_vector::builder m_abundance_dictionary_builder; + }; + + bool empty() const { return m_abundance_dictionary.size() == 0; } + + uint64_t abundance(uint64_t kmer_id) const { + bool is_present = false; + uint64_t rank = 0; + + auto [pos, val] = m_kmer_id_interval_values.next_geq(kmer_id); + if (val == kmer_id) { + is_present = true; + rank = m_kmer_id_interval_lengths.access(pos); + } else { + if (pos > 0) { + --pos; + val = m_kmer_id_interval_values.access(pos); + rank = m_kmer_id_interval_lengths.access(pos); + uint64_t length = m_kmer_id_interval_lengths.access(pos + 1) - rank; + if (kmer_id < val + length) { + is_present = true; + assert(kmer_id >= val); + rank += kmer_id - val; + } + } + } + + if (!is_present) return m_most_frequent_abundance; + + uint64_t i = m_abundance_interval_lengths.prev_leq(rank); + uint64_t id = m_abundance_interval_values.access(i); + uint64_t abundance = m_abundance_dictionary.access(id); + + return abundance; + } + + uint64_t num_bits() const { + return sizeof(m_most_frequent_abundance) * 8 + m_kmer_id_interval_values.num_bits() + + m_kmer_id_interval_lengths.num_bits() + m_abundance_interval_values.bytes() * 8 + + m_abundance_interval_lengths.num_bits() + m_abundance_dictionary.bytes() * 8; + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_most_frequent_abundance); + visitor.visit(m_kmer_id_interval_values); + visitor.visit(m_kmer_id_interval_lengths); + visitor.visit(m_abundance_interval_values); + visitor.visit(m_abundance_interval_lengths); + visitor.visit(m_abundance_dictionary); + } + +private: + uint64_t m_most_frequent_abundance; + + /***** + We model abundances as two lists of intervals. + Each interval is a pair (value,length). + - First list: kmer_ids list. In this case, a pair (value,length) + represents all kmer_ids = value, value+1, value+2, ..., value+length-1. + - Second list: abundance list. In this case, a pair (value,length) + represents that the abundance [value] repeats for [length] times. + */ + + ef_sequence m_kmer_id_interval_values; + ef_sequence m_kmer_id_interval_lengths; + + pthash::compact_vector m_abundance_interval_values; + ef_sequence m_abundance_interval_lengths; + /***/ + + /* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */ + pthash::compact_vector m_abundance_dictionary; +}; + +} // namespace sshash \ No newline at end of file diff --git a/include/builder/build.cpp b/include/builder/build.cpp index fcff19a..fc95df6 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -3,7 +3,6 @@ #include "../util.hpp" #include "../dictionary.hpp" -#include "../buckets.hpp" #include "../util.hpp" #include "build_util_types.hpp" @@ -16,11 +15,19 @@ namespace sshash { +void expect(char got, char expected) { + if (got != expected) { + std::cout << "got '" << got << "' but expected '" << expected << "'" << std::endl; + throw parse_runtime_error(); + } +} + struct parse_data { parse_data() : num_kmers(0) {} uint64_t num_kmers; minimizers_tuples minimizers; compact_string_pool strings; + abundances::builder abundances_builder; }; void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { @@ -81,10 +88,98 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b return true; }; + uint64_t seq_len = 0; + data.abundances_builder.init(constants::most_frequent_abundance); + + /* intervals of kmer_ids */ + uint64_t kmer_id_value = constants::invalid; + uint64_t kmer_id_length = 1; + + /* intervals of abundances */ + uint64_t ab_value = constants::invalid; + uint64_t ab_length = 1; + + auto parse_header = [&]() { + if (line.empty()) return; + + /* + Heder format: + >[seq_id] LN:i:[seq_len] ab:Z:[ab_seq] + where [ab_seq] is a space-separated sequence of integer counters (the abundances), + whose length is equal to [seq_len]-k+1 + */ + + // example header: '>12 LN:i:41 ab:Z:2 2 2 2 2 2 2 2 2 2 2' + + expect(line[0], '>'); + uint64_t i = 0; + i = line.find_first_of(' ', i); + if (i == std::string::npos) throw parse_runtime_error(); + + i += 1; + expect(line[i + 0], 'L'); + expect(line[i + 1], 'N'); + expect(line[i + 2], ':'); + expect(line[i + 3], 'i'); + expect(line[i + 4], ':'); + i += 5; + uint64_t j = line.find_first_of(' ', i); + if (j == std::string::npos) throw parse_runtime_error(); + + char* end; + seq_len = std::strtoull(line.data() + i, &end, 10); + i = j + 1; + expect(line[i + 0], 'a'); + expect(line[i + 1], 'b'); + expect(line[i + 2], ':'); + expect(line[i + 3], 'Z'); + expect(line[i + 4], ':'); + i += 5; + + kmer_id_value = constants::invalid; + kmer_id_length = 1; + for (uint64_t j = 0, num_kmers = data.num_kmers; j != seq_len - k + 1; ++j, ++num_kmers) { + uint64_t ab = std::strtoull(line.data() + i, &end, 10); + i = line.find_first_of(' ', i) + 1; + + data.abundances_builder.eat(ab); + + if (ab != constants::most_frequent_abundance) { + if (kmer_id_value == constants::invalid) { + kmer_id_value = num_kmers; + kmer_id_length = 1; + } else { + if (num_kmers == kmer_id_value + kmer_id_length) kmer_id_length += 1; + } + + if (ab == ab_value) { + ab_length += 1; + } else { + if (ab_value != constants::invalid) { + data.abundances_builder.push_abundance_interval(ab_value, ab_length); + } + ab_value = ab; + ab_length = 1; + } + + } else { + if (kmer_id_value != constants::invalid) { + data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length); + } + kmer_id_value = constants::invalid; + } + } + + if (kmer_id_value != constants::invalid) { + data.abundances_builder.push_kmer_id_interval(kmer_id_value, kmer_id_length); + } + }; + while (!is.eof()) { - std::getline(is, line); // skip header line - std::getline(is, line); + std::getline(is, line); // header line + if (build_config.store_abundances) parse_header(); + std::getline(is, line); // DNA sequence if (line.size() < k) continue; if (++num_read_lines % 100000 == 0) { @@ -98,6 +193,12 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b prev_minimizer = constants::invalid; num_read_bases += line.size(); + if (build_config.store_abundances and seq_len != line.size()) { + std::cout << "ERROR: expected a sequence of length " << seq_len + << " but got one of length " << line.size() << std::endl; + throw std::runtime_error("file is malformed"); + } + while (end != line.size() - k + 1) { char const* kmer = line.data() + end; assert(util::is_valid(kmer, k)); @@ -129,6 +230,11 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b builder.finalize(); builder.build(data.strings); + if (build_config.store_abundances) { + data.abundances_builder.push_abundance_interval(ab_value, ab_length); + data.abundances_builder.finalize(data.num_kmers); + } + std::cout << "read " << num_read_lines << " lines, " << num_read_bases << " bases, " << data.num_kmers << " kmers" << std::endl; std::cout << "num_kmers " << data.num_kmers << std::endl; @@ -209,11 +315,6 @@ buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, return buckets_stats; } -struct empty_bucket_runtime_error : public std::runtime_error { - empty_bucket_runtime_error() - : std::runtime_error("try a different choice of l or change seed") {} -}; - void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets, build_configuration const& build_config, buckets_statistics const& buckets_stats) { @@ -449,6 +550,24 @@ void dictionary::build(std::string const& filename, build_configuration const& b timer.reset(); /******/ + if (build_config.store_abundances) { + /* step 1.1: compress abundances ***/ + timer.start(); + data.abundances_builder.build(m_abundances); + timer.stop(); + timings.push_back(timer.elapsed()); + print_time(timings.back(), data.num_kmers, "step 1.1.: 'build_abundances'"); + timer.reset(); + /******/ + if (build_config.verbose) { + double entropy_ab = data.abundances_builder.print_info(data.num_kmers); + double avg_bits_per_ab = static_cast(m_abundances.num_bits()) / data.num_kmers; + std::cout << "abundances: " << avg_bits_per_ab << " [bits/kmer]" << std::endl; + std::cout << " (" << entropy_ab / avg_bits_per_ab + << "x smaller than the empirical entropy)" << std::endl; + } + } + /* step 2: sort minimizers and build MPHF ***/ timer.start(); data.minimizers.sort(); diff --git a/include/builder/build_util_types.hpp b/include/builder/build_util_types.hpp index d73732b..03370dc 100644 --- a/include/builder/build_util_types.hpp +++ b/include/builder/build_util_types.hpp @@ -2,6 +2,16 @@ namespace sshash { +struct empty_bucket_runtime_error : public std::runtime_error { + empty_bucket_runtime_error() + : std::runtime_error("try a different choice of l or change seed") {} +}; + +struct parse_runtime_error : public std::runtime_error { + parse_runtime_error() + : std::runtime_error("did you provide an input file with abundance counts?") {} +}; + struct compact_string_pool { compact_string_pool() {} diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 4c1743b..41b478d 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -4,6 +4,7 @@ #include "minimizers.hpp" #include "buckets.hpp" #include "skew_index.hpp" +#include "abundances.hpp" namespace sshash { @@ -17,6 +18,7 @@ struct dictionary { uint64_t k() const { return m_k; } uint64_t m() const { return m_m; } bool canonicalized() const { return m_canonical_parsing; } + bool weighted() const { return !m_abundances.empty(); } uint64_t lookup_uint64(uint64_t uint64_kmer) const { uint64_t minimizer = util::compute_minimizer(uint64_kmer, m_k, m_m, m_seed); @@ -87,6 +89,11 @@ struct dictionary { m_buckets.access(kmer_id, string_kmer, m_k); } + uint64_t abundance(uint64_t kmer_id) const { + assert(kmer_id < size()); + return m_abundances.abundance(kmer_id); + } + bool is_member(char const* string_kmer, bool check_reverse_complement_too = true) const { return lookup(string_kmer, check_reverse_complement_too) != constants::invalid; } @@ -128,7 +135,8 @@ struct dictionary { uint64_t num_bits() const { return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) + sizeof(m_canonical_parsing)) + - m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits(); + m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits() + + m_abundances.num_bits(); } void print_info() const; @@ -144,6 +152,7 @@ struct dictionary { visitor.visit(m_minimizers); visitor.visit(m_buckets); visitor.visit(m_skew_index); + visitor.visit(m_abundances); } private: @@ -155,6 +164,7 @@ struct dictionary { minimizers m_minimizers; buckets m_buckets; skew_index m_skew_index; + abundances m_abundances; }; } // namespace sshash \ No newline at end of file diff --git a/include/ef_sequence.hpp b/include/ef_sequence.hpp index 86c71e0..6b27a2c 100644 --- a/include/ef_sequence.hpp +++ b/include/ef_sequence.hpp @@ -150,9 +150,7 @@ struct ef_sequence { } inline bool contains(uint64_t x) const { return next_geq(x).second == x; } - inline uint64_t back() const { return m_universe; } - inline uint64_t size() const { return m_low_bits.size(); } uint64_t num_bits() const { diff --git a/include/print_info.cpp b/include/print_info.cpp index 8208f0a..44769fd 100644 --- a/include/print_info.cpp +++ b/include/print_info.cpp @@ -41,6 +41,8 @@ void dictionary::print_space_breakdown() const { << " [bits/kmer]\n"; std::cout << " skew_index: " << static_cast(m_skew_index.num_bits()) / size() << " [bits/kmer]\n"; + std::cout << " abundances: " << static_cast(m_abundances.num_bits()) / size() + << " [bits/kmer]\n"; std::cout << " --------------\n"; std::cout << " total: " << static_cast(num_bits()) / size() << " [bits/kmer]" << std::endl; @@ -52,7 +54,8 @@ void dictionary::print_info() const { std::cout << "k = " << k() << '\n'; std::cout << "num_minimizers = " << m_minimizers.size() << std::endl; std::cout << "m = " << m() << '\n'; - std::cout << "canonical_parsing = " << (canonicalized() ? "true" : "false") << '\n'; + std::cout << "canonicalized = " << (canonicalized() ? "true" : "false") << '\n'; + std::cout << "weighted = " << (weighted() ? "true" : "false") << '\n'; std::cout << "num_strings = " << m_buckets.offsets.size() << '\n'; std::cout << "num_pieces = " << m_buckets.pieces.size() << " (+" diff --git a/include/util.hpp b/include/util.hpp index 088fbb2..c33172c 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -17,6 +17,7 @@ constexpr uint64_t hashcode_bits = 64; constexpr double c = 3.0; // for PThash constexpr uint64_t min_l = 6; constexpr uint64_t max_l = 12; +constexpr uint64_t most_frequent_abundance = 1; // usual value } // namespace constants typedef pthash::murmurhash2_64 base_hasher_type; @@ -65,6 +66,7 @@ struct build_configuration { , c(constants::c) , canonical_parsing(false) + , store_abundances(false) , verbose(true) {} uint64_t k; // kmer size @@ -75,13 +77,14 @@ struct build_configuration { double c; // drive PTHash trade-off bool canonical_parsing; + bool store_abundances; bool verbose; void print() const { std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l << ", c = " << c << ", canonical_parsing = " << (canonical_parsing ? "true" : "false") - << std::endl; + << ", store_abundances = " << (store_abundances ? "true" : "false") << std::endl; } }; diff --git a/src/build.cpp b/src/build.cpp index b8acc64..d7b7be5 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -39,6 +39,8 @@ int main(int argc, char** argv) { "Canonical parsing of k-mers. This option changes the parsing and results in a " "trade-off between index space and lookup time.", "--canonical-parsing", true); + parser.add("store_abundances", "Also store the abundances in compressed format.", + "--abundances", true); parser.add("output_filename", "Output file name where the data structure will be serialized.", "-o", false); parser.add("check", "Check correctness after construction.", "--check", true); @@ -61,6 +63,7 @@ int main(int argc, char** argv) { if (parser.parsed("l")) build_config.l = parser.get("l"); if (parser.parsed("c")) build_config.c = parser.get("c"); build_config.canonical_parsing = parser.get("canonical_parsing"); + build_config.store_abundances = parser.get("store_abundances"); build_config.verbose = parser.get("verbose"); build_config.print(); @@ -70,6 +73,7 @@ int main(int argc, char** argv) { bool check = parser.get("check"); if (check) { check_correctness_lookup_access(dict, input_filename); + if (build_config.store_abundances) check_correctness_abundances(dict, input_filename); check_correctness_iterator(dict); } bool bench = parser.get("bench"); diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 222007a..e99f688 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -16,8 +16,6 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { std::string expected_kmer_str(k, 0); std::cout << "checking correctness of access and positive lookup..." << std::endl; - pthash::bit_vector_builder got(n, 0); - __uint128_t sum = 0; while (appendline(is, line)) { if (line.size() == pos || line[pos] == '>' || line[pos] == ';') { @@ -38,22 +36,17 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { } util::uint64_to_string_no_reverse(uint64_kmer, expected_kmer_str.data(), k); uint64_t id = dict.lookup(expected_kmer_str.c_str()); + + /* + Since we assume that we stream through the file from which the index was built, + ids are assigned sequentially to kmers, so it must be id == num_kmers. + */ + if (id != num_kmers) std::cout << "wrong id assigned" << std::endl; + if (id == constants::invalid) { std::cout << "kmer '" << expected_kmer_str << "' not found!" << std::endl; } assert(id != constants::invalid); - if (id >= n) { - std::cout << "ERROR: id out of range " << id << "/" << n << std::endl; - return false; - } - - if (got.get(id) == true) { - std::cout << "id " << id << " was already assigned!" << std::endl; - return false; - } - - got.set(id); - sum += id; // check access dict.access(id, got_kmer_str.data()); @@ -76,16 +69,6 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { } std::cout << "checked " << num_kmers << " kmers" << std::endl; - if (n != num_kmers) { - std::cout << "expected " << n << " kmers but checked " << num_kmers << std::endl; - return false; - } - - if (sum != (n * (n - 1)) / 2) { - std::cout << "ERROR: index contains duplicates" << std::endl; - return false; - } - std::cout << "EVERYTHING OK!" << std::endl; std::cout << "checking correctness of negative lookup with random kmers..." << std::endl; @@ -106,6 +89,57 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { return true; } +bool check_correctness_abundances(std::istream& is, dictionary const& dict) { + uint64_t k = dict.k(); + std::string line; + uint64_t kmer_id = 0; + + if (!dict.weighted()) { + std::cerr << "ERROR: the dictionary does not store any abundance" << std::endl; + return false; + } + + std::cout << "checking correctness of abundances..." << std::endl; + + while (!is.eof()) { + std::getline(is, line); // header line + if (line.empty()) break; + + uint64_t i = 0; + i = line.find_first_of(' ', i); + assert(i != std::string::npos); + + i += 1; + i += 5; // skip "LN:i:" + uint64_t j = line.find_first_of(' ', i); + assert(j != std::string::npos); + + char* end; + uint64_t seq_len = std::strtoull(line.data() + i, &end, 10); + i = j + 1; + i += 5; // skip "ab:Z:" + + for (uint64_t j = 0; j != seq_len - k + 1; ++j, ++kmer_id) { + uint64_t expected_ab = std::strtoull(line.data() + i, &end, 10); + i = line.find_first_of(' ', i) + 1; + uint64_t got_ab = dict.abundance(kmer_id); + if (expected_ab != got_ab) { + std::cout << "ERROR for kmer_id " << kmer_id << ": expected_ab " << expected_ab + << " but got_ab " << got_ab << std::endl; + } + if (kmer_id != 0 and kmer_id % 5000000 == 0) { + std::cout << "checked " << kmer_id << " abundances" << std::endl; + } + } + + std::getline(is, line); // skip DNA sequence + } + + std::cout << "checked " << kmer_id << " abundances" << std::endl; + std::cout << "EVERYTHING OK!" << std::endl; + return true; +} + /* The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. @@ -124,6 +158,23 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string const& return good; } +/* + The input file must be the one the index was built from. +*/ +bool check_correctness_abundances(dictionary const& dict, std::string const& filename) { + std::ifstream is(filename.c_str()); + if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); + bool good = true; + if (util::ends_with(filename, ".gz")) { + zip_istream zis(is); + good = check_correctness_abundances(zis, dict); + } else { + good = check_correctness_abundances(is, dict); + } + is.close(); + return good; +} + bool check_dictionary(dictionary const& dict) { uint64_t k = dict.k(); uint64_t n = dict.size();