From b218b63a6dde86f785f1f871801fbbecc9400523 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 24 May 2022 18:32:05 +0200 Subject: [PATCH] general renaming to match description in papers; added new pre-print info --- README.md | 42 ++-- .../ecoli1_k31_ust.weights.fa.gz} | Bin ...li_sakai.BA000007.3.k31_ust.weights.fa.gz} | Bin ...salmonella_enterica_k31_ust.weights.fa.gz} | Bin include/abundances.hpp | 183 ------------------ include/builder/build.cpp | 73 ++++--- include/builder/cover.hpp | 40 ++-- include/builder/util.hpp | 3 +- include/dictionary.hpp | 14 +- include/info.cpp | 6 +- include/skew_index.hpp | 8 +- include/util.hpp | 6 +- include/weights.hpp | 183 ++++++++++++++++++ src/bench.cpp | 2 +- src/bench_utils.hpp | 11 +- src/build.cpp | 9 +- src/check_utils.hpp | 26 +-- src/permute.cpp | 101 +++++----- 18 files changed, 352 insertions(+), 355 deletions(-) rename data/unitigs_stitched/{with_abundances/ecoli1_k31_ust.abundances.fa.gz => with_weights/ecoli1_k31_ust.weights.fa.gz} (100%) rename data/unitigs_stitched/{with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz => with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz} (100%) rename data/unitigs_stitched/{with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz => with_weights/salmonella_enterica_k31_ust.weights.fa.gz} (100%) delete mode 100644 include/abundances.hpp create mode 100644 include/weights.hpp diff --git a/README.md b/README.md index 2d99194..e66a087 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,12 @@ SSHash This is a compressed dictionary data structure for k-mers (strings of length k over the DNA alphabet {A,C,G,T}), based on **S**parse and **S**kew **Hash**ing. -**A (pre-print) paper describing the data structure can be found [here](https://www.biorxiv.org/content/10.1101/2022.01.15.476199) [1]. Please, cite the paper if you use SSHash.** +The data structure is described in the following papers: + +* [*"Sparse and Skew Hashing of K-Mers"*](https://www.biorxiv.org/content/10.1101/2022.01.15.476199) [1] +* [*"On Weighted K-Mers Dictionaries"*](https://doi.org/10.1101/2022.05.23.493024) [2] + +Please, cite these papers if you use SSHash. For a dictionary of n k-mers, two basic queries are supported: @@ -15,11 +20,11 @@ two basic queries are supported: - i = Lookup(g), where i is in [0,n) if the k-mer g is found in the dictionary or i = -1 otherwise; - g = Access(i), where g is the k-mer associated to the identifier i. -If also the abundances of the k-mers (their frequency counts) are stored in the dictionary, then the dictionary is said to be *weighted* and it also supports: +If also the weights of the k-mers (their frequency counts) are stored in the dictionary, then the dictionary is said to be *weighted* and it also supports: -- c = Abundance(i), where i is a given k-mer identifier. +- w = Weight(i), where i is a given k-mer identifier and w is the weight of the k-mer. -A membership query (determine if a given k-mer is present in the dictionary or not) is, therefore, supported by means of the lookup query. +A membership query (determine if a given k-mer is present in the dictionary or not) is, therefore, supported by means of the Lookup query. The dictionary can also stream through all k-mers of a given DNA file (.fasta or .fastq formats) to determine their membership to the dictionary. @@ -95,7 +100,7 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the to show the usage of the driver program (reported below for convenience). - Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--abundances] [-o output_filename] [--check] [--bench] [--verbose] + Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [--check] [--bench] [--verbose] input_filename Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not: @@ -121,8 +126,8 @@ to show the usage of the driver program (reported below for convenience). [--canonical-parsing] Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time. - [--abundances] - Also store the abundances in compressed format. + [--weighted] + Also store the weights in compressed format. [-o output_filename] Output file name where the data structure will be serialized. @@ -147,7 +152,7 @@ For the examples, we are going to use some collections of *stitched unitigs* from the directory `../data/unitigs_stitched`. These collections were built for k = 31, so dictionaries should be built with k = 31 as well to ensure correctness. -(The subdirectory `../data/unitigs_stitched/with_abundances` contains some files with k-mers' abundances too.) +(The subdirectory `../data/unitigs_stitched/with_weights` contains some files with k-mers' weights too.) In the section [Input Files](#input-files), we explain how such collections of stitched unitigs can be obtained from raw FASTA files. @@ -164,9 +169,9 @@ use: ./bench salmonella_enterica.index -To also store the abundances, use the option `--abundances`: +To also store the weights, use the option `--weighted`: - ./build ../data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz 31 13 --abundances --check --verbose + ./build ../data/unitigs_stitched/with_weights/salmonella_enterica_k31_ust.weights.fa.gz 31 13 --weighted --check --verbose ### Example 2 @@ -221,23 +226,23 @@ even on this tiny example, for only +0.4 bits/k-mer. ### Example 4 - ./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa + ./permute ../data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz 31 -o ecoli_sakai.permuted.fa -This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances. +This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the weights and, hence, optimize the encoding of the weights. The result is saved to the file `ecoli_sakai.permuted.fa`. -In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723. +In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the weights from 5820 to 3723. Then use the `build` command as usual to build the permuted collection: - ./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose + ./build ecoli_sakai.permuted.fa 31 13 --weighted --verbose The index built on the permuted collection -optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances. +optimizes the storage space for the weights which results in a 15.1X better space than the empirical entropy of the weights. For reference, the index built on the original collection: - ./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose + ./build ../data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz 31 13 --weighted --verbose already achieves a 12.4X better space than the empirical entropy. @@ -266,7 +271,7 @@ The script `scripts/download_and_preprocess_datasets.sh` contains all the needed steps to download and pre-process the datasets that we used in [1]. -#### Abundances +#### weights Using the option `-all-abundance-counts` of BCALM2, it is possible to also include the abundance counts of the k-mers in the BCALM2 output. Then, use the option `-a 1` of UST to include such counts in the stitched unitigs. @@ -353,4 +358,5 @@ Giulio Ermanno Pibiri - References ----- -* [1] Giulio Ermanno Pibiri. [*"Sparse and Skew Hashing of K-Mers"*](https://www.biorxiv.org/content/10.1101/2022.01.15.476199). ISMB 2022 (Bioinformatics journal). To Appear. \ No newline at end of file +* [1] Giulio Ermanno Pibiri. [*"Sparse and Skew Hashing of K-Mers"*](https://www.biorxiv.org/content/10.1101/2022.01.15.476199). ISMB (Bioinformatics journal). 2022. To Appear. +* [2] Giulio Ermanno Pibiri. [*"On Weighted K-Mers Dictionaries"*](https://doi.org/10.1101/2022.05.23.493024). bioRxiv. 2022. \ No newline at end of file diff --git a/data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_weights/ecoli1_k31_ust.weights.fa.gz similarity index 100% rename from data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz rename to data/unitigs_stitched/with_weights/ecoli1_k31_ust.weights.fa.gz diff --git a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz similarity index 100% rename from data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz rename to data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz diff --git a/data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_weights/salmonella_enterica_k31_ust.weights.fa.gz similarity index 100% rename from data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz rename to data/unitigs_stitched/with_weights/salmonella_enterica_k31_ust.weights.fa.gz diff --git a/include/abundances.hpp b/include/abundances.hpp deleted file mode 100644 index 4fd4536..0000000 --- a/include/abundances.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#pragma once - -#include -#include // count the distinct abundances with freq information - -#include "ef_sequence.hpp" - -namespace sshash { - -struct abundances { - struct builder { - builder() {} - - void init() { m_abundance_interval_lengths.push_back(0); } - - void eat(uint64_t abundance) { - assert(abundance > 0); - auto it = m_abundances_map.find(abundance); - if (it != m_abundances_map.cend()) { // found - (*it).second += 1; - } else { - m_abundances_map[abundance] = 1; - } - } - - void push_abundance_interval(uint64_t value, uint64_t length) { - m_abundance_interval_values.push_back(value); - m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length); - } - - uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); } - - void finalize(uint64_t num_kmers) { - assert(std::is_sorted(m_abundance_interval_lengths.begin(), - m_abundance_interval_lengths.end())); - - std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl; - - uint64_t num_distinct_abundances = m_abundances_map.size(); - - std::cout << "found " << num_distinct_abundances << " distint abundances (ceil(log2(" - << num_distinct_abundances - << ")) = " << std::ceil(std::log2(num_distinct_abundances)) << ")" - << std::endl; - - m_abundances.reserve(num_distinct_abundances); - uint64_t n = 0; - uint64_t largest_ab = 0; - for (auto p : m_abundances_map) { - if (p.first > largest_ab) largest_ab = p.first; - n += p.second; - m_abundances.push_back(p); - } - assert(largest_ab > 0); - - std::cout << "largest_ab+1 = " << largest_ab + 1 << " (ceil(log2(" << largest_ab + 1 - << ")) = " << std::ceil(std::log2(largest_ab + 1)) << ")" << std::endl; - - if (n != num_kmers) { - std::cout << "ERROR: expected " << num_kmers << " kmers but got " << n << std::endl; - throw std::runtime_error("file is malformed"); - } - - std::sort(m_abundances.begin(), m_abundances.end(), [](auto const& x, auto const& y) { - if (x.second != y.second) return x.second > y.second; - return x.first < y.first; - }); - - uint64_t rest = num_kmers - m_abundances.front().second; - std::cout << "kmers that do not have the most frequent ab: " << rest << " (" - << (rest * 100.0) / num_kmers << "%)" << std::endl; - - m_abundance_dictionary_builder.resize(num_distinct_abundances, - std::ceil(std::log2(largest_ab + 1))); - for (uint64_t id = 0; id != num_distinct_abundances; ++id) { - uint64_t ab_value = m_abundances[id].first; - m_abundance_dictionary_builder.set(id, ab_value); - m_abundances_map[ab_value] = id; - } - } - - void build(abundances& index) { - uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size(); - pthash::compact_vector::builder abundance_interval_values; - abundance_interval_values.resize( - m_abundance_interval_values.size(), - num_distinct_abundances == 1 ? 1 : std::ceil(std::log2(num_distinct_abundances))); - uint64_t prev_abundance = constants::invalid; - for (uint64_t i = 0; i != m_abundance_interval_values.size(); ++i) { - uint64_t abundance = m_abundance_interval_values[i]; - if (i != 0) { - if (abundance == prev_abundance) { - std::cerr << "Error at " << i << "/" << m_abundance_interval_values.size() - << ": cannot have two consecutive intervals with the same " - "abundance value" - << std::endl; - throw std::runtime_error("abundance intervals are malformed"); - } - } - prev_abundance = abundance; - uint64_t id = m_abundances_map[abundance]; - assert(id < num_distinct_abundances); - abundance_interval_values.set(i, id); - } - abundance_interval_values.build(index.m_abundance_interval_values); - index.m_abundance_interval_lengths.encode(m_abundance_interval_lengths.begin(), - m_abundance_interval_lengths.size()); - - m_abundance_dictionary_builder.build(index.m_abundance_dictionary); - } - - /* return the average empirical entropy per abundance */ - double print_info(uint64_t num_kmers) { - assert(!m_abundances.empty()); - double expected_ab_value = 0.0; - double entropy_ab = 0.0; - uint64_t print = 0; - for (auto p : m_abundances) { - double prob = static_cast(p.second) / num_kmers; - expected_ab_value += p.first * prob; - entropy_ab += prob * std::log2(1.0 / prob); - print += 1; - if (print <= 10) { - std::cout << "ab:" << p.first << " freq:" << p.second << " (" - << (p.second * 100.0) / num_kmers << "%)" << std::endl; - } - } - std::cout << "expected_ab_value " << expected_ab_value << std::endl; - std::cout << "entropy_ab " << entropy_ab << " [bits/kmer]" << std::endl; - return entropy_ab; - } - - private: - /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */ - std::unordered_map m_abundances_map; - std::vector> m_abundances; // (abundance,frequency) - - std::vector m_abundance_interval_values; - std::vector m_abundance_interval_lengths; - - pthash::compact_vector::builder m_abundance_dictionary_builder; - }; - - bool empty() const { return m_abundance_dictionary.size() == 0; } - - uint64_t abundance(uint64_t kmer_id) const { - uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id); - uint64_t id = m_abundance_interval_values.access(i); - uint64_t abundance = m_abundance_dictionary.access(id); - return abundance; - } - - uint64_t num_bits() const { - return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() + - m_abundance_dictionary.bytes() * 8; - } - - void print_space_breakdown(uint64_t num_kmers) const { - std::cout << " abundance_interval_values: " - << static_cast(m_abundance_interval_values.bytes() * 8) / num_kmers - << " [bits/kmer]\n"; - std::cout << " abundance_interval_lengths: " - << static_cast(m_abundance_interval_lengths.num_bits()) / num_kmers - << " [bits/kmer]\n"; - std::cout << " abundance_dictionary: " - << static_cast(m_abundance_dictionary.bytes() * 8) / num_kmers - << " [bits/kmer]\n"; - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_abundance_interval_values); - visitor.visit(m_abundance_interval_lengths); - visitor.visit(m_abundance_dictionary); - } - -private: - pthash::compact_vector m_abundance_interval_values; - ef_sequence m_abundance_interval_lengths; - pthash::compact_vector m_abundance_dictionary; -}; - -} // namespace sshash \ No newline at end of file diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 4931b06..bf6d7f3 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -18,7 +18,7 @@ struct parse_data { uint64_t num_kmers; minimizers_tuples minimizers; compact_string_pool strings; - abundances::builder abundances_builder; + weights::builder weights_builder; }; void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { @@ -79,20 +79,20 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b }; uint64_t seq_len = 0; - uint64_t sum_of_abundances = 0; - data.abundances_builder.init(); + uint64_t sum_of_weights = 0; + data.weights_builder.init(); - /* intervals of abundances */ - uint64_t ab_value = constants::invalid; - uint64_t ab_length = 0; + /* intervals of weights */ + uint64_t weight_value = constants::invalid; + uint64_t weight_length = 0; auto parse_header = [&]() { if (sequence.empty()) return; /* Heder format: - >[id] LN:i:[seq_len] ab:Z:[ab_seq] - where [ab_seq] is a space-separated sequence of integer counters (the abundances), + >[id] LN:i:[seq_len] ab:Z:[weight_seq] + where [weight_seq] is a space-separated sequence of integer counters (the weights), whose length is equal to [seq_len]-k+1 */ @@ -123,27 +123,27 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b i += 5; for (uint64_t j = 0; j != seq_len - k + 1; ++j) { - uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10); + uint64_t weight = std::strtoull(sequence.data() + i, nullptr, 10); i = sequence.find_first_of(' ', i) + 1; - data.abundances_builder.eat(ab); - sum_of_abundances += ab; + data.weights_builder.eat(weight); + sum_of_weights += weight; - if (ab == ab_value) { - ab_length += 1; + if (weight == weight_value) { + weight_length += 1; } else { - if (ab_value != constants::invalid) { - data.abundances_builder.push_abundance_interval(ab_value, ab_length); + if (weight_value != constants::invalid) { + data.weights_builder.push_weight_interval(weight_value, weight_length); } - ab_value = ab; - ab_length = 1; + weight_value = weight; + weight_length = 1; } } }; while (!is.eof()) { std::getline(is, sequence); // header sequence - if (build_config.store_abundances) parse_header(); + if (build_config.weighted) parse_header(); std::getline(is, sequence); // DNA sequence if (sequence.size() < k) continue; @@ -159,7 +159,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b prev_minimizer = constants::invalid; num_bases += sequence.size(); - if (build_config.store_abundances and seq_len != sequence.size()) { + if (build_config.weighted and seq_len != sequence.size()) { std::cout << "ERROR: expected a sequence of length " << seq_len << " but got one of length " << sequence.size() << std::endl; throw std::runtime_error("file is malformed"); @@ -204,10 +204,10 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b << std::endl; assert(data.strings.pieces.size() == num_sequences + 1); - if (build_config.store_abundances) { - std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; - data.abundances_builder.push_abundance_interval(ab_value, ab_length); - data.abundances_builder.finalize(data.num_kmers); + if (build_config.weighted) { + std::cout << "sum_of_weights " << sum_of_weights << std::endl; + data.weights_builder.push_weight_interval(weight_value, weight_length); + data.weights_builder.finalize(data.num_kmers); } } @@ -288,12 +288,11 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& uint64_t min_log2_size = m_skew_index.min_log2; uint64_t max_log2_size = m_skew_index.max_log2; - m_skew_index.max_num_strings_in_bucket = buckets_stats.max_num_strings_in_bucket(); + uint64_t max_num_strings_in_bucket = buckets_stats.max_num_strings_in_bucket(); m_skew_index.log2_max_num_strings_in_bucket = std::ceil(std::log2(buckets_stats.max_num_strings_in_bucket())); - std::cout << "max_num_strings_in_bucket " << m_skew_index.max_num_strings_in_bucket - << std::endl; + std::cout << "max_num_strings_in_bucket " << max_num_strings_in_bucket << std::endl; std::cout << "log2_max_num_strings_in_bucket " << m_skew_index.log2_max_num_strings_in_bucket << std::endl; @@ -354,9 +353,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& lower = upper; upper = 2 * lower; - if (partition_id == num_partitions - 1) { - upper = m_skew_index.max_num_strings_in_bucket; - } + if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket; /* If still larger than upper, then there is an empty bucket @@ -447,7 +444,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& upper = 2 * lower; num_bits_per_pos += 1; if (partition_id == num_partitions - 1) { - upper = m_skew_index.max_num_strings_in_bucket; + upper = max_num_strings_in_bucket; num_bits_per_pos = m_skew_index.log2_max_num_strings_in_bucket; } @@ -517,20 +514,20 @@ void dictionary::build(std::string const& filename, build_configuration const& b timer.reset(); /******/ - if (build_config.store_abundances) { - /* step 1.1: compress abundances ***/ + if (build_config.weighted) { + /* step 1.1: compress weights ***/ timer.start(); - data.abundances_builder.build(m_abundances); + data.weights_builder.build(m_weights); timer.stop(); timings.push_back(timer.elapsed()); - print_time(timings.back(), data.num_kmers, "step 1.1.: 'build_abundances'"); + print_time(timings.back(), data.num_kmers, "step 1.1.: 'build_weights'"); timer.reset(); /******/ if (build_config.verbose) { - double entropy_ab = data.abundances_builder.print_info(data.num_kmers); - double avg_bits_per_ab = static_cast(m_abundances.num_bits()) / data.num_kmers; - std::cout << "abundances: " << avg_bits_per_ab << " [bits/kmer]" << std::endl; - std::cout << " (" << entropy_ab / avg_bits_per_ab + double entropy_weights = data.weights_builder.print_info(data.num_kmers); + double avg_bits_per_weight = static_cast(m_weights.num_bits()) / data.num_kmers; + std::cout << "weights: " << avg_bits_per_weight << " [bits/kmer]" << std::endl; + std::cout << " (" << entropy_weights / avg_bits_per_weight << "x smaller than the empirical entropy)" << std::endl; } } diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp index a830304..9373f8f 100644 --- a/include/builder/cover.hpp +++ b/include/builder/cover.hpp @@ -10,12 +10,12 @@ namespace sshash { namespace constants { -constexpr uint64_t most_frequent_abundance = 1; // usual value +constexpr uint64_t most_frequent_weight = 1; // usual value } struct node { // We assume there are less than 2^32 sequences and that - // the largest abundance fits into a 32-bit uint. + // the largest weight fits into a 32-bit uint. node(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {} uint32_t id, front, back; bool sign; // 'true' --> '+' @@ -26,8 +26,8 @@ struct cover { typedef std::deque walk_t; typedef std::vector walks_t; - cover(uint64_t num_sequences, uint64_t num_runs_abundances) - : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {} + cover(uint64_t num_sequences, uint64_t num_runs_weights) + : m_num_sequences(num_sequences), m_num_runs_weights(num_runs_weights) {} void compute(std::vector& nodes) { assert(nodes.size() == m_num_sequences); @@ -35,8 +35,8 @@ struct cover { essentials::timer_type timer; timer.start(); - /* (abundance, position of a candidate abundance with front = abundance) */ - std::unordered_map abundance_map; + /* (weight, position of a candidate weight with front = weight) */ + std::unordered_map weight_map; /* visited[node.id] = true if node has been visited; false otherwise */ std::vector visited; @@ -51,7 +51,7 @@ struct cover { unvisited_nodes.reserve(nodes.size()); // at most m_walks.reserve(nodes.size()); // at most - std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl; + std::cout << "initial number of runs = " << m_num_runs_weights << std::endl; /* add all the nodes with backward orientation */ uint64_t num_nodes = nodes.size(); @@ -80,12 +80,12 @@ struct cover { return x.id < y.id; }); - /* fill abundance_map */ + /* fill weight_map */ { uint64_t prev_front = constants::invalid; uint64_t offset = 0; for (auto const& node : nodes) { - if (node.front != prev_front) abundance_map[node.front] = offset; + if (node.front != prev_front) weight_map[node.front] = offset; offset += 1; prev_front = node.front; } @@ -186,26 +186,26 @@ struct cover { assert(candidate_i <= num_nodes); if (no_match_found or candidate_i == num_nodes) { - abundance_map[back] = candidate_i; + weight_map[back] = candidate_i; return false; } /* valid match was found, then visit it next */ i = candidate_i; - /* update candidate position in abundance_map to point to *next* position */ - abundance_map[back] = candidate_i + 1; + /* update candidate position in weight_map to point to *next* position */ + weight_map[back] = candidate_i + 1; return true; }; /* try to extend to the right */ - uint64_t candidate_i = abundance_map[walk.back().back]; + uint64_t candidate_i = weight_map[walk.back().back]; bool found = try_to_extend(walk.back().back, candidate_i); /* if not possible, try to extend to the left */ if (!found) { - candidate_i = abundance_map[walk.front().front]; + candidate_i = weight_map[walk.front().front]; found = try_to_extend(walk.front().front, candidate_i); } @@ -253,17 +253,17 @@ struct cover { // std::cout << std::endl; #endif } - assert(m_num_runs_abundances > num_matches_in_round); - m_num_runs_abundances -= num_matches_in_round; + assert(m_num_runs_weights > num_matches_in_round); + m_num_runs_weights -= num_matches_in_round; std::cout << " num_matches = " << num_matches_in_round << std::endl; - std::cout << " num_runs " << m_num_runs_abundances << std::endl; + std::cout << " num_runs " << m_num_runs_weights << std::endl; } timer.stop(); std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl; - std::cout << "final number of runs R = " << m_num_runs_abundances << std::endl; + std::cout << "final number of runs R = " << m_num_runs_weights << std::endl; - assert(m_num_runs_abundances >= 1); + assert(m_num_runs_weights >= 1); } void save(std::string const& filename) { @@ -287,7 +287,7 @@ struct cover { private: uint64_t m_num_sequences; - uint64_t m_num_runs_abundances; + uint64_t m_num_runs_weights; walks_t m_walks; void change_orientation(node& n) { diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 725c56e..864cf52 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -8,8 +8,7 @@ struct empty_bucket_runtime_error : public std::runtime_error { }; struct parse_runtime_error : public std::runtime_error { - parse_runtime_error() - : std::runtime_error("did you provide an input file with abundance counts?") {} + parse_runtime_error() : std::runtime_error("did you provide an input file with weights?") {} }; void expect(char got, char expected) { diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 6aa0993..d1b15d4 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -4,7 +4,7 @@ #include "minimizers.hpp" #include "buckets.hpp" #include "skew_index.hpp" -#include "abundances.hpp" +#include "weights.hpp" namespace sshash { @@ -18,7 +18,7 @@ struct dictionary { uint64_t k() const { return m_k; } uint64_t m() const { return m_m; } bool canonicalized() const { return m_canonical_parsing; } - bool weighted() const { return !m_abundances.empty(); } + bool weighted() const { return !m_weights.empty(); } uint64_t lookup(char const* string_kmer, bool check_reverse_complement_too = true) const { uint64_t uint64_kmer = util::string_to_uint64_no_reverse(string_kmer, m_k); @@ -40,9 +40,9 @@ struct dictionary { m_buckets.access(kmer_id, string_kmer, m_k); } - uint64_t abundance(uint64_t kmer_id) const { + uint64_t weight(uint64_t kmer_id) const { assert(kmer_id < size()); - return m_abundances.abundance(kmer_id); + return m_weights.weight(kmer_id); } bool is_member(char const* string_kmer, bool check_reverse_complement_too = true) const { @@ -87,7 +87,7 @@ struct dictionary { return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) + sizeof(m_canonical_parsing)) + m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits() + - m_abundances.num_bits(); + m_weights.num_bits(); } void print_info() const; @@ -103,7 +103,7 @@ struct dictionary { visitor.visit(m_minimizers); visitor.visit(m_buckets); visitor.visit(m_skew_index); - visitor.visit(m_abundances); + visitor.visit(m_weights); } private: @@ -115,7 +115,7 @@ struct dictionary { minimizers m_minimizers; buckets m_buckets; skew_index m_skew_index; - abundances m_abundances; + weights m_weights; uint64_t lookup_uint64_regular_parsing(uint64_t uint64_kmer) const; uint64_t lookup_uint64_canonical_parsing(uint64_t uint64_kmer) const; diff --git a/include/info.cpp b/include/info.cpp index aad1b1c..a00bb7d 100644 --- a/include/info.cpp +++ b/include/info.cpp @@ -18,7 +18,7 @@ uint64_t skew_index::print_info() const { num_kmers_in_skew_index += n; lower = upper; upper = 2 * lower; - if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket; + // if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket; } return num_kmers_in_skew_index; } @@ -40,9 +40,9 @@ void dictionary::print_space_breakdown() const { << " [bits/kmer]\n"; std::cout << " skew_index: " << static_cast(m_skew_index.num_bits()) / size() << " [bits/kmer]\n"; - std::cout << " abundances: " << static_cast(m_abundances.num_bits()) / size() + std::cout << " weights: " << static_cast(m_weights.num_bits()) / size() << " [bits/kmer]\n"; - m_abundances.print_space_breakdown(size()); + m_weights.print_space_breakdown(size()); std::cout << " --------------\n"; std::cout << " total: " << static_cast(num_bits()) / size() << " [bits/kmer]" << std::endl; diff --git a/include/skew_index.hpp b/include/skew_index.hpp index bbb6c64..d8afa0c 100644 --- a/include/skew_index.hpp +++ b/include/skew_index.hpp @@ -8,7 +8,6 @@ struct skew_index { skew_index() : min_log2(constants::min_l) , max_log2(constants::max_l) - , max_num_strings_in_bucket(0) , log2_max_num_strings_in_bucket(0) { mphfs.resize(0); positions.resize(0); @@ -34,9 +33,8 @@ struct skew_index { } uint64_t num_bits() const { - uint64_t n = (sizeof(min_log2) + sizeof(max_log2) + sizeof(max_num_strings_in_bucket) + - sizeof(log2_max_num_strings_in_bucket)) * - 8; + uint64_t n = + (sizeof(min_log2) + sizeof(max_log2) + sizeof(log2_max_num_strings_in_bucket)) * 8; for (uint64_t partition_id = 0; partition_id != mphfs.size(); ++partition_id) { auto const& mphf = mphfs[partition_id]; auto const& P = positions[partition_id]; @@ -49,7 +47,6 @@ struct skew_index { void visit(Visitor& visitor) { visitor.visit(min_log2); visitor.visit(max_log2); - visitor.visit(max_num_strings_in_bucket); visitor.visit(log2_max_num_strings_in_bucket); visitor.visit(mphfs); visitor.visit(positions); @@ -57,7 +54,6 @@ struct skew_index { uint16_t min_log2; uint16_t max_log2; - uint32_t max_num_strings_in_bucket; // useless uint32_t log2_max_num_strings_in_bucket; std::vector mphfs; std::vector positions; diff --git a/include/util.hpp b/include/util.hpp index 6403374..f0fe172 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -66,7 +66,7 @@ struct build_configuration { , c(constants::c) , canonical_parsing(false) - , store_abundances(false) + , weighted(false) , verbose(true) {} uint64_t k; // kmer size @@ -77,14 +77,14 @@ struct build_configuration { double c; // drive PTHash trade-off bool canonical_parsing; - bool store_abundances; + bool weighted; bool verbose; void print() const { std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l << ", c = " << c << ", canonical_parsing = " << (canonical_parsing ? "true" : "false") - << ", store_abundances = " << (store_abundances ? "true" : "false") << std::endl; + << ", weighted = " << (weighted ? "true" : "false") << std::endl; } }; diff --git a/include/weights.hpp b/include/weights.hpp new file mode 100644 index 0000000..68aadde --- /dev/null +++ b/include/weights.hpp @@ -0,0 +1,183 @@ +#pragma once + +#include +#include // count the distinct weights with freq information + +#include "ef_sequence.hpp" + +namespace sshash { + +struct weights { + struct builder { + builder() {} + + void init() { m_weight_interval_lengths.push_back(0); } + + void eat(uint64_t weight) { + assert(weight > 0); + auto it = m_weights_map.find(weight); + if (it != m_weights_map.cend()) { // found + (*it).second += 1; + } else { + m_weights_map[weight] = 1; + } + } + + void push_weight_interval(uint64_t value, uint64_t length) { + m_weight_interval_values.push_back(value); + m_weight_interval_lengths.push_back(m_weight_interval_lengths.back() + length); + } + + uint64_t num_weight_intervals() const { return m_weight_interval_values.size(); } + + void finalize(uint64_t num_kmers) { + assert( + std::is_sorted(m_weight_interval_lengths.begin(), m_weight_interval_lengths.end())); + + std::cout << "num_weight_intervals " << num_weight_intervals() << std::endl; + + uint64_t num_distinct_weights = m_weights_map.size(); + + std::cout << "found " << num_distinct_weights << " distint weights (ceil(log2(" + << num_distinct_weights + << ")) = " << std::ceil(std::log2(num_distinct_weights)) << ")" << std::endl; + + m_weights.reserve(num_distinct_weights); + uint64_t n = 0; + uint64_t largest_weight = 0; + for (auto p : m_weights_map) { + if (p.first > largest_weight) largest_weight = p.first; + n += p.second; + m_weights.push_back(p); + } + assert(largest_weight > 0); + + std::cout << "largest_weight+1 = " << largest_weight + 1 << " (ceil(log2(" + << largest_weight + 1 << ")) = " << std::ceil(std::log2(largest_weight + 1)) + << ")" << std::endl; + + if (n != num_kmers) { + std::cout << "ERROR: expected " << num_kmers << " kmers but got " << n << std::endl; + throw std::runtime_error("file is malformed"); + } + + std::sort(m_weights.begin(), m_weights.end(), [](auto const& x, auto const& y) { + if (x.second != y.second) return x.second > y.second; + return x.first < y.first; + }); + + uint64_t rest = num_kmers - m_weights.front().second; + std::cout << "kmers that do not have the most frequent weight: " << rest << " (" + << (rest * 100.0) / num_kmers << "%)" << std::endl; + + m_weight_dictionary_builder.resize(num_distinct_weights, + std::ceil(std::log2(largest_weight + 1))); + for (uint64_t id = 0; id != num_distinct_weights; ++id) { + uint64_t weight_value = m_weights[id].first; + m_weight_dictionary_builder.set(id, weight_value); + m_weights_map[weight_value] = id; + } + } + + void build(weights& index) { + uint64_t num_distinct_weights = m_weight_dictionary_builder.size(); + pthash::compact_vector::builder weight_interval_values; + weight_interval_values.resize( + m_weight_interval_values.size(), + num_distinct_weights == 1 ? 1 : std::ceil(std::log2(num_distinct_weights))); + uint64_t prev_weight = constants::invalid; + for (uint64_t i = 0; i != m_weight_interval_values.size(); ++i) { + uint64_t weight = m_weight_interval_values[i]; + if (i != 0) { + if (weight == prev_weight) { + std::cerr << "Error at " << i << "/" << m_weight_interval_values.size() + << ": cannot have two consecutive intervals with the same " + "weight value" + << std::endl; + throw std::runtime_error("weight intervals are malformed"); + } + } + prev_weight = weight; + uint64_t id = m_weights_map[weight]; + assert(id < num_distinct_weights); + weight_interval_values.set(i, id); + } + weight_interval_values.build(index.m_weight_interval_values); + index.m_weight_interval_lengths.encode(m_weight_interval_lengths.begin(), + m_weight_interval_lengths.size()); + + m_weight_dictionary_builder.build(index.m_weight_dictionary); + } + + /* return the average empirical entropy per weight */ + double print_info(uint64_t num_kmers) { + assert(!m_weights.empty()); + double expected_weight = 0.0; + double entropy_weights = 0.0; + uint64_t print = 0; + for (auto p : m_weights) { + double prob = static_cast(p.second) / num_kmers; + expected_weight += p.first * prob; + entropy_weights += prob * std::log2(1.0 / prob); + print += 1; + if (print <= 10) { + std::cout << "weight:" << p.first << " freq:" << p.second << " (" + << (p.second * 100.0) / num_kmers << "%)" << std::endl; + } + } + std::cout << "expected_weight " << expected_weight << std::endl; + std::cout << "entropy_weights " << entropy_weights << " [bits/kmer]" << std::endl; + return entropy_weights; + } + + private: + /* (weight,frequency) pairs during construction, then (weight,id) after sorting */ + std::unordered_map m_weights_map; + std::vector> m_weights; // (weight,frequency) + + std::vector m_weight_interval_values; + std::vector m_weight_interval_lengths; + + pthash::compact_vector::builder m_weight_dictionary_builder; + }; + + bool empty() const { return m_weight_dictionary.size() == 0; } + + uint64_t weight(uint64_t kmer_id) const { + uint64_t i = m_weight_interval_lengths.prev_leq(kmer_id); + uint64_t id = m_weight_interval_values.access(i); + uint64_t weight = m_weight_dictionary.access(id); + return weight; + } + + uint64_t num_bits() const { + return m_weight_interval_values.bytes() * 8 + m_weight_interval_lengths.num_bits() + + m_weight_dictionary.bytes() * 8; + } + + void print_space_breakdown(uint64_t num_kmers) const { + std::cout << " weight_interval_values: " + << static_cast(m_weight_interval_values.bytes() * 8) / num_kmers + << " [bits/kmer]\n"; + std::cout << " weight_interval_lengths: " + << static_cast(m_weight_interval_lengths.num_bits()) / num_kmers + << " [bits/kmer]\n"; + std::cout << " weight_dictionary: " + << static_cast(m_weight_dictionary.bytes() * 8) / num_kmers + << " [bits/kmer]\n"; + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_weight_interval_values); + visitor.visit(m_weight_interval_lengths); + visitor.visit(m_weight_dictionary); + } + +private: + pthash::compact_vector m_weight_interval_values; + ef_sequence m_weight_interval_lengths; + pthash::compact_vector m_weight_dictionary; +}; + +} // namespace sshash \ No newline at end of file diff --git a/src/bench.cpp b/src/bench.cpp index 1290a86..88ec2a8 100644 --- a/src/bench.cpp +++ b/src/bench.cpp @@ -22,7 +22,7 @@ int main(int argc, char** argv) { dict.print_info(); perf_test_lookup_access(dict); - if (dict.weighted()) perf_test_lookup_abundance(dict); + if (dict.weighted()) perf_test_lookup_weight(dict); perf_test_iterator(dict); return 0; diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp index 21685d6..04827b7 100644 --- a/src/bench_utils.hpp +++ b/src/bench_utils.hpp @@ -94,9 +94,9 @@ void perf_test_lookup_access(dictionary const& dict) { } } -void perf_test_lookup_abundance(dictionary const& dict) { +void perf_test_lookup_weight(dictionary const& dict) { if (!dict.weighted()) { - std::cerr << "ERROR: the dictionary does not store abundances" << std::endl; + std::cerr << "ERROR: the dictionary does not store weights" << std::endl; return; } @@ -126,14 +126,13 @@ void perf_test_lookup_abundance(dictionary const& dict) { for (uint64_t r = 0; r != runs; ++r) { for (auto const& string : lookup_queries) { auto id = dict.lookup(string.c_str()); - auto ab = dict.abundance(id); - essentials::do_not_optimize_away(ab); + auto w = dict.weight(id); + essentials::do_not_optimize_away(w); } } t.stop(); double nanosec_per_lookup = t.elapsed() / (runs * lookup_queries.size()); - std::cout << "avg_nanosec_per_positive_lookup_with_abundance " << nanosec_per_lookup - << std::endl; + std::cout << "avg_nanosec_per_positive_lookup_with_weight " << nanosec_per_lookup << std::endl; } } // namespace sshash \ No newline at end of file diff --git a/src/build.cpp b/src/build.cpp index 1477c9a..67f258c 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -41,8 +41,7 @@ int main(int argc, char** argv) { "Canonical parsing of k-mers. This option changes the parsing and results in a " "trade-off between index space and lookup time.", "--canonical-parsing", true); - parser.add("store_abundances", "Also store the abundances in compressed format.", - "--abundances", true); + parser.add("weighted", "Also store the weights in compressed format.", "--weighted", true); parser.add("output_filename", "Output file name where the data structure will be serialized.", "-o", false); parser.add("check", "Check correctness after construction.", "--check", true); @@ -65,7 +64,7 @@ int main(int argc, char** argv) { if (parser.parsed("l")) build_config.l = parser.get("l"); if (parser.parsed("c")) build_config.c = parser.get("c"); build_config.canonical_parsing = parser.get("canonical_parsing"); - build_config.store_abundances = parser.get("store_abundances"); + build_config.weighted = parser.get("weighted"); build_config.verbose = parser.get("verbose"); build_config.print(); @@ -75,13 +74,13 @@ int main(int argc, char** argv) { bool check = parser.get("check"); if (check) { check_correctness_lookup_access(dict, input_filename); - if (build_config.store_abundances) check_correctness_abundances(dict, input_filename); + if (build_config.weighted) check_correctness_weights(dict, input_filename); check_correctness_iterator(dict); } bool bench = parser.get("bench"); if (bench) { perf_test_lookup_access(dict); - if (dict.weighted()) perf_test_lookup_abundance(dict); + if (dict.weighted()) perf_test_lookup_weight(dict); perf_test_iterator(dict); } if (parser.parsed("output_filename")) { diff --git a/src/check_utils.hpp b/src/check_utils.hpp index a7fbc85..f8e36bf 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -90,17 +90,17 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { return true; } -bool check_correctness_abundances(std::istream& is, dictionary const& dict) { +bool check_correctness_weights(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; uint64_t kmer_id = 0; if (!dict.weighted()) { - std::cerr << "ERROR: the dictionary does not store abundances" << std::endl; + std::cerr << "ERROR: the dictionary does not store weights" << std::endl; return false; } - std::cout << "checking correctness of abundances..." << std::endl; + std::cout << "checking correctness of weights..." << std::endl; while (!is.eof()) { std::getline(is, line); // header line @@ -121,22 +121,22 @@ bool check_correctness_abundances(std::istream& is, dictionary const& dict) { i += 5; // skip "ab:Z:" for (uint64_t j = 0; j != seq_len - k + 1; ++j, ++kmer_id) { - uint64_t expected_ab = std::strtoull(line.data() + i, &end, 10); + uint64_t expected = std::strtoull(line.data() + i, &end, 10); i = line.find_first_of(' ', i) + 1; - uint64_t got_ab = dict.abundance(kmer_id); - if (expected_ab != got_ab) { - std::cout << "ERROR for kmer_id " << kmer_id << ": expected_ab " << expected_ab - << " but got_ab " << got_ab << std::endl; + uint64_t got = dict.weight(kmer_id); + if (expected != got) { + std::cout << "ERROR for kmer_id " << kmer_id << ": expected " << expected + << " but got " << got << std::endl; } if (kmer_id != 0 and kmer_id % 5000000 == 0) { - std::cout << "checked " << kmer_id << " abundances" << std::endl; + std::cout << "checked " << kmer_id << " weights" << std::endl; } } std::getline(is, line); // skip DNA sequence } - std::cout << "checked " << kmer_id << " abundances" << std::endl; + std::cout << "checked " << kmer_id << " weights" << std::endl; std::cout << "EVERYTHING OK!" << std::endl; return true; } @@ -162,15 +162,15 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string const& /* The input file must be the one the index was built from. */ -bool check_correctness_abundances(dictionary const& dict, std::string const& filename) { +bool check_correctness_weights(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; if (util::ends_with(filename, ".gz")) { zip_istream zis(is); - good = check_correctness_abundances(zis, dict); + good = check_correctness_weights(zis, dict); } else { - good = check_correctness_abundances(is, dict); + good = check_correctness_weights(is, dict); } is.close(); return good; diff --git a/src/permute.cpp b/src/permute.cpp index 9b3064c..0d58fb3 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -9,8 +9,8 @@ using namespace sshash; struct permute_data { - permute_data() : num_runs_abundances(0), num_sequences(0) {} - uint64_t num_runs_abundances; + permute_data() : num_runs_weights(0), num_sequences(0) {} + uint64_t num_runs_weights; uint64_t num_sequences; std::vector nodes; }; @@ -22,15 +22,15 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t num_kmers = 0; uint64_t seq_len = 0; - uint64_t sum_of_abundances = 0; - uint64_t num_sequences_diff_abs = 0; // num sequences whose kmers have different abundances - uint64_t num_sequences_all_mfa = 0; // num sequences whose kmers have same abundance == mfa + uint64_t sum_of_weights = 0; + uint64_t num_sequences_diff_weights = 0; // num sequences whose kmers have different weights + uint64_t num_sequences_all_mfw = 0; // num sequences whose kmers have same weight == mfw - data.num_runs_abundances = 0; + data.num_runs_weights = 0; data.num_sequences = 0; - /* count number of distinct abundances */ - std::unordered_set distinct_abundances; + /* count number of distinct weights */ + std::unordered_set distinct_weights; auto parse_header = [&]() { if (sequence.empty()) return; @@ -38,7 +38,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& /* Heder format: >[id] LN:i:[seq_len] ab:Z:[ab_seq] - where [ab_seq] is a space-separated sequence of integer counters (the abundances), + where [ab_seq] is a space-separated sequence of integer counters (the weights), whose length is equal to [seq_len]-k+1 */ @@ -68,34 +68,34 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& expect(sequence[i + 4], ':'); i += 5; - bool kmers_have_all_mfa = true; - bool kmers_have_different_abundances = false; + bool kmers_have_all_mfw = true; + bool kmers_have_different_weights = false; uint64_t front = constants::invalid; uint64_t back = constants::invalid; - for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) { - uint64_t abundance = std::strtoull(sequence.data() + i, nullptr, 10); - sum_of_abundances += abundance; + for (uint64_t j = 0, prev_weight = constants::invalid; j != seq_len - k + 1; ++j) { + uint64_t weight = std::strtoull(sequence.data() + i, nullptr, 10); + sum_of_weights += weight; i = sequence.find_first_of(' ', i) + 1; - distinct_abundances.insert(abundance); + distinct_weights.insert(weight); /* set front and back */ - if (j == 0) front = abundance; - if (j == seq_len - k) back = abundance; + if (j == 0) front = weight; + if (j == seq_len - k) back = weight; /* accumulate statistics */ - if (abundance != constants::most_frequent_abundance) kmers_have_all_mfa = false; - if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true; + if (weight != constants::most_frequent_weight) kmers_have_all_mfw = false; + if (j > 0 and weight != prev_weight) kmers_have_different_weights = true; /* count the number of runs */ - if (abundance != prev_abundance) data.num_runs_abundances += 1; + if (weight != prev_weight) data.num_runs_weights += 1; - prev_abundance = abundance; + prev_weight = weight; } - num_sequences_diff_abs += kmers_have_different_abundances; - num_sequences_all_mfa += kmers_have_all_mfa; + num_sequences_diff_weights += kmers_have_different_weights; + num_sequences_all_mfw += kmers_have_all_mfw; constexpr bool sign = true; data.nodes.emplace_back(data.num_sequences, front, back, sign); @@ -124,26 +124,27 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& } assert(data.nodes.size() == data.num_sequences); - assert(data.num_runs_abundances >= data.num_sequences); + assert(data.num_runs_weights >= data.num_sequences); std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, " << num_kmers << " kmers" << std::endl; - std::cout << "found " << distinct_abundances.size() << " distinct abundances" << std::endl; - std::cout << "sum_of_abundances " << sum_of_abundances << std::endl; - std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs + std::cout << "found " << distinct_weights.size() << " distinct weights" << std::endl; + std::cout << "sum_of_weights " << sum_of_weights << std::endl; + std::cout << "num_sequences whose kmers have different weights: " << num_sequences_diff_weights << "/" << data.num_sequences << " (" - << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance != mfa: " - << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " (" - << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences + << (num_sequences_diff_weights * 100.0) / data.num_sequences << "%)" << std::endl; + std::cout << "num_sequences whose kmers all have the same weight != mfw: " + << (data.num_sequences - num_sequences_diff_weights) << "/" << data.num_sequences + << " (" + << ((data.num_sequences - num_sequences_diff_weights) * 100.0) / data.num_sequences << "%)" << std::endl; - std::cout << "num_sequences whose kmers all have the same abundance == mfa: " - << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs) + std::cout << "num_sequences whose kmers all have the same weight == mfw: " + << num_sequences_all_mfw << "/" << (data.num_sequences - num_sequences_diff_weights) << " (" - << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs) + << (num_sequences_all_mfw * 100.0) / (data.num_sequences - num_sequences_diff_weights) << "%)" << std::endl; - /* computation of lower bound to the number of final abundance runs */ + /* computation of lower bound to the number of final weight runs */ { struct info { /* how many times an end-point weight appears */ @@ -187,27 +188,27 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const& uint64_t freq = p.second.freq; sum_of_freqs += freq; - /* special case: abundance will appear as singleton, so count twice */ + /* special case: weight will appear as singleton, so count twice */ if (p.second.all_equal) { num_endpoints += 2; continue; } - /* if the excess of frequency is odd, then the abundance will appear as end-point */ + /* if the excess of frequency is odd, then the weight will appear as end-point */ if (freq % 2 == 1) num_endpoints += 1; } assert(sum_of_freqs == 2 * data.num_sequences); assert(num_endpoints % 2 == 0); (void)sum_of_freqs; - uint64_t num_distinct_abundances = distinct_abundances.size(); - uint64_t num_switch_points = data.num_runs_abundances - data.nodes.size(); + uint64_t num_distinct_weights = distinct_weights.size(); + uint64_t num_switch_points = data.num_runs_weights - data.nodes.size(); uint64_t num_walks = num_endpoints / 2; std::cout << "(estimated) num_walks = " << num_walks << std::endl; - std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl; + std::cout << "num_runs_weights = " << data.num_runs_weights << std::endl; std::cout << "num_switch_points = " << num_switch_points << std::endl; - uint64_t R_lo = std::max(num_distinct_abundances, num_switch_points + 1); + uint64_t R_lo = std::max(num_distinct_weights, num_switch_points + 1); uint64_t R_hi = num_switch_points + num_walks; std::cout << "R_lo = " << R_lo << std::endl; std::cout << "R_hi = " << R_hi << std::endl; @@ -278,16 +279,16 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) { i = j + 6; // skip ' ab:Z:' output.append(input.data(), input.data() + i); - std::vector abundances; - abundances.reserve(seq_len - k + 1); + std::vector weights; + weights.reserve(seq_len - k + 1); for (uint64_t j = 0; j != seq_len - k + 1; ++j) { - uint64_t abundance = std::strtoull(input.data() + i, nullptr, 10); - abundances.push_back(abundance); + uint64_t weight = std::strtoull(input.data() + i, nullptr, 10); + weights.push_back(weight); i = input.find_first_of(' ', i) + 1; } - std::reverse(abundances.begin(), abundances.end()); - for (auto abundance : abundances) output.append(std::to_string(abundance) + " "); + std::reverse(weights.begin(), weights.end()); + for (auto weight : weights) output.append(std::to_string(weight) + " "); } void permute_and_write(std::istream& is, std::string const& output_filename, @@ -344,7 +345,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename, if (!signs[i]) { /* compute reverse complement of dna_sequence - and reverse the abundances in header_sequence */ + and reverse the weights in header_sequence */ dna_sequence_rc.resize(dna_sequence.size()); header_sequence_r.clear(); util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), @@ -467,7 +468,7 @@ int main(int argc, char** argv) { "Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n" "\t- without duplicate nor invalid kmers\n" "\t- one DNA sequence per line\n" - "\t- with also kmers' abundances.\n" + "\t- with also kmers' weights.\n" "\tFor example, it could be the de Bruijn graph topology output by BCALM."); parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ")."); @@ -502,7 +503,7 @@ int main(int argc, char** argv) { { /* compute cover */ - cover c(data.num_sequences, data.num_runs_abundances); + cover c(data.num_sequences, data.num_runs_weights); assert(data.nodes.size() == data.num_sequences); c.compute(data.nodes); c.save(permutation_filename);