From b218b63a6dde86f785f1f871801fbbecc9400523 Mon Sep 17 00:00:00 2001
From: jermp <jeis90@gmail.com>
Date: Tue, 24 May 2022 18:32:05 +0200
Subject: [PATCH] general renaming to match description in papers; added new
 pre-print info

---
 README.md                                     |  42 ++--
 .../ecoli1_k31_ust.weights.fa.gz}             | Bin
 ...li_sakai.BA000007.3.k31_ust.weights.fa.gz} | Bin
 ...salmonella_enterica_k31_ust.weights.fa.gz} | Bin
 include/abundances.hpp                        | 183 ------------------
 include/builder/build.cpp                     |  73 ++++---
 include/builder/cover.hpp                     |  40 ++--
 include/builder/util.hpp                      |   3 +-
 include/dictionary.hpp                        |  14 +-
 include/info.cpp                              |   6 +-
 include/skew_index.hpp                        |   8 +-
 include/util.hpp                              |   6 +-
 include/weights.hpp                           | 183 ++++++++++++++++++
 src/bench.cpp                                 |   2 +-
 src/bench_utils.hpp                           |  11 +-
 src/build.cpp                                 |   9 +-
 src/check_utils.hpp                           |  26 +--
 src/permute.cpp                               | 101 +++++-----
 18 files changed, 352 insertions(+), 355 deletions(-)
 rename data/unitigs_stitched/{with_abundances/ecoli1_k31_ust.abundances.fa.gz => with_weights/ecoli1_k31_ust.weights.fa.gz} (100%)
 rename data/unitigs_stitched/{with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz => with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz} (100%)
 rename data/unitigs_stitched/{with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz => with_weights/salmonella_enterica_k31_ust.weights.fa.gz} (100%)
 delete mode 100644 include/abundances.hpp
 create mode 100644 include/weights.hpp

diff --git a/README.md b/README.md
index 2d99194..e66a087 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,12 @@ SSHash
 This is a compressed dictionary data structure for k-mers
 (strings of length k over the DNA alphabet {A,C,G,T}), based on **S**parse and **S**kew **Hash**ing.
 
-**A (pre-print) paper describing the data structure can be found [here](https://www.biorxiv.org/content/10.1101/2022.01.15.476199) [1]. Please, cite the paper if you use SSHash.**
+The data structure is described in the following papers:
+
+* [*"Sparse and Skew Hashing of K-Mers"*](https://www.biorxiv.org/content/10.1101/2022.01.15.476199) [1]
+* [*"On Weighted K-Mers Dictionaries"*](https://doi.org/10.1101/2022.05.23.493024) [2]
+
+Please, cite these papers if you use SSHash.
 
 For a dictionary of n k-mers,
 two basic queries are supported:
@@ -15,11 +20,11 @@ two basic queries are supported:
 - i = Lookup(g), where i is in [0,n) if the k-mer g is found in the dictionary or i = -1 otherwise;
 - g = Access(i), where g is the k-mer associated to the identifier i.
 
-If also the abundances of the k-mers (their frequency counts) are stored in the dictionary, then the dictionary is said to be *weighted* and it also supports:
+If also the weights of the k-mers (their frequency counts) are stored in the dictionary, then the dictionary is said to be *weighted* and it also supports:
 
-- c = Abundance(i), where i is a given k-mer identifier.
+- w = Weight(i), where i is a given k-mer identifier and w is the weight of the k-mer.
 
-A membership query (determine if a given k-mer is present in the dictionary or not) is, therefore, supported by means of the lookup query.
+A membership query (determine if a given k-mer is present in the dictionary or not) is, therefore, supported by means of the Lookup query.
 The dictionary can also stream through all k-mers of a given DNA file
 (.fasta or .fastq formats) to determine their membership to the dictionary.
 
@@ -95,7 +100,7 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the
 
 to show the usage of the driver program (reported below for convenience).
 
-	Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--abundances] [-o output_filename] [--check] [--bench] [--verbose]
+	Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--weighted] [-o output_filename] [--check] [--bench] [--verbose]
 
 	 input_filename
 		Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:
@@ -121,8 +126,8 @@ to show the usage of the driver program (reported below for convenience).
 	 [--canonical-parsing]
 		Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time.
 
-	 [--abundances]
-		Also store the abundances in compressed format.
+	 [--weighted]
+		Also store the weights in compressed format.
 
 	 [-o output_filename]
 		Output file name where the data structure will be serialized.
@@ -147,7 +152,7 @@ For the examples, we are going to use some collections
 of *stitched unitigs* from the directory `../data/unitigs_stitched`.
 These collections were built for k = 31, so dictionaries should be built with k = 31 as well to ensure correctness.
 
-(The subdirectory `../data/unitigs_stitched/with_abundances` contains some files with k-mers' abundances too.)
+(The subdirectory `../data/unitigs_stitched/with_weights` contains some files with k-mers' weights too.)
 
 In the section [Input Files](#input-files), we explain how
 such collections of stitched unitigs can be obtained from raw FASTA files.
@@ -164,9 +169,9 @@ use:
 
 	./bench salmonella_enterica.index
 
-To also store the abundances, use the option `--abundances`:
+To also store the weights, use the option `--weighted`:
 
-	./build ../data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz 31 13 --abundances --check --verbose
+	./build ../data/unitigs_stitched/with_weights/salmonella_enterica_k31_ust.weights.fa.gz 31 13 --weighted --check --verbose
 	
 ### Example 2
 
@@ -221,23 +226,23 @@ even on this tiny example, for only +0.4 bits/k-mer.
 
 ### Example 4
 
-	./permute ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 -o ecoli_sakai.permuted.fa
+	./permute ../data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz 31 -o ecoli_sakai.permuted.fa
 
-This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the abundances and, hence, optimize the encoding of the abundances.
+This command re-orders (and possibly reverse-complement) the strings in the collection as to *minimize* the number of runs in the weights and, hence, optimize the encoding of the weights.
 The result is saved to the file `ecoli_sakai.permuted.fa`.
 
-In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the abundances from 5820 to 3723.
+In this example for the E.Coli collection (Sakai strain) we reduce the number of runs in the weights from 5820 to 3723.
 
 Then use the `build` command as usual to build the permuted collection:
 
-	./build ecoli_sakai.permuted.fa 31 13 --abundances --verbose
+	./build ecoli_sakai.permuted.fa 31 13 --weighted --verbose
 	
 The index built on the permuted collection
-optimizes the storage space for the abundances which results in a 15.1X better space than the empirical entropy of the abundances.
+optimizes the storage space for the weights which results in a 15.1X better space than the empirical entropy of the weights.
 
 For reference, the index built on the original collection:
 
-	./build ../data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz 31 13 --abundances --verbose
+	./build ../data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz 31 13 --weighted --verbose
 
 already achieves a 12.4X better space than the empirical entropy.
 
@@ -266,7 +271,7 @@ The script `scripts/download_and_preprocess_datasets.sh`
 contains all the needed steps to download and pre-process
 the datasets that we used in [1].
 
-#### Abundances
+#### weights
 Using the option `-all-abundance-counts` of BCALM2, it is possible to also include the abundance counts of the k-mers in the BCALM2 output. Then, use the option `-a 1` of UST to include such counts in the stitched unitigs.
 
 
@@ -353,4 +358,5 @@ Giulio Ermanno Pibiri - <giulio.ermanno.pibiri@isti.cnr.it>
 
 References
 -----
-* [1] Giulio Ermanno Pibiri. [*"Sparse and Skew Hashing of K-Mers"*](https://www.biorxiv.org/content/10.1101/2022.01.15.476199). ISMB 2022 (Bioinformatics journal). To Appear.
\ No newline at end of file
+* [1] Giulio Ermanno Pibiri. [*"Sparse and Skew Hashing of K-Mers"*](https://www.biorxiv.org/content/10.1101/2022.01.15.476199). ISMB (Bioinformatics journal). 2022. To Appear.
+* [2] Giulio Ermanno Pibiri. [*"On Weighted K-Mers Dictionaries"*](https://doi.org/10.1101/2022.05.23.493024). bioRxiv. 2022.
\ No newline at end of file
diff --git a/data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_weights/ecoli1_k31_ust.weights.fa.gz
similarity index 100%
rename from data/unitigs_stitched/with_abundances/ecoli1_k31_ust.abundances.fa.gz
rename to data/unitigs_stitched/with_weights/ecoli1_k31_ust.weights.fa.gz
diff --git a/data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz
similarity index 100%
rename from data/unitigs_stitched/with_abundances/ecoli_sakai.BA000007.3.k31_ust.abundances.fa.gz
rename to data/unitigs_stitched/with_weights/ecoli_sakai.BA000007.3.k31_ust.weights.fa.gz
diff --git a/data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz b/data/unitigs_stitched/with_weights/salmonella_enterica_k31_ust.weights.fa.gz
similarity index 100%
rename from data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz
rename to data/unitigs_stitched/with_weights/salmonella_enterica_k31_ust.weights.fa.gz
diff --git a/include/abundances.hpp b/include/abundances.hpp
deleted file mode 100644
index 4fd4536..0000000
--- a/include/abundances.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-#pragma once
-
-#include <vector>
-#include <unordered_map>  // count the distinct abundances with freq information
-
-#include "ef_sequence.hpp"
-
-namespace sshash {
-
-struct abundances {
-    struct builder {
-        builder() {}
-
-        void init() { m_abundance_interval_lengths.push_back(0); }
-
-        void eat(uint64_t abundance) {
-            assert(abundance > 0);
-            auto it = m_abundances_map.find(abundance);
-            if (it != m_abundances_map.cend()) {  // found
-                (*it).second += 1;
-            } else {
-                m_abundances_map[abundance] = 1;
-            }
-        }
-
-        void push_abundance_interval(uint64_t value, uint64_t length) {
-            m_abundance_interval_values.push_back(value);
-            m_abundance_interval_lengths.push_back(m_abundance_interval_lengths.back() + length);
-        }
-
-        uint64_t num_abundance_intervals() const { return m_abundance_interval_values.size(); }
-
-        void finalize(uint64_t num_kmers) {
-            assert(std::is_sorted(m_abundance_interval_lengths.begin(),
-                                  m_abundance_interval_lengths.end()));
-
-            std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl;
-
-            uint64_t num_distinct_abundances = m_abundances_map.size();
-
-            std::cout << "found " << num_distinct_abundances << " distint abundances (ceil(log2("
-                      << num_distinct_abundances
-                      << ")) = " << std::ceil(std::log2(num_distinct_abundances)) << ")"
-                      << std::endl;
-
-            m_abundances.reserve(num_distinct_abundances);
-            uint64_t n = 0;
-            uint64_t largest_ab = 0;
-            for (auto p : m_abundances_map) {
-                if (p.first > largest_ab) largest_ab = p.first;
-                n += p.second;
-                m_abundances.push_back(p);
-            }
-            assert(largest_ab > 0);
-
-            std::cout << "largest_ab+1 = " << largest_ab + 1 << " (ceil(log2(" << largest_ab + 1
-                      << ")) = " << std::ceil(std::log2(largest_ab + 1)) << ")" << std::endl;
-
-            if (n != num_kmers) {
-                std::cout << "ERROR: expected " << num_kmers << " kmers but got " << n << std::endl;
-                throw std::runtime_error("file is malformed");
-            }
-
-            std::sort(m_abundances.begin(), m_abundances.end(), [](auto const& x, auto const& y) {
-                if (x.second != y.second) return x.second > y.second;
-                return x.first < y.first;
-            });
-
-            uint64_t rest = num_kmers - m_abundances.front().second;
-            std::cout << "kmers that do not have the most frequent ab: " << rest << " ("
-                      << (rest * 100.0) / num_kmers << "%)" << std::endl;
-
-            m_abundance_dictionary_builder.resize(num_distinct_abundances,
-                                                  std::ceil(std::log2(largest_ab + 1)));
-            for (uint64_t id = 0; id != num_distinct_abundances; ++id) {
-                uint64_t ab_value = m_abundances[id].first;
-                m_abundance_dictionary_builder.set(id, ab_value);
-                m_abundances_map[ab_value] = id;
-            }
-        }
-
-        void build(abundances& index) {
-            uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size();
-            pthash::compact_vector::builder abundance_interval_values;
-            abundance_interval_values.resize(
-                m_abundance_interval_values.size(),
-                num_distinct_abundances == 1 ? 1 : std::ceil(std::log2(num_distinct_abundances)));
-            uint64_t prev_abundance = constants::invalid;
-            for (uint64_t i = 0; i != m_abundance_interval_values.size(); ++i) {
-                uint64_t abundance = m_abundance_interval_values[i];
-                if (i != 0) {
-                    if (abundance == prev_abundance) {
-                        std::cerr << "Error at " << i << "/" << m_abundance_interval_values.size()
-                                  << ": cannot have two consecutive intervals with the same "
-                                     "abundance value"
-                                  << std::endl;
-                        throw std::runtime_error("abundance intervals are malformed");
-                    }
-                }
-                prev_abundance = abundance;
-                uint64_t id = m_abundances_map[abundance];
-                assert(id < num_distinct_abundances);
-                abundance_interval_values.set(i, id);
-            }
-            abundance_interval_values.build(index.m_abundance_interval_values);
-            index.m_abundance_interval_lengths.encode(m_abundance_interval_lengths.begin(),
-                                                      m_abundance_interval_lengths.size());
-
-            m_abundance_dictionary_builder.build(index.m_abundance_dictionary);
-        }
-
-        /* return the average empirical entropy per abundance */
-        double print_info(uint64_t num_kmers) {
-            assert(!m_abundances.empty());
-            double expected_ab_value = 0.0;
-            double entropy_ab = 0.0;
-            uint64_t print = 0;
-            for (auto p : m_abundances) {
-                double prob = static_cast<double>(p.second) / num_kmers;
-                expected_ab_value += p.first * prob;
-                entropy_ab += prob * std::log2(1.0 / prob);
-                print += 1;
-                if (print <= 10) {
-                    std::cout << "ab:" << p.first << " freq:" << p.second << " ("
-                              << (p.second * 100.0) / num_kmers << "%)" << std::endl;
-                }
-            }
-            std::cout << "expected_ab_value " << expected_ab_value << std::endl;
-            std::cout << "entropy_ab " << entropy_ab << " [bits/kmer]" << std::endl;
-            return entropy_ab;
-        }
-
-    private:
-        /* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */
-        std::unordered_map<uint64_t, uint64_t> m_abundances_map;
-        std::vector<std::pair<uint64_t, uint64_t>> m_abundances;  // (abundance,frequency)
-
-        std::vector<uint64_t> m_abundance_interval_values;
-        std::vector<uint64_t> m_abundance_interval_lengths;
-
-        pthash::compact_vector::builder m_abundance_dictionary_builder;
-    };
-
-    bool empty() const { return m_abundance_dictionary.size() == 0; }
-
-    uint64_t abundance(uint64_t kmer_id) const {
-        uint64_t i = m_abundance_interval_lengths.prev_leq(kmer_id);
-        uint64_t id = m_abundance_interval_values.access(i);
-        uint64_t abundance = m_abundance_dictionary.access(id);
-        return abundance;
-    }
-
-    uint64_t num_bits() const {
-        return m_abundance_interval_values.bytes() * 8 + m_abundance_interval_lengths.num_bits() +
-               m_abundance_dictionary.bytes() * 8;
-    }
-
-    void print_space_breakdown(uint64_t num_kmers) const {
-        std::cout << "    abundance_interval_values: "
-                  << static_cast<double>(m_abundance_interval_values.bytes() * 8) / num_kmers
-                  << " [bits/kmer]\n";
-        std::cout << "    abundance_interval_lengths: "
-                  << static_cast<double>(m_abundance_interval_lengths.num_bits()) / num_kmers
-                  << " [bits/kmer]\n";
-        std::cout << "    abundance_dictionary: "
-                  << static_cast<double>(m_abundance_dictionary.bytes() * 8) / num_kmers
-                  << " [bits/kmer]\n";
-    }
-
-    template <typename Visitor>
-    void visit(Visitor& visitor) {
-        visitor.visit(m_abundance_interval_values);
-        visitor.visit(m_abundance_interval_lengths);
-        visitor.visit(m_abundance_dictionary);
-    }
-
-private:
-    pthash::compact_vector m_abundance_interval_values;
-    ef_sequence<true> m_abundance_interval_lengths;
-    pthash::compact_vector m_abundance_dictionary;
-};
-
-}  // namespace sshash
\ No newline at end of file
diff --git a/include/builder/build.cpp b/include/builder/build.cpp
index 4931b06..bf6d7f3 100644
--- a/include/builder/build.cpp
+++ b/include/builder/build.cpp
@@ -18,7 +18,7 @@ struct parse_data {
     uint64_t num_kmers;
     minimizers_tuples minimizers;
     compact_string_pool strings;
-    abundances::builder abundances_builder;
+    weights::builder weights_builder;
 };
 
 void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) {
@@ -79,20 +79,20 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
     };
 
     uint64_t seq_len = 0;
-    uint64_t sum_of_abundances = 0;
-    data.abundances_builder.init();
+    uint64_t sum_of_weights = 0;
+    data.weights_builder.init();
 
-    /* intervals of abundances */
-    uint64_t ab_value = constants::invalid;
-    uint64_t ab_length = 0;
+    /* intervals of weights */
+    uint64_t weight_value = constants::invalid;
+    uint64_t weight_length = 0;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
 
         /*
             Heder format:
-            >[id] LN:i:[seq_len] ab:Z:[ab_seq]
-            where [ab_seq] is a space-separated sequence of integer counters (the abundances),
+            >[id] LN:i:[seq_len] ab:Z:[weight_seq]
+            where [weight_seq] is a space-separated sequence of integer counters (the weights),
             whose length is equal to [seq_len]-k+1
         */
 
@@ -123,27 +123,27 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         i += 5;
 
         for (uint64_t j = 0; j != seq_len - k + 1; ++j) {
-            uint64_t ab = std::strtoull(sequence.data() + i, nullptr, 10);
+            uint64_t weight = std::strtoull(sequence.data() + i, nullptr, 10);
             i = sequence.find_first_of(' ', i) + 1;
 
-            data.abundances_builder.eat(ab);
-            sum_of_abundances += ab;
+            data.weights_builder.eat(weight);
+            sum_of_weights += weight;
 
-            if (ab == ab_value) {
-                ab_length += 1;
+            if (weight == weight_value) {
+                weight_length += 1;
             } else {
-                if (ab_value != constants::invalid) {
-                    data.abundances_builder.push_abundance_interval(ab_value, ab_length);
+                if (weight_value != constants::invalid) {
+                    data.weights_builder.push_weight_interval(weight_value, weight_length);
                 }
-                ab_value = ab;
-                ab_length = 1;
+                weight_value = weight;
+                weight_length = 1;
             }
         }
     };
 
     while (!is.eof()) {
         std::getline(is, sequence);  // header sequence
-        if (build_config.store_abundances) parse_header();
+        if (build_config.weighted) parse_header();
 
         std::getline(is, sequence);  // DNA sequence
         if (sequence.size() < k) continue;
@@ -159,7 +159,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
         prev_minimizer = constants::invalid;
         num_bases += sequence.size();
 
-        if (build_config.store_abundances and seq_len != sequence.size()) {
+        if (build_config.weighted and seq_len != sequence.size()) {
             std::cout << "ERROR: expected a sequence of length " << seq_len
                       << " but got one of length " << sequence.size() << std::endl;
             throw std::runtime_error("file is malformed");
@@ -204,10 +204,10 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
               << std::endl;
     assert(data.strings.pieces.size() == num_sequences + 1);
 
-    if (build_config.store_abundances) {
-        std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
-        data.abundances_builder.push_abundance_interval(ab_value, ab_length);
-        data.abundances_builder.finalize(data.num_kmers);
+    if (build_config.weighted) {
+        std::cout << "sum_of_weights " << sum_of_weights << std::endl;
+        data.weights_builder.push_weight_interval(weight_value, weight_length);
+        data.weights_builder.finalize(data.num_kmers);
     }
 }
 
@@ -288,12 +288,11 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const&
     uint64_t min_log2_size = m_skew_index.min_log2;
     uint64_t max_log2_size = m_skew_index.max_log2;
 
-    m_skew_index.max_num_strings_in_bucket = buckets_stats.max_num_strings_in_bucket();
+    uint64_t max_num_strings_in_bucket = buckets_stats.max_num_strings_in_bucket();
     m_skew_index.log2_max_num_strings_in_bucket =
         std::ceil(std::log2(buckets_stats.max_num_strings_in_bucket()));
 
-    std::cout << "max_num_strings_in_bucket " << m_skew_index.max_num_strings_in_bucket
-              << std::endl;
+    std::cout << "max_num_strings_in_bucket " << max_num_strings_in_bucket << std::endl;
     std::cout << "log2_max_num_strings_in_bucket " << m_skew_index.log2_max_num_strings_in_bucket
               << std::endl;
 
@@ -354,9 +353,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const&
 
                 lower = upper;
                 upper = 2 * lower;
-                if (partition_id == num_partitions - 1) {
-                    upper = m_skew_index.max_num_strings_in_bucket;
-                }
+                if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket;
 
                 /*
                     If still larger than upper, then there is an empty bucket
@@ -447,7 +444,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const&
                 upper = 2 * lower;
                 num_bits_per_pos += 1;
                 if (partition_id == num_partitions - 1) {
-                    upper = m_skew_index.max_num_strings_in_bucket;
+                    upper = max_num_strings_in_bucket;
                     num_bits_per_pos = m_skew_index.log2_max_num_strings_in_bucket;
                 }
 
@@ -517,20 +514,20 @@ void dictionary::build(std::string const& filename, build_configuration const& b
     timer.reset();
     /******/
 
-    if (build_config.store_abundances) {
-        /* step 1.1: compress abundances ***/
+    if (build_config.weighted) {
+        /* step 1.1: compress weights ***/
         timer.start();
-        data.abundances_builder.build(m_abundances);
+        data.weights_builder.build(m_weights);
         timer.stop();
         timings.push_back(timer.elapsed());
-        print_time(timings.back(), data.num_kmers, "step 1.1.: 'build_abundances'");
+        print_time(timings.back(), data.num_kmers, "step 1.1.: 'build_weights'");
         timer.reset();
         /******/
         if (build_config.verbose) {
-            double entropy_ab = data.abundances_builder.print_info(data.num_kmers);
-            double avg_bits_per_ab = static_cast<double>(m_abundances.num_bits()) / data.num_kmers;
-            std::cout << "abundances: " << avg_bits_per_ab << " [bits/kmer]" << std::endl;
-            std::cout << "  (" << entropy_ab / avg_bits_per_ab
+            double entropy_weights = data.weights_builder.print_info(data.num_kmers);
+            double avg_bits_per_weight = static_cast<double>(m_weights.num_bits()) / data.num_kmers;
+            std::cout << "weights: " << avg_bits_per_weight << " [bits/kmer]" << std::endl;
+            std::cout << "  (" << entropy_weights / avg_bits_per_weight
                       << "x smaller than the empirical entropy)" << std::endl;
         }
     }
diff --git a/include/builder/cover.hpp b/include/builder/cover.hpp
index a830304..9373f8f 100644
--- a/include/builder/cover.hpp
+++ b/include/builder/cover.hpp
@@ -10,12 +10,12 @@
 namespace sshash {
 
 namespace constants {
-constexpr uint64_t most_frequent_abundance = 1;  // usual value
+constexpr uint64_t most_frequent_weight = 1;  // usual value
 }
 
 struct node {
     // We assume there are less than 2^32 sequences and that
-    // the largest abundance fits into a 32-bit uint.
+    // the largest weight fits into a 32-bit uint.
     node(uint32_t i, uint32_t f, uint32_t b, bool s) : id(i), front(f), back(b), sign(s) {}
     uint32_t id, front, back;
     bool sign;  // 'true'  --> '+'
@@ -26,8 +26,8 @@ struct cover {
     typedef std::deque<node> walk_t;
     typedef std::vector<walk_t> walks_t;
 
-    cover(uint64_t num_sequences, uint64_t num_runs_abundances)
-        : m_num_sequences(num_sequences), m_num_runs_abundances(num_runs_abundances) {}
+    cover(uint64_t num_sequences, uint64_t num_runs_weights)
+        : m_num_sequences(num_sequences), m_num_runs_weights(num_runs_weights) {}
 
     void compute(std::vector<node>& nodes) {
         assert(nodes.size() == m_num_sequences);
@@ -35,8 +35,8 @@ struct cover {
         essentials::timer_type timer;
         timer.start();
 
-        /* (abundance, position of a candidate abundance with front = abundance) */
-        std::unordered_map<uint32_t, uint32_t> abundance_map;
+        /* (weight, position of a candidate weight with front = weight) */
+        std::unordered_map<uint32_t, uint32_t> weight_map;
 
         /* visited[node.id] = true if node has been visited; false otherwise */
         std::vector<bool> visited;
@@ -51,7 +51,7 @@ struct cover {
         unvisited_nodes.reserve(nodes.size());  // at most
         m_walks.reserve(nodes.size());          // at most
 
-        std::cout << "initial number of runs = " << m_num_runs_abundances << std::endl;
+        std::cout << "initial number of runs = " << m_num_runs_weights << std::endl;
 
         /* add all the nodes with backward orientation */
         uint64_t num_nodes = nodes.size();
@@ -80,12 +80,12 @@ struct cover {
             return x.id < y.id;
         });
 
-        /* fill abundance_map */
+        /* fill weight_map */
         {
             uint64_t prev_front = constants::invalid;
             uint64_t offset = 0;
             for (auto const& node : nodes) {
-                if (node.front != prev_front) abundance_map[node.front] = offset;
+                if (node.front != prev_front) weight_map[node.front] = offset;
                 offset += 1;
                 prev_front = node.front;
             }
@@ -186,26 +186,26 @@ struct cover {
                     assert(candidate_i <= num_nodes);
 
                     if (no_match_found or candidate_i == num_nodes) {
-                        abundance_map[back] = candidate_i;
+                        weight_map[back] = candidate_i;
                         return false;
                     }
 
                     /* valid match was found, then visit it next */
                     i = candidate_i;
 
-                    /* update candidate position in abundance_map to point to *next* position */
-                    abundance_map[back] = candidate_i + 1;
+                    /* update candidate position in weight_map to point to *next* position */
+                    weight_map[back] = candidate_i + 1;
 
                     return true;
                 };
 
                 /* try to extend to the right */
-                uint64_t candidate_i = abundance_map[walk.back().back];
+                uint64_t candidate_i = weight_map[walk.back().back];
                 bool found = try_to_extend(walk.back().back, candidate_i);
 
                 /* if not possible, try to extend to the left */
                 if (!found) {
-                    candidate_i = abundance_map[walk.front().front];
+                    candidate_i = weight_map[walk.front().front];
                     found = try_to_extend(walk.front().front, candidate_i);
                 }
 
@@ -253,17 +253,17 @@ struct cover {
                 // std::cout << std::endl;
 #endif
             }
-            assert(m_num_runs_abundances > num_matches_in_round);
-            m_num_runs_abundances -= num_matches_in_round;
+            assert(m_num_runs_weights > num_matches_in_round);
+            m_num_runs_weights -= num_matches_in_round;
             std::cout << "  num_matches = " << num_matches_in_round << std::endl;
-            std::cout << "  num_runs " << m_num_runs_abundances << std::endl;
+            std::cout << "  num_runs " << m_num_runs_weights << std::endl;
         }
 
         timer.stop();
         std::cout << "cover computed in: " << timer.elapsed() / 1000000 << " [sec]" << std::endl;
-        std::cout << "final number of runs R = " << m_num_runs_abundances << std::endl;
+        std::cout << "final number of runs R = " << m_num_runs_weights << std::endl;
 
-        assert(m_num_runs_abundances >= 1);
+        assert(m_num_runs_weights >= 1);
     }
 
     void save(std::string const& filename) {
@@ -287,7 +287,7 @@ struct cover {
 
 private:
     uint64_t m_num_sequences;
-    uint64_t m_num_runs_abundances;
+    uint64_t m_num_runs_weights;
     walks_t m_walks;
 
     void change_orientation(node& n) {
diff --git a/include/builder/util.hpp b/include/builder/util.hpp
index 725c56e..864cf52 100644
--- a/include/builder/util.hpp
+++ b/include/builder/util.hpp
@@ -8,8 +8,7 @@ struct empty_bucket_runtime_error : public std::runtime_error {
 };
 
 struct parse_runtime_error : public std::runtime_error {
-    parse_runtime_error()
-        : std::runtime_error("did you provide an input file with abundance counts?") {}
+    parse_runtime_error() : std::runtime_error("did you provide an input file with weights?") {}
 };
 
 void expect(char got, char expected) {
diff --git a/include/dictionary.hpp b/include/dictionary.hpp
index 6aa0993..d1b15d4 100644
--- a/include/dictionary.hpp
+++ b/include/dictionary.hpp
@@ -4,7 +4,7 @@
 #include "minimizers.hpp"
 #include "buckets.hpp"
 #include "skew_index.hpp"
-#include "abundances.hpp"
+#include "weights.hpp"
 
 namespace sshash {
 
@@ -18,7 +18,7 @@ struct dictionary {
     uint64_t k() const { return m_k; }
     uint64_t m() const { return m_m; }
     bool canonicalized() const { return m_canonical_parsing; }
-    bool weighted() const { return !m_abundances.empty(); }
+    bool weighted() const { return !m_weights.empty(); }
 
     uint64_t lookup(char const* string_kmer, bool check_reverse_complement_too = true) const {
         uint64_t uint64_kmer = util::string_to_uint64_no_reverse(string_kmer, m_k);
@@ -40,9 +40,9 @@ struct dictionary {
         m_buckets.access(kmer_id, string_kmer, m_k);
     }
 
-    uint64_t abundance(uint64_t kmer_id) const {
+    uint64_t weight(uint64_t kmer_id) const {
         assert(kmer_id < size());
-        return m_abundances.abundance(kmer_id);
+        return m_weights.weight(kmer_id);
     }
 
     bool is_member(char const* string_kmer, bool check_reverse_complement_too = true) const {
@@ -87,7 +87,7 @@ struct dictionary {
         return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) +
                     sizeof(m_canonical_parsing)) +
                m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits() +
-               m_abundances.num_bits();
+               m_weights.num_bits();
     }
 
     void print_info() const;
@@ -103,7 +103,7 @@ struct dictionary {
         visitor.visit(m_minimizers);
         visitor.visit(m_buckets);
         visitor.visit(m_skew_index);
-        visitor.visit(m_abundances);
+        visitor.visit(m_weights);
     }
 
 private:
@@ -115,7 +115,7 @@ struct dictionary {
     minimizers m_minimizers;
     buckets m_buckets;
     skew_index m_skew_index;
-    abundances m_abundances;
+    weights m_weights;
 
     uint64_t lookup_uint64_regular_parsing(uint64_t uint64_kmer) const;
     uint64_t lookup_uint64_canonical_parsing(uint64_t uint64_kmer) const;
diff --git a/include/info.cpp b/include/info.cpp
index aad1b1c..a00bb7d 100644
--- a/include/info.cpp
+++ b/include/info.cpp
@@ -18,7 +18,7 @@ uint64_t skew_index::print_info() const {
         num_kmers_in_skew_index += n;
         lower = upper;
         upper = 2 * lower;
-        if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket;
+        // if (partition_id == num_partitions - 1) upper = max_num_strings_in_bucket;
     }
     return num_kmers_in_skew_index;
 }
@@ -40,9 +40,9 @@ void dictionary::print_space_breakdown() const {
               << " [bits/kmer]\n";
     std::cout << "  skew_index: " << static_cast<double>(m_skew_index.num_bits()) / size()
               << " [bits/kmer]\n";
-    std::cout << "  abundances: " << static_cast<double>(m_abundances.num_bits()) / size()
+    std::cout << "  weights: " << static_cast<double>(m_weights.num_bits()) / size()
               << " [bits/kmer]\n";
-    m_abundances.print_space_breakdown(size());
+    m_weights.print_space_breakdown(size());
     std::cout << "  --------------\n";
     std::cout << "  total: " << static_cast<double>(num_bits()) / size() << " [bits/kmer]"
               << std::endl;
diff --git a/include/skew_index.hpp b/include/skew_index.hpp
index bbb6c64..d8afa0c 100644
--- a/include/skew_index.hpp
+++ b/include/skew_index.hpp
@@ -8,7 +8,6 @@ struct skew_index {
     skew_index()
         : min_log2(constants::min_l)
         , max_log2(constants::max_l)
-        , max_num_strings_in_bucket(0)
         , log2_max_num_strings_in_bucket(0) {
         mphfs.resize(0);
         positions.resize(0);
@@ -34,9 +33,8 @@ struct skew_index {
     }
 
     uint64_t num_bits() const {
-        uint64_t n = (sizeof(min_log2) + sizeof(max_log2) + sizeof(max_num_strings_in_bucket) +
-                      sizeof(log2_max_num_strings_in_bucket)) *
-                     8;
+        uint64_t n =
+            (sizeof(min_log2) + sizeof(max_log2) + sizeof(log2_max_num_strings_in_bucket)) * 8;
         for (uint64_t partition_id = 0; partition_id != mphfs.size(); ++partition_id) {
             auto const& mphf = mphfs[partition_id];
             auto const& P = positions[partition_id];
@@ -49,7 +47,6 @@ struct skew_index {
     void visit(Visitor& visitor) {
         visitor.visit(min_log2);
         visitor.visit(max_log2);
-        visitor.visit(max_num_strings_in_bucket);
         visitor.visit(log2_max_num_strings_in_bucket);
         visitor.visit(mphfs);
         visitor.visit(positions);
@@ -57,7 +54,6 @@ struct skew_index {
 
     uint16_t min_log2;
     uint16_t max_log2;
-    uint32_t max_num_strings_in_bucket;  // useless
     uint32_t log2_max_num_strings_in_bucket;
     std::vector<pthash_mphf_type> mphfs;
     std::vector<pthash::compact_vector> positions;
diff --git a/include/util.hpp b/include/util.hpp
index 6403374..f0fe172 100644
--- a/include/util.hpp
+++ b/include/util.hpp
@@ -66,7 +66,7 @@ struct build_configuration {
         , c(constants::c)
 
         , canonical_parsing(false)
-        , store_abundances(false)
+        , weighted(false)
         , verbose(true) {}
 
     uint64_t k;  // kmer size
@@ -77,14 +77,14 @@ struct build_configuration {
     double c;    // drive PTHash trade-off
 
     bool canonical_parsing;
-    bool store_abundances;
+    bool weighted;
     bool verbose;
 
     void print() const {
         std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l
                   << ", c = " << c
                   << ", canonical_parsing = " << (canonical_parsing ? "true" : "false")
-                  << ", store_abundances = " << (store_abundances ? "true" : "false") << std::endl;
+                  << ", weighted = " << (weighted ? "true" : "false") << std::endl;
     }
 };
 
diff --git a/include/weights.hpp b/include/weights.hpp
new file mode 100644
index 0000000..68aadde
--- /dev/null
+++ b/include/weights.hpp
@@ -0,0 +1,183 @@
+#pragma once
+
+#include <vector>
+#include <unordered_map>  // count the distinct weights with freq information
+
+#include "ef_sequence.hpp"
+
+namespace sshash {
+
+struct weights {
+    struct builder {
+        builder() {}
+
+        void init() { m_weight_interval_lengths.push_back(0); }
+
+        void eat(uint64_t weight) {
+            assert(weight > 0);
+            auto it = m_weights_map.find(weight);
+            if (it != m_weights_map.cend()) {  // found
+                (*it).second += 1;
+            } else {
+                m_weights_map[weight] = 1;
+            }
+        }
+
+        void push_weight_interval(uint64_t value, uint64_t length) {
+            m_weight_interval_values.push_back(value);
+            m_weight_interval_lengths.push_back(m_weight_interval_lengths.back() + length);
+        }
+
+        uint64_t num_weight_intervals() const { return m_weight_interval_values.size(); }
+
+        void finalize(uint64_t num_kmers) {
+            assert(
+                std::is_sorted(m_weight_interval_lengths.begin(), m_weight_interval_lengths.end()));
+
+            std::cout << "num_weight_intervals " << num_weight_intervals() << std::endl;
+
+            uint64_t num_distinct_weights = m_weights_map.size();
+
+            std::cout << "found " << num_distinct_weights << " distint weights (ceil(log2("
+                      << num_distinct_weights
+                      << ")) = " << std::ceil(std::log2(num_distinct_weights)) << ")" << std::endl;
+
+            m_weights.reserve(num_distinct_weights);
+            uint64_t n = 0;
+            uint64_t largest_weight = 0;
+            for (auto p : m_weights_map) {
+                if (p.first > largest_weight) largest_weight = p.first;
+                n += p.second;
+                m_weights.push_back(p);
+            }
+            assert(largest_weight > 0);
+
+            std::cout << "largest_weight+1 = " << largest_weight + 1 << " (ceil(log2("
+                      << largest_weight + 1 << ")) = " << std::ceil(std::log2(largest_weight + 1))
+                      << ")" << std::endl;
+
+            if (n != num_kmers) {
+                std::cout << "ERROR: expected " << num_kmers << " kmers but got " << n << std::endl;
+                throw std::runtime_error("file is malformed");
+            }
+
+            std::sort(m_weights.begin(), m_weights.end(), [](auto const& x, auto const& y) {
+                if (x.second != y.second) return x.second > y.second;
+                return x.first < y.first;
+            });
+
+            uint64_t rest = num_kmers - m_weights.front().second;
+            std::cout << "kmers that do not have the most frequent weight: " << rest << " ("
+                      << (rest * 100.0) / num_kmers << "%)" << std::endl;
+
+            m_weight_dictionary_builder.resize(num_distinct_weights,
+                                               std::ceil(std::log2(largest_weight + 1)));
+            for (uint64_t id = 0; id != num_distinct_weights; ++id) {
+                uint64_t weight_value = m_weights[id].first;
+                m_weight_dictionary_builder.set(id, weight_value);
+                m_weights_map[weight_value] = id;
+            }
+        }
+
+        void build(weights& index) {
+            uint64_t num_distinct_weights = m_weight_dictionary_builder.size();
+            pthash::compact_vector::builder weight_interval_values;
+            weight_interval_values.resize(
+                m_weight_interval_values.size(),
+                num_distinct_weights == 1 ? 1 : std::ceil(std::log2(num_distinct_weights)));
+            uint64_t prev_weight = constants::invalid;
+            for (uint64_t i = 0; i != m_weight_interval_values.size(); ++i) {
+                uint64_t weight = m_weight_interval_values[i];
+                if (i != 0) {
+                    if (weight == prev_weight) {
+                        std::cerr << "Error at " << i << "/" << m_weight_interval_values.size()
+                                  << ": cannot have two consecutive intervals with the same "
+                                     "weight value"
+                                  << std::endl;
+                        throw std::runtime_error("weight intervals are malformed");
+                    }
+                }
+                prev_weight = weight;
+                uint64_t id = m_weights_map[weight];
+                assert(id < num_distinct_weights);
+                weight_interval_values.set(i, id);
+            }
+            weight_interval_values.build(index.m_weight_interval_values);
+            index.m_weight_interval_lengths.encode(m_weight_interval_lengths.begin(),
+                                                   m_weight_interval_lengths.size());
+
+            m_weight_dictionary_builder.build(index.m_weight_dictionary);
+        }
+
+        /* return the average empirical entropy per weight */
+        double print_info(uint64_t num_kmers) {
+            assert(!m_weights.empty());
+            double expected_weight = 0.0;
+            double entropy_weights = 0.0;
+            uint64_t print = 0;
+            for (auto p : m_weights) {
+                double prob = static_cast<double>(p.second) / num_kmers;
+                expected_weight += p.first * prob;
+                entropy_weights += prob * std::log2(1.0 / prob);
+                print += 1;
+                if (print <= 10) {
+                    std::cout << "weight:" << p.first << " freq:" << p.second << " ("
+                              << (p.second * 100.0) / num_kmers << "%)" << std::endl;
+                }
+            }
+            std::cout << "expected_weight " << expected_weight << std::endl;
+            std::cout << "entropy_weights " << entropy_weights << " [bits/kmer]" << std::endl;
+            return entropy_weights;
+        }
+
+    private:
+        /* (weight,frequency) pairs during construction, then (weight,id) after sorting */
+        std::unordered_map<uint64_t, uint64_t> m_weights_map;
+        std::vector<std::pair<uint64_t, uint64_t>> m_weights;  // (weight,frequency)
+
+        std::vector<uint64_t> m_weight_interval_values;
+        std::vector<uint64_t> m_weight_interval_lengths;
+
+        pthash::compact_vector::builder m_weight_dictionary_builder;
+    };
+
+    bool empty() const { return m_weight_dictionary.size() == 0; }
+
+    uint64_t weight(uint64_t kmer_id) const {
+        uint64_t i = m_weight_interval_lengths.prev_leq(kmer_id);
+        uint64_t id = m_weight_interval_values.access(i);
+        uint64_t weight = m_weight_dictionary.access(id);
+        return weight;
+    }
+
+    uint64_t num_bits() const {
+        return m_weight_interval_values.bytes() * 8 + m_weight_interval_lengths.num_bits() +
+               m_weight_dictionary.bytes() * 8;
+    }
+
+    void print_space_breakdown(uint64_t num_kmers) const {
+        std::cout << "    weight_interval_values: "
+                  << static_cast<double>(m_weight_interval_values.bytes() * 8) / num_kmers
+                  << " [bits/kmer]\n";
+        std::cout << "    weight_interval_lengths: "
+                  << static_cast<double>(m_weight_interval_lengths.num_bits()) / num_kmers
+                  << " [bits/kmer]\n";
+        std::cout << "    weight_dictionary: "
+                  << static_cast<double>(m_weight_dictionary.bytes() * 8) / num_kmers
+                  << " [bits/kmer]\n";
+    }
+
+    template <typename Visitor>
+    void visit(Visitor& visitor) {
+        visitor.visit(m_weight_interval_values);
+        visitor.visit(m_weight_interval_lengths);
+        visitor.visit(m_weight_dictionary);
+    }
+
+private:
+    pthash::compact_vector m_weight_interval_values;
+    ef_sequence<true> m_weight_interval_lengths;
+    pthash::compact_vector m_weight_dictionary;
+};
+
+}  // namespace sshash
\ No newline at end of file
diff --git a/src/bench.cpp b/src/bench.cpp
index 1290a86..88ec2a8 100644
--- a/src/bench.cpp
+++ b/src/bench.cpp
@@ -22,7 +22,7 @@ int main(int argc, char** argv) {
     dict.print_info();
 
     perf_test_lookup_access(dict);
-    if (dict.weighted()) perf_test_lookup_abundance(dict);
+    if (dict.weighted()) perf_test_lookup_weight(dict);
     perf_test_iterator(dict);
 
     return 0;
diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp
index 21685d6..04827b7 100644
--- a/src/bench_utils.hpp
+++ b/src/bench_utils.hpp
@@ -94,9 +94,9 @@ void perf_test_lookup_access(dictionary const& dict) {
     }
 }
 
-void perf_test_lookup_abundance(dictionary const& dict) {
+void perf_test_lookup_weight(dictionary const& dict) {
     if (!dict.weighted()) {
-        std::cerr << "ERROR: the dictionary does not store abundances" << std::endl;
+        std::cerr << "ERROR: the dictionary does not store weights" << std::endl;
         return;
     }
 
@@ -126,14 +126,13 @@ void perf_test_lookup_abundance(dictionary const& dict) {
     for (uint64_t r = 0; r != runs; ++r) {
         for (auto const& string : lookup_queries) {
             auto id = dict.lookup(string.c_str());
-            auto ab = dict.abundance(id);
-            essentials::do_not_optimize_away(ab);
+            auto w = dict.weight(id);
+            essentials::do_not_optimize_away(w);
         }
     }
     t.stop();
     double nanosec_per_lookup = t.elapsed() / (runs * lookup_queries.size());
-    std::cout << "avg_nanosec_per_positive_lookup_with_abundance " << nanosec_per_lookup
-              << std::endl;
+    std::cout << "avg_nanosec_per_positive_lookup_with_weight " << nanosec_per_lookup << std::endl;
 }
 
 }  // namespace sshash
\ No newline at end of file
diff --git a/src/build.cpp b/src/build.cpp
index 1477c9a..67f258c 100644
--- a/src/build.cpp
+++ b/src/build.cpp
@@ -41,8 +41,7 @@ int main(int argc, char** argv) {
                "Canonical parsing of k-mers. This option changes the parsing and results in a "
                "trade-off between index space and lookup time.",
                "--canonical-parsing", true);
-    parser.add("store_abundances", "Also store the abundances in compressed format.",
-               "--abundances", true);
+    parser.add("weighted", "Also store the weights in compressed format.", "--weighted", true);
     parser.add("output_filename", "Output file name where the data structure will be serialized.",
                "-o", false);
     parser.add("check", "Check correctness after construction.", "--check", true);
@@ -65,7 +64,7 @@ int main(int argc, char** argv) {
     if (parser.parsed("l")) build_config.l = parser.get<double>("l");
     if (parser.parsed("c")) build_config.c = parser.get<double>("c");
     build_config.canonical_parsing = parser.get<bool>("canonical_parsing");
-    build_config.store_abundances = parser.get<bool>("store_abundances");
+    build_config.weighted = parser.get<bool>("weighted");
     build_config.verbose = parser.get<bool>("verbose");
     build_config.print();
 
@@ -75,13 +74,13 @@ int main(int argc, char** argv) {
     bool check = parser.get<bool>("check");
     if (check) {
         check_correctness_lookup_access(dict, input_filename);
-        if (build_config.store_abundances) check_correctness_abundances(dict, input_filename);
+        if (build_config.weighted) check_correctness_weights(dict, input_filename);
         check_correctness_iterator(dict);
     }
     bool bench = parser.get<bool>("bench");
     if (bench) {
         perf_test_lookup_access(dict);
-        if (dict.weighted()) perf_test_lookup_abundance(dict);
+        if (dict.weighted()) perf_test_lookup_weight(dict);
         perf_test_iterator(dict);
     }
     if (parser.parsed("output_filename")) {
diff --git a/src/check_utils.hpp b/src/check_utils.hpp
index a7fbc85..f8e36bf 100644
--- a/src/check_utils.hpp
+++ b/src/check_utils.hpp
@@ -90,17 +90,17 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) {
     return true;
 }
 
-bool check_correctness_abundances(std::istream& is, dictionary const& dict) {
+bool check_correctness_weights(std::istream& is, dictionary const& dict) {
     uint64_t k = dict.k();
     std::string line;
     uint64_t kmer_id = 0;
 
     if (!dict.weighted()) {
-        std::cerr << "ERROR: the dictionary does not store abundances" << std::endl;
+        std::cerr << "ERROR: the dictionary does not store weights" << std::endl;
         return false;
     }
 
-    std::cout << "checking correctness of abundances..." << std::endl;
+    std::cout << "checking correctness of weights..." << std::endl;
 
     while (!is.eof()) {
         std::getline(is, line);  // header line
@@ -121,22 +121,22 @@ bool check_correctness_abundances(std::istream& is, dictionary const& dict) {
         i += 5;  // skip "ab:Z:"
 
         for (uint64_t j = 0; j != seq_len - k + 1; ++j, ++kmer_id) {
-            uint64_t expected_ab = std::strtoull(line.data() + i, &end, 10);
+            uint64_t expected = std::strtoull(line.data() + i, &end, 10);
             i = line.find_first_of(' ', i) + 1;
-            uint64_t got_ab = dict.abundance(kmer_id);
-            if (expected_ab != got_ab) {
-                std::cout << "ERROR for kmer_id " << kmer_id << ": expected_ab " << expected_ab
-                          << " but got_ab " << got_ab << std::endl;
+            uint64_t got = dict.weight(kmer_id);
+            if (expected != got) {
+                std::cout << "ERROR for kmer_id " << kmer_id << ": expected " << expected
+                          << " but got " << got << std::endl;
             }
             if (kmer_id != 0 and kmer_id % 5000000 == 0) {
-                std::cout << "checked " << kmer_id << " abundances" << std::endl;
+                std::cout << "checked " << kmer_id << " weights" << std::endl;
             }
         }
 
         std::getline(is, line);  // skip DNA sequence
     }
 
-    std::cout << "checked " << kmer_id << " abundances" << std::endl;
+    std::cout << "checked " << kmer_id << " weights" << std::endl;
     std::cout << "EVERYTHING OK!" << std::endl;
     return true;
 }
@@ -162,15 +162,15 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string const&
 /*
    The input file must be the one the index was built from.
 */
-bool check_correctness_abundances(dictionary const& dict, std::string const& filename) {
+bool check_correctness_weights(dictionary const& dict, std::string const& filename) {
     std::ifstream is(filename.c_str());
     if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'");
     bool good = true;
     if (util::ends_with(filename, ".gz")) {
         zip_istream zis(is);
-        good = check_correctness_abundances(zis, dict);
+        good = check_correctness_weights(zis, dict);
     } else {
-        good = check_correctness_abundances(is, dict);
+        good = check_correctness_weights(is, dict);
     }
     is.close();
     return good;
diff --git a/src/permute.cpp b/src/permute.cpp
index 9b3064c..0d58fb3 100644
--- a/src/permute.cpp
+++ b/src/permute.cpp
@@ -9,8 +9,8 @@
 using namespace sshash;
 
 struct permute_data {
-    permute_data() : num_runs_abundances(0), num_sequences(0) {}
-    uint64_t num_runs_abundances;
+    permute_data() : num_runs_weights(0), num_sequences(0) {}
+    uint64_t num_runs_weights;
     uint64_t num_sequences;
     std::vector<node> nodes;
 };
@@ -22,15 +22,15 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     uint64_t num_kmers = 0;
     uint64_t seq_len = 0;
 
-    uint64_t sum_of_abundances = 0;
-    uint64_t num_sequences_diff_abs = 0;  // num sequences whose kmers have different abundances
-    uint64_t num_sequences_all_mfa = 0;   // num sequences whose kmers have same abundance == mfa
+    uint64_t sum_of_weights = 0;
+    uint64_t num_sequences_diff_weights = 0;  // num sequences whose kmers have different weights
+    uint64_t num_sequences_all_mfw = 0;       // num sequences whose kmers have same weight == mfw
 
-    data.num_runs_abundances = 0;
+    data.num_runs_weights = 0;
     data.num_sequences = 0;
 
-    /* count number of distinct abundances */
-    std::unordered_set<uint32_t> distinct_abundances;
+    /* count number of distinct weights */
+    std::unordered_set<uint32_t> distinct_weights;
 
     auto parse_header = [&]() {
         if (sequence.empty()) return;
@@ -38,7 +38,7 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         /*
             Heder format:
             >[id] LN:i:[seq_len] ab:Z:[ab_seq]
-            where [ab_seq] is a space-separated sequence of integer counters (the abundances),
+            where [ab_seq] is a space-separated sequence of integer counters (the weights),
             whose length is equal to [seq_len]-k+1
         */
 
@@ -68,34 +68,34 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
         expect(sequence[i + 4], ':');
         i += 5;
 
-        bool kmers_have_all_mfa = true;
-        bool kmers_have_different_abundances = false;
+        bool kmers_have_all_mfw = true;
+        bool kmers_have_different_weights = false;
         uint64_t front = constants::invalid;
         uint64_t back = constants::invalid;
 
-        for (uint64_t j = 0, prev_abundance = constants::invalid; j != seq_len - k + 1; ++j) {
-            uint64_t abundance = std::strtoull(sequence.data() + i, nullptr, 10);
-            sum_of_abundances += abundance;
+        for (uint64_t j = 0, prev_weight = constants::invalid; j != seq_len - k + 1; ++j) {
+            uint64_t weight = std::strtoull(sequence.data() + i, nullptr, 10);
+            sum_of_weights += weight;
             i = sequence.find_first_of(' ', i) + 1;
 
-            distinct_abundances.insert(abundance);
+            distinct_weights.insert(weight);
 
             /* set front and back */
-            if (j == 0) front = abundance;
-            if (j == seq_len - k) back = abundance;
+            if (j == 0) front = weight;
+            if (j == seq_len - k) back = weight;
 
             /* accumulate statistics */
-            if (abundance != constants::most_frequent_abundance) kmers_have_all_mfa = false;
-            if (j > 0 and abundance != prev_abundance) kmers_have_different_abundances = true;
+            if (weight != constants::most_frequent_weight) kmers_have_all_mfw = false;
+            if (j > 0 and weight != prev_weight) kmers_have_different_weights = true;
 
             /* count the number of runs */
-            if (abundance != prev_abundance) data.num_runs_abundances += 1;
+            if (weight != prev_weight) data.num_runs_weights += 1;
 
-            prev_abundance = abundance;
+            prev_weight = weight;
         }
 
-        num_sequences_diff_abs += kmers_have_different_abundances;
-        num_sequences_all_mfa += kmers_have_all_mfa;
+        num_sequences_diff_weights += kmers_have_different_weights;
+        num_sequences_all_mfw += kmers_have_all_mfw;
 
         constexpr bool sign = true;
         data.nodes.emplace_back(data.num_sequences, front, back, sign);
@@ -124,26 +124,27 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
     }
 
     assert(data.nodes.size() == data.num_sequences);
-    assert(data.num_runs_abundances >= data.num_sequences);
+    assert(data.num_runs_weights >= data.num_sequences);
 
     std::cout << "read " << data.num_sequences << " sequences, " << num_bases << " bases, "
               << num_kmers << " kmers" << std::endl;
-    std::cout << "found " << distinct_abundances.size() << " distinct abundances" << std::endl;
-    std::cout << "sum_of_abundances " << sum_of_abundances << std::endl;
-    std::cout << "num_sequences whose kmers have different abundances: " << num_sequences_diff_abs
+    std::cout << "found " << distinct_weights.size() << " distinct weights" << std::endl;
+    std::cout << "sum_of_weights " << sum_of_weights << std::endl;
+    std::cout << "num_sequences whose kmers have different weights: " << num_sequences_diff_weights
               << "/" << data.num_sequences << " ("
-              << (num_sequences_diff_abs * 100.0) / data.num_sequences << "%)" << std::endl;
-    std::cout << "num_sequences whose kmers all have the same abundance != mfa: "
-              << (data.num_sequences - num_sequences_diff_abs) << "/" << data.num_sequences << " ("
-              << ((data.num_sequences - num_sequences_diff_abs) * 100.0) / data.num_sequences
+              << (num_sequences_diff_weights * 100.0) / data.num_sequences << "%)" << std::endl;
+    std::cout << "num_sequences whose kmers all have the same weight != mfw: "
+              << (data.num_sequences - num_sequences_diff_weights) << "/" << data.num_sequences
+              << " ("
+              << ((data.num_sequences - num_sequences_diff_weights) * 100.0) / data.num_sequences
               << "%)" << std::endl;
-    std::cout << "num_sequences whose kmers all have the same abundance == mfa: "
-              << num_sequences_all_mfa << "/" << (data.num_sequences - num_sequences_diff_abs)
+    std::cout << "num_sequences whose kmers all have the same weight == mfw: "
+              << num_sequences_all_mfw << "/" << (data.num_sequences - num_sequences_diff_weights)
               << " ("
-              << (num_sequences_all_mfa * 100.0) / (data.num_sequences - num_sequences_diff_abs)
+              << (num_sequences_all_mfw * 100.0) / (data.num_sequences - num_sequences_diff_weights)
               << "%)" << std::endl;
 
-    /* computation of lower bound to the number of final abundance runs */
+    /* computation of lower bound to the number of final weight runs */
     {
         struct info {
             /* how many times an end-point weight appears */
@@ -187,27 +188,27 @@ void parse_file(std::istream& is, permute_data& data, build_configuration const&
             uint64_t freq = p.second.freq;
             sum_of_freqs += freq;
 
-            /* special case: abundance will appear as singleton, so count twice */
+            /* special case: weight will appear as singleton, so count twice */
             if (p.second.all_equal) {
                 num_endpoints += 2;
                 continue;
             }
 
-            /* if the excess of frequency is odd, then the abundance will appear as end-point */
+            /* if the excess of frequency is odd, then the weight will appear as end-point */
             if (freq % 2 == 1) num_endpoints += 1;
         }
 
         assert(sum_of_freqs == 2 * data.num_sequences);
         assert(num_endpoints % 2 == 0);
         (void)sum_of_freqs;
-        uint64_t num_distinct_abundances = distinct_abundances.size();
-        uint64_t num_switch_points = data.num_runs_abundances - data.nodes.size();
+        uint64_t num_distinct_weights = distinct_weights.size();
+        uint64_t num_switch_points = data.num_runs_weights - data.nodes.size();
         uint64_t num_walks = num_endpoints / 2;
         std::cout << "(estimated) num_walks = " << num_walks << std::endl;
-        std::cout << "num_runs_abundances = " << data.num_runs_abundances << std::endl;
+        std::cout << "num_runs_weights = " << data.num_runs_weights << std::endl;
         std::cout << "num_switch_points = " << num_switch_points << std::endl;
 
-        uint64_t R_lo = std::max<uint64_t>(num_distinct_abundances, num_switch_points + 1);
+        uint64_t R_lo = std::max<uint64_t>(num_distinct_weights, num_switch_points + 1);
         uint64_t R_hi = num_switch_points + num_walks;
         std::cout << "R_lo = " << R_lo << std::endl;
         std::cout << "R_hi = " << R_hi << std::endl;
@@ -278,16 +279,16 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) {
     i = j + 6;  // skip ' ab:Z:'
     output.append(input.data(), input.data() + i);
 
-    std::vector<uint32_t> abundances;
-    abundances.reserve(seq_len - k + 1);
+    std::vector<uint32_t> weights;
+    weights.reserve(seq_len - k + 1);
     for (uint64_t j = 0; j != seq_len - k + 1; ++j) {
-        uint64_t abundance = std::strtoull(input.data() + i, nullptr, 10);
-        abundances.push_back(abundance);
+        uint64_t weight = std::strtoull(input.data() + i, nullptr, 10);
+        weights.push_back(weight);
         i = input.find_first_of(' ', i) + 1;
     }
 
-    std::reverse(abundances.begin(), abundances.end());
-    for (auto abundance : abundances) output.append(std::to_string(abundance) + " ");
+    std::reverse(weights.begin(), weights.end());
+    for (auto weight : weights) output.append(std::to_string(weight) + " ");
 }
 
 void permute_and_write(std::istream& is, std::string const& output_filename,
@@ -344,7 +345,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename,
 
         if (!signs[i]) {
             /* compute reverse complement of dna_sequence
-               and reverse the abundances in header_sequence */
+               and reverse the weights in header_sequence */
             dna_sequence_rc.resize(dna_sequence.size());
             header_sequence_r.clear();
             util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(),
@@ -467,7 +468,7 @@ int main(int argc, char** argv) {
                "Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n"
                "\t- without duplicate nor invalid kmers\n"
                "\t- one DNA sequence per line\n"
-               "\t- with also kmers' abundances.\n"
+               "\t- with also kmers' weights.\n"
                "\tFor example, it could be the de Bruijn graph topology output by BCALM.");
 
     parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").");
@@ -502,7 +503,7 @@ int main(int argc, char** argv) {
 
     {
         /* compute cover */
-        cover c(data.num_sequences, data.num_runs_abundances);
+        cover c(data.num_sequences, data.num_runs_weights);
         assert(data.nodes.size() == data.num_sequences);
         c.compute(data.nodes);
         c.save(permutation_filename);