Skip to content

Commit

Permalink
resolved conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed Mar 1, 2022
2 parents c06535b + 9c52477 commit 275e186
Show file tree
Hide file tree
Showing 12 changed files with 501 additions and 50 deletions.
37 changes: 24 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ two basic queries are supported:
- i = Lookup(g), where i is in [0,n) if the k-mer g is found in the dictionary or i = -1 otherwise;
- g = Access(i), where g is the k-mer associated to the identifier i.

If the abundances of the k-mers (their frequency counts) are also stored in the dictionary, then the dictionary is said to be *weighted* and it additionally supports:

- c = Abundance(i), where i is a given k-mer identifier.

A membership query (determine if a given k-mer is present in the dictionary or not) is, therefore, supported by means of the lookup query.
The dictionary can also stream through all k-mers of a given DNA file
(.fasta or .fastq formats) to determine their membership to the dictionary.
Expand Down Expand Up @@ -91,44 +95,47 @@ where the code was compiled (see the section [Compiling the Code](#compiling-the

to show the usage of the driver program (reported below for convenience).

Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [-o output_filename] [--check] [--bench] [--verbose]
Usage: ./build [-h,--help] input_filename k m [-s seed] [-l l] [-c c] [--canonical-parsing] [--abundances] [-o output_filename] [--check] [--bench] [--verbose]

input_filename
Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:
- without duplicate or invalid k-mers
- one DNA sequence per line.
For example, it could be the de Bruijn graph topology output by BCALM.

k
K-mer length (must be <= 31).

m
Minimizer length (must be < k).

[-s seed]
Seed for construction (default is 1).

[-l l]
An (integer) constant that controls the space/time trade-off of the dictionary. A reasonable value lies between 2 and 12 (default is 6).

[-c c]
A (floating point) constant that trades construction speed for space effectiveness of minimal perfect hashing. A reasonable value lies between 3.0 and 10.0 (default is 3.000000).

[--canonical-parsing]
Canonical parsing of k-mers. This option changes the parsing and results in a trade-off between index space and lookup time.


[--abundances]
Also store the abundances in compressed format.

[-o output_filename]
Output file name where the data structure will be serialized.

[--check]
Check correctness after construction.

[--bench]
Run benchmark after construction.

[--verbose]
Verbose output during construction.

[-h,--help]
Print this help text and silently exit.
Expand All @@ -154,6 +161,10 @@ To run a performance benchmark after construction of the index,
use:

./bench salmonella_enterica.index

To also store the abundances, use the option `--abundances`:

./build ../data/unitigs_stitched/with_abundances/salmonella_enterica_k31_ust.abundances.fa.gz 31 13 --abundances --check --verbose

### Example 2

Expand Down
Binary file not shown.
Binary file not shown.
242 changes: 242 additions & 0 deletions include/abundances.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#pragma once

#include <algorithm>  // std::sort, std::is_sorted, std::max
#include <cassert>
#include <cmath>  // std::ceil, std::log2
#include <cstdint>
#include <iostream>
#include <stdexcept>  // std::runtime_error
#include <string>     // std::to_string
#include <unordered_map>  // count the distinct abundances
#include <utility>        // std::pair
#include <vector>

#include "ef_sequence.hpp"

namespace sshash {

struct abundances {
struct builder {
/* Start with a zero most-frequent abundance; init() sets the actual value. */
builder() : m_most_frequent_abundance{0} {}

/*
    Record the abundance that is expected to be the most frequent one and
    seed both cumulative-length lists with a leading 0, so that
    push_kmer_id_interval() and push_abundance_interval() can always read .back().
*/
void init(uint64_t most_frequent_abundance) {
    constexpr uint64_t first_cumulative_length = 0;
    m_most_frequent_abundance = most_frequent_abundance;
    m_kmer_id_interval_lengths.push_back(first_cumulative_length);
    m_abundance_interval_lengths.push_back(first_cumulative_length);
}

/* Count one more occurrence of the given abundance (which must be > 0). */
void eat(uint64_t abundance) {
    assert(abundance > 0);
    /* A single lookup: insert a zero counter if the abundance is new, then bump it. */
    auto [entry, inserted] = m_abundances_map.try_emplace(abundance, 0);
    (void)inserted;
    entry->second += 1;
}

/*
    Append the kmer-id interval (value,length): its starting value is stored as is,
    its length is stored as a running (cumulative) sum.
    Relies on init() having pushed the leading 0 of the cumulative list.
*/
void push_kmer_id_interval(uint64_t value, uint64_t length) {
    const uint64_t cumulative_length = m_kmer_id_interval_lengths.back() + length;
    m_kmer_id_interval_values.push_back(value);
    m_kmer_id_interval_lengths.push_back(cumulative_length);
}

/*
    Append the abundance interval (value,length): the abundance value is stored as is,
    its run length is stored as a running (cumulative) sum.
    Relies on init() having pushed the leading 0 of the cumulative list.
*/
void push_abundance_interval(uint64_t value, uint64_t length) {
    const uint64_t cumulative_length = m_abundance_interval_lengths.back() + length;
    m_abundance_interval_values.push_back(value);
    m_abundance_interval_lengths.push_back(cumulative_length);
}

/* Number of kmer-id intervals pushed so far. */
uint64_t num_kmer_id_intervals() const {
    return m_kmer_id_interval_values.size();
}
/* Number of abundance intervals pushed so far. */
uint64_t num_abundance_intervals() const {
    return m_abundance_interval_values.size();
}

/*
    Freeze the statistics collected through eat() and the push_*_interval() calls.

    Checks that the collected frequencies sum to num_kmers, sorts the distinct
    abundances by decreasing frequency (ties broken by increasing abundance),
    fills the abundance dictionary in that order, and finally rewrites
    m_abundances_map so that it maps each abundance to its position (id)
    in the dictionary.

    Throws std::runtime_error if
    - the frequencies do not sum to num_kmers;
    - no abundance was collected at all;
    - the most frequent abundance is not the one passed to init().
*/
void finalize(uint64_t num_kmers) {
    assert(std::is_sorted(m_kmer_id_interval_values.begin(), m_kmer_id_interval_values.end()));
    assert(std::is_sorted(m_kmer_id_interval_lengths.begin(),
                          m_kmer_id_interval_lengths.end()));
    assert(std::is_sorted(m_abundance_interval_lengths.begin(),
                          m_abundance_interval_lengths.end()));

    std::cout << "num_kmer_id_intervals " << num_kmer_id_intervals() << std::endl;
    std::cout << "num_abundance_intervals " << num_abundance_intervals() << std::endl;

    const uint64_t num_distinct_abundances = m_abundances_map.size();

    std::cout << "found " << num_distinct_abundances << " distint abundances (ceil(log2("
              << num_distinct_abundances
              << ")) = " << std::ceil(std::log2(num_distinct_abundances)) << ")" << std::endl;

    /* Flatten the map into a vector of (abundance,frequency) pairs,
       tracking the total frequency and the largest abundance on the way. */
    m_abundances.reserve(num_distinct_abundances);
    uint64_t n = 0;
    uint64_t largest_ab = 0;
    for (auto const& [ab_value, frequency] : m_abundances_map) {
        largest_ab = std::max(largest_ab, ab_value);
        n += frequency;
        m_abundances.emplace_back(ab_value, frequency);
    }
    assert(largest_ab > 0);

    std::cout << "largest_ab+1 = " << largest_ab + 1 << " (ceil(log2(" << largest_ab + 1
              << ")) = " << std::ceil(std::log2(largest_ab + 1)) << ")" << std::endl;

    if (n != num_kmers) {
        std::cout << "ERROR: expected " << num_kmers << " kmers but got " << n << std::endl;
        throw std::runtime_error("file is malformed");
    }

    /* Only reachable when num_kmers == 0: without this guard the calls to
       m_abundances.front() below would be undefined behavior. */
    if (m_abundances.empty()) {
        throw std::runtime_error("no abundances were collected");
    }

    /* Most frequent abundance first; ties are broken by the smaller abundance value,
       which makes the order independent of the hash map iteration order. */
    std::sort(m_abundances.begin(), m_abundances.end(), [](auto const& x, auto const& y) {
        if (x.second != y.second) return x.second > y.second;
        return x.first < y.first;
    });

    /* If this test fails, then the value passed to init() must be changed
       (presumably constants::most_frequent_abundance -- confirm against the caller).
       The message reports the value that was actually checked. */
    if (m_most_frequent_abundance != m_abundances.front().first) {
        throw std::runtime_error("the most frequent abundance is not " +
                                 std::to_string(m_most_frequent_abundance));
    }

    const uint64_t rest = num_kmers - m_abundances.front().second;
    std::cout << "kmers that do not have the most frequent ab: " << rest << " ("
              << (rest * 100.0) / num_kmers << "%)" << std::endl;
    std::cout << "cumulative_kmer_id_interval_lengths " << m_kmer_id_interval_lengths.back()
              << '/' << rest << std::endl;
    std::cout << "cumulative_abundance_interval_lengths "
              << m_abundance_interval_lengths.back() << '/' << rest << std::endl;

    /* Each dictionary entry must be wide enough to hold largest_ab. */
    m_abundance_dictionary_builder.resize(num_distinct_abundances,
                                          std::ceil(std::log2(largest_ab + 1)));
    for (uint64_t id = 0; id != num_distinct_abundances; ++id) {
        const uint64_t ab_value = m_abundances[id].first;
        m_abundance_dictionary_builder.set(id, ab_value);
        /* From now on the map holds (abundance,id) instead of (abundance,frequency). */
        m_abundances_map[ab_value] = id;
    }
}

/*
    Encode everything collected by the builder into the given index.
    Must be called after finalize(), because it relies on m_abundances_map
    holding (abundance,id) pairs and on the abundance dictionary being filled.

    Throws std::runtime_error if an abundance interval value has no id,
    i.e., it was pushed with push_abundance_interval() but never passed to eat().
*/
void build(abundances& index) {
    /* Plain copy (not a swap): the index member may still be uninitialized,
       and the builder must keep its own value intact. */
    index.m_most_frequent_abundance = m_most_frequent_abundance;

    index.m_kmer_id_interval_values.encode(m_kmer_id_interval_values.begin(),
                                           m_kmer_id_interval_values.size());
    index.m_kmer_id_interval_lengths.encode(m_kmer_id_interval_lengths.begin(),
                                            m_kmer_id_interval_lengths.size());

    /* Abundance interval values are stored as dictionary ids:
       use at least 1 bit per id, also when there is at most one distinct abundance
       (log2 of 0 or 1 would give an unusable width). */
    const uint64_t num_distinct_abundances = m_abundance_dictionary_builder.size();
    const uint64_t bits_per_id =
        num_distinct_abundances <= 1 ? 1 : std::ceil(std::log2(num_distinct_abundances));
    pthash::compact_vector::builder abundance_interval_values;
    abundance_interval_values.resize(m_abundance_interval_values.size(), bits_per_id);
    for (uint64_t i = 0; i != m_abundance_interval_values.size(); ++i) {
        const uint64_t abundance = m_abundance_interval_values[i];
        /* Look up without inserting: operator[] would silently map an unknown
           abundance to id 0, i.e., to the most frequent abundance. */
        auto it = m_abundances_map.find(abundance);
        if (it == m_abundances_map.end()) {
            throw std::runtime_error("abundance " + std::to_string(abundance) +
                                     " has no identifier in the abundance dictionary");
        }
        const uint64_t id = it->second;
        assert(id < num_distinct_abundances);
        abundance_interval_values.set(i, id);
    }
    abundance_interval_values.build(index.m_abundance_interval_values);
    index.m_abundance_interval_lengths.encode(m_abundance_interval_lengths.begin(),
                                              m_abundance_interval_lengths.size());

    m_abundance_dictionary_builder.build(index.m_abundance_dictionary);
}

// return the average empirical entropy per abundance
/*
    Print the first 10 (abundance,frequency) pairs of m_abundances, the expected
    abundance value and the empirical entropy of the abundances.
    Return the entropy, in bits per k-mer.
*/
double print_info(uint64_t num_kmers) {
    assert(!m_abundances.empty());

    constexpr uint64_t max_printed_pairs = 10;
    double expected_ab_value = 0.0;
    double entropy_ab = 0.0;

    for (uint64_t i = 0; i != m_abundances.size(); ++i) {
        auto const& [ab, freq] = m_abundances[i];
        double prob = static_cast<double>(freq) / num_kmers;
        expected_ab_value += ab * prob;
        entropy_ab += prob * std::log2(1.0 / prob);
        if (i < max_printed_pairs) {
            std::cout << "ab:" << ab << " freq:" << freq << " (" << (freq * 100.0) / num_kmers
                      << "%)" << std::endl;
        }
    }

    std::cout << "expected_ab_value " << expected_ab_value << std::endl;
    std::cout << "entropy_ab " << entropy_ab << " [bits/kmer]" << std::endl;
    return entropy_ab;
}

private:
/* Abundance expected to be the most frequent one: set by init(), verified by finalize(). */
uint64_t m_most_frequent_abundance;

/* (abundance,frequency) pairs during construction, then (abundance,id) after sorting */
std::unordered_map<uint64_t, uint64_t> m_abundances_map;
std::vector<std::pair<uint64_t, uint64_t>> m_abundances;  // (abundance,frequency)

/* Kmer-id intervals: starting values and cumulative lengths
   (the cumulative list has a leading 0, pushed by init()). */
std::vector<uint64_t> m_kmer_id_interval_values;
std::vector<uint64_t> m_kmer_id_interval_lengths;

/* Abundance intervals: abundance values and cumulative run lengths
   (the cumulative list has a leading 0, pushed by init()). */
std::vector<uint64_t> m_abundance_interval_values;
std::vector<uint64_t> m_abundance_interval_lengths;

/* Distinct abundances in dictionary order, filled by finalize(). */
pthash::compact_vector::builder m_abundance_dictionary_builder;
};

/* True if the abundance dictionary holds no entries. */
bool empty() const {
    return m_abundance_dictionary.size() == 0;
}

/*
    Return the abundance of the k-mer with the given identifier.

    First, kmer_id is searched among the kmer-id intervals: if no interval
    contains it, the most frequent abundance is returned. Otherwise, its rank
    among all the k-mer ids covered by the intervals is computed and used to
    locate the abundance interval, whose value is an id into the abundance dictionary.

    NOTE(review): next_geq is assumed to return the (position,value) of the first
    element >= kmer_id, and prev_leq the position of the last element <= rank --
    confirm in ef_sequence.hpp.
*/
uint64_t abundance(uint64_t kmer_id) const {
    auto [pos, val] = m_kmer_id_interval_values.next_geq(kmer_id);

    uint64_t rank = 0;
    if (val == kmer_id) {
        /* kmer_id is exactly the start of the interval at position pos. */
        rank = m_kmer_id_interval_lengths.access(pos);
    } else {
        /* kmer_id can only belong to the preceding interval, if there is one. */
        if (pos == 0) return m_most_frequent_abundance;
        --pos;
        val = m_kmer_id_interval_values.access(pos);
        rank = m_kmer_id_interval_lengths.access(pos);
        const uint64_t length = m_kmer_id_interval_lengths.access(pos + 1) - rank;
        if (kmer_id >= val + length) return m_most_frequent_abundance;
        assert(kmer_id >= val);
        rank += kmer_id - val;  // offset of kmer_id within its interval
    }

    const uint64_t i = m_abundance_interval_lengths.prev_leq(rank);
    const uint64_t id = m_abundance_interval_values.access(i);
    return m_abundance_dictionary.access(id);
}

/* Total space of the data structure, in bits (sum over all its components). */
uint64_t num_bits() const {
    constexpr uint64_t bits_per_byte = 8;
    uint64_t total = sizeof(m_most_frequent_abundance) * bits_per_byte;
    total += m_kmer_id_interval_values.num_bits();
    total += m_kmer_id_interval_lengths.num_bits();
    total += m_abundance_interval_values.bytes() * bits_per_byte;
    total += m_abundance_interval_lengths.num_bits();
    total += m_abundance_dictionary.bytes() * bits_per_byte;
    return total;
}

/*
    Pass every member to the given visitor, in declaration order.
    NOTE(review): presumably used for (de)serialization, in which case the
    order of the calls defines the on-disk layout and must not change -- confirm.
*/
template <typename Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_most_frequent_abundance);
visitor.visit(m_kmer_id_interval_values);
visitor.visit(m_kmer_id_interval_lengths);
visitor.visit(m_abundance_interval_values);
visitor.visit(m_abundance_interval_lengths);
visitor.visit(m_abundance_dictionary);
}

private:
/* Abundance returned by abundance() for all kmer_ids not covered by any kmer_id interval. */
uint64_t m_most_frequent_abundance;

/*****
We model abundances as two lists of intervals.
Each interval is a pair (value,length).
- First list: kmer_ids list. In this case, a pair (value,length)
represents all kmer_ids = value, value+1, value+2, ..., value+length-1.
- Second list: abundance list. In this case, a pair (value,length)
represents that the abundance [value] repeats for [length] times.

In both lists the lengths are stored as cumulative sums with a leading 0,
and the values of the second list are ids into the abundance dictionary.

NOTE(review): the boolean template argument of ef_sequence is presumably what
enables the next_geq/prev_leq searches used by abundance() -- confirm in ef_sequence.hpp.
*/

ef_sequence<true> m_kmer_id_interval_values;
ef_sequence<false> m_kmer_id_interval_lengths;

pthash::compact_vector m_abundance_interval_values;
ef_sequence<true> m_abundance_interval_lengths;
/***/

/* Abundance dictionary, listing all distinct abundances sorted by decreasing frequency. */
pthash::compact_vector m_abundance_dictionary;
};

} // namespace sshash
Loading

0 comments on commit 275e186

Please sign in to comment.