Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/jermp/sshash
Browse files Browse the repository at this point in the history
  • Loading branch information
adamant-pwn committed Jul 1, 2024
2 parents 86e483d + 6904074 commit 2b7eeb0
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 142 deletions.
21 changes: 3 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,26 +216,11 @@ Below a comparison between the dictionary built in Example 2 (not canonical)
and the one just built (Example 3, canonical).

./sshash query -i salmonella_100.index -q ../data/queries/SRR5833294.10K.fastq.gz
index size: 10.3981 [MB] (6.36232 [bits/kmer])
==== query report:
num_kmers = 460000
num_positive_kmers = 46 (0.01%)
num_searches = 42/46 (91.3043%)
num_extensions = 4/46 (8.69565%)
elapsed = 229.159 millisec / 0.229159 sec / 0.00381932 min / 498.172 ns/kmer

./sshash query -i salmonella_100.canon.index -q ../data/queries/SRR5833294.10K.fastq.gz
index size: 11.0657 [MB] (6.77083 [bits/kmer])
==== query report:
num_kmers = 460000
num_positive_kmers = 46 (0.01%)
num_searches = 42/46 (91.3043%)
num_extensions = 4/46 (8.69565%)
elapsed = 107.911 millisec / 0.107911 sec / 0.00179852 min / 234.589 ns/kmer

We see that the canonical dictionary is twice as fast as the regular dictionary
for low-hit workloads,
even on this tiny example, for only +0.4 bits/k-mer.

The canonical dictionary can be twice as fast as the regular dictionary
for low-hit workloads, even on this tiny example, for only +0.4 bits/k-mer.

### Example 4

Expand Down
17 changes: 13 additions & 4 deletions include/buckets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,12 +256,14 @@ struct buckets {
8 * (offsets.bytes() + strings.bytes());
}

template <typename Visitor>
void visit(Visitor& visitor) const {
visit_impl(visitor, *this);
}

template <typename Visitor>
void visit(Visitor& visitor) {
visitor.visit(pieces);
visitor.visit(num_super_kmers_before_bucket);
visitor.visit(offsets);
visitor.visit(strings);
visit_impl(visitor, *this);
}

ef_sequence<true> pieces;
Expand All @@ -270,6 +272,13 @@ struct buckets {
pthash::bit_vector strings;

private:
template <typename Visitor, typename T>
static void visit_impl(Visitor& visitor, T&& t) {
visitor.visit(t.pieces);
visitor.visit(t.num_super_kmers_before_bucket);
visitor.visit(t.offsets);
visitor.visit(t.strings);
}
bool is_valid(lookup_result res) const {
return (res.contig_size != constants::invalid_uint64 and
res.kmer_id_in_contig < res.contig_size) and
Expand Down
137 changes: 67 additions & 70 deletions include/builder/build_skew_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
buckets_statistics const& buckets_stats) {
const uint64_t min_log2_size = m_skew_index.min_log2;
const uint64_t max_log2_size = m_skew_index.max_log2;
const uint64_t min_size = 1ULL << min_log2_size;

uint64_t max_num_super_kmers_in_bucket = buckets_stats.max_num_super_kmers_in_bucket();
m_skew_index.log2_max_num_super_kmers_in_bucket =
std::ceil(std::log2(buckets_stats.max_num_super_kmers_in_bucket()));

std::cout << "max_num_super_kmers_in_bucket " << max_num_super_kmers_in_bucket << std::endl;
std::cout << "max_num_super_kmers_in_bucket " << buckets_stats.max_num_super_kmers_in_bucket()
<< std::endl;
std::cout << "log2_max_num_super_kmers_in_bucket "
<< m_skew_index.log2_max_num_super_kmers_in_bucket << std::endl;

Expand All @@ -27,13 +28,13 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); it.has_next();
it.next()) {
uint64_t list_size = it.list().size();
if (list_size > (1ULL << min_log2_size)) {
if (list_size > min_size) {
num_super_kmers_in_skew_index += list_size;
++num_buckets_in_skew_index;
}
}
std::cout << "num_buckets_in_skew_index " << num_buckets_in_skew_index << "/"
<< buckets_stats.num_buckets() << "("
<< buckets_stats.num_buckets() << " ("
<< (num_buckets_in_skew_index * 100.0) / buckets_stats.num_buckets() << "%)"
<< std::endl;

Expand All @@ -49,7 +50,7 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); it.has_next();
it.next()) {
auto list = it.list();
if (list.size() > (1ULL << min_log2_size)) {
if (list.size() > min_size) {
minimizer_tuple const* begin = lists_tuples.data() + lists_tuples.size();
std::copy(list.begin_ptr(), list.end_ptr(), std::back_inserter(lists_tuples));
minimizer_tuple const* end = lists_tuples.data() + lists_tuples.size();
Expand All @@ -73,52 +74,40 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
m_skew_index.positions.resize(num_partitions);

{
std::cout << "computing partitions..." << std::endl;
std::cout << "computing sizes of partitions..." << std::endl;

uint64_t partition_id = 0;
uint64_t lower = 1ULL << min_log2_size;
uint64_t lower = min_size;
uint64_t upper = 2 * lower;
uint64_t num_kmers_in_skew_index = 0;
for (uint64_t i = 0; i != lists.size() + 1; ++i) {
if (i == lists.size() or lists[i].size() > upper) {
std::cout << "num_kmers belonging to buckets of size > " << lower
for (uint64_t i = 0; i <= lists.size(); ++i) {
while (i == lists.size() or lists[i].size() > upper) {
std::cout << " partition_id = " << partition_id
<< ": num_kmers belonging to buckets of size > " << lower
<< " and <= " << upper << ": " << num_kmers_in_partition[partition_id]
<< std::endl;
if (num_kmers_in_partition[partition_id] == 0) {
std::cout << "==> Empty bucket detected:\n";
std::cout << "there is no k-mer that belongs to a list of size > " << lower
<< " and <= " << upper << std::endl;
throw empty_bucket_runtime_error();
}
num_kmers_in_skew_index += num_kmers_in_partition[partition_id];
partition_id += 1;

if (i == lists.size()) break;

lower = upper;
upper = 2 * lower;
if (partition_id == num_partitions - 1) upper = max_num_super_kmers_in_bucket;

/*
If still larger than upper, then there is an empty bucket
and we should try different parameters.
*/
if (lists[i].size() > upper) {
std::cout << "==> Empty bucket detected:\n";
std::cout << "there is no list of size > " << lower << " and <= " << upper
<< std::endl;
throw empty_bucket_runtime_error();
partition_id += 1;
if (partition_id == num_partitions - 1) {
upper = buckets_stats.max_num_super_kmers_in_bucket();
}
}

if (i == lists.size()) break;

assert(lists[i].size() > lower and lists[i].size() <= upper);
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) {
(void)offset; // unused
num_kmers_in_partition[partition_id] += num_kmers_in_super_kmer;
}
}
assert(partition_id == num_partitions);
std::cout << "num_kmers_in_skew_index " << num_kmers_in_skew_index << "("
assert(partition_id == num_partitions - 1);
std::cout << "num_kmers_in_skew_index " << num_kmers_in_skew_index << " ("
<< (num_kmers_in_skew_index * 100.0) / buckets_stats.num_kmers() << "%)"
<< std::endl;
assert(num_kmers_in_skew_index == std::accumulate(num_kmers_in_partition.begin(),
Expand All @@ -127,6 +116,8 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
}

{
std::cout << "building partitions..." << std::endl;

pthash::build_configuration mphf_config;
mphf_config.c = build_config.c;
mphf_config.alpha = 0.94;
Expand All @@ -137,7 +128,7 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
mphf_config.num_partitions = 4 * mphf_config.num_threads;

uint64_t partition_id = 0;
uint64_t lower = 1ULL << min_log2_size;
uint64_t lower = min_size;
uint64_t upper = 2 * lower;
uint64_t num_bits_per_pos = min_log2_size + 1;

Expand All @@ -150,59 +141,63 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos);
/*******/

for (uint64_t i = 0; i != lists.size() + 1; ++i) {
if (i == lists.size() or lists[i].size() > upper) {
std::cout << "lower " << lower << "; upper " << upper << "; num_bits_per_pos "
for (uint64_t i = 0; i <= lists.size(); ++i) {
while (i == lists.size() or lists[i].size() > upper) {
std::cout << " lower " << lower << "; upper " << upper << "; num_bits_per_pos "
<< num_bits_per_pos << "; keys_in_partition.size() "
<< keys_in_partition.size() << std::endl;

auto& mphf = m_skew_index.mphfs[partition_id];
assert(num_kmers_in_partition[partition_id] == keys_in_partition.size());
assert(num_kmers_in_partition[partition_id] == super_kmer_ids_in_partition.size());

if (keys_in_partition.size() / mphf_config.num_partitions <
pthash::constants::min_partition_size) {
mphf_config.num_partitions = std::max<uint64_t>(
1, keys_in_partition.size() / (2 * pthash::constants::min_partition_size));
}

if (build_config.verbose) {
std::cout << " building minimizers MPHF with " << mphf_config.num_threads
<< " threads and " << mphf_config.num_partitions << " partitions..."
if (num_kmers_in_partition[partition_id] > 0) //
{
if (keys_in_partition.size() / mphf_config.num_partitions <
pthash::constants::min_partition_size) {
mphf_config.num_partitions =
std::max<uint64_t>(1, keys_in_partition.size() /
(2 * pthash::constants::min_partition_size));
}

if (build_config.verbose) {
std::cout << " building MPHF with " << mphf_config.num_threads
<< " threads and " << mphf_config.num_partitions
<< " partitions..." << std::endl;
}

auto& mphf = m_skew_index.mphfs[partition_id];
mphf.build_in_internal_memory(keys_in_partition.begin(),
keys_in_partition.size(), mphf_config);

mphf_config.num_partitions =
4 * mphf_config.num_threads; // restore default value

std::cout << " built mphs[" << partition_id << "] for "
<< keys_in_partition.size() << " keys; bits/key = "
<< static_cast<double>(mphf.num_bits()) / mphf.num_keys()
<< std::endl;
}

mphf.build_in_internal_memory(keys_in_partition.begin(), keys_in_partition.size(),
mphf_config);

mphf_config.num_partitions = 4 * mphf_config.num_threads; // restore default value

std::cout << " built mphs[" << partition_id << "] for " << keys_in_partition.size()
<< " keys; bits/key = "
<< static_cast<double>(mphf.num_bits()) / mphf.num_keys() << std::endl;

for (uint64_t i = 0; i != keys_in_partition.size(); ++i) {
kmer_t kmer = keys_in_partition[i];
uint64_t pos = mphf(kmer);
uint32_t super_kmer_id = super_kmer_ids_in_partition[i];
cvb_positions.set(pos, super_kmer_id);
for (uint64_t i = 0; i != keys_in_partition.size(); ++i) {
kmer_t kmer = keys_in_partition[i];
uint64_t pos = mphf(kmer);
uint32_t super_kmer_id = super_kmer_ids_in_partition[i];
cvb_positions.set(pos, super_kmer_id);
}
auto& positions = m_skew_index.positions[partition_id];
cvb_positions.build(positions);

std::cout << " built positions[" << partition_id << "] for "
<< positions.size() << " keys; bits/key = "
<< (positions.bytes() * 8.0) / positions.size() << std::endl;
}
auto& positions = m_skew_index.positions[partition_id];
cvb_positions.build(positions);

std::cout << " built positions[" << partition_id << "] for " << positions.size()
<< " keys; bits/key = " << (positions.bytes() * 8.0) / positions.size()
<< std::endl;

partition_id += 1;

if (i == lists.size()) break;

lower = upper;
upper = 2 * lower;
num_bits_per_pos += 1;
partition_id += 1;
if (partition_id == num_partitions - 1) {
upper = max_num_super_kmers_in_bucket;
upper = buckets_stats.max_num_super_kmers_in_bucket();
num_bits_per_pos = m_skew_index.log2_max_num_super_kmers_in_bucket;
}

Expand All @@ -213,6 +208,8 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
cvb_positions.resize(num_kmers_in_partition[partition_id], num_bits_per_pos);
}

if (i == lists.size()) break;

assert(lists[i].size() > lower and lists[i].size() <= upper);
uint64_t super_kmer_id = 0;
for (auto [offset, num_kmers_in_super_kmer] : lists[i]) {
Expand All @@ -228,7 +225,7 @@ void build_skew_index(skew_index<kmer_t>& m_skew_index, parse_data<kmer_t>& data
++super_kmer_id;
}
}
assert(partition_id == num_partitions);
assert(partition_id == num_partitions - 1);
}

std::cout << "num_bits_for_skew_index " << m_skew_index.num_bits() << "("
Expand Down
5 changes: 0 additions & 5 deletions include/builder/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ namespace sshash {
<< (time * 1000) / num_kmers << " [ns/kmer])" << std::endl;
}

struct empty_bucket_runtime_error : public std::runtime_error {
empty_bucket_runtime_error()
: std::runtime_error("try a different choice of l or change seed") {}
};

struct parse_runtime_error : public std::runtime_error {
parse_runtime_error() : std::runtime_error("did you provide an input file with weights?") {}
};
Expand Down
28 changes: 19 additions & 9 deletions include/dictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,20 +127,30 @@ struct dictionary {
void print_space_breakdown() const;
void compute_statistics() const;

template <typename Visitor>
void visit(Visitor& visitor) const {
visit_impl(visitor, *this);
}

template <typename Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_size);
visitor.visit(m_seed);
visitor.visit(m_k);
visitor.visit(m_m);
visitor.visit(m_canonical_parsing);
visitor.visit(m_minimizers);
visitor.visit(m_buckets);
visitor.visit(m_skew_index);
visitor.visit(m_weights);
visit_impl(visitor, *this);
}

private:
template <typename Visitor, typename T>
static void visit_impl(Visitor& visitor, T&& t) {
visitor.visit(t.m_size);
visitor.visit(t.m_seed);
visitor.visit(t.m_k);
visitor.visit(t.m_m);
visitor.visit(t.m_canonical_parsing);
visitor.visit(t.m_minimizers);
visitor.visit(t.m_buckets);
visitor.visit(t.m_skew_index);
visitor.visit(t.m_weights);
}

uint64_t m_size;
uint64_t m_seed;
uint16_t m_k;
Expand Down
19 changes: 14 additions & 5 deletions include/ef_sequence.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,16 +186,25 @@ struct ef_sequence {
m_high_bits_d0.bytes() + m_low_bits.bytes());
}

template <typename Visitor>
void visit(Visitor& visitor) const {
visit_impl(visitor, *this);
}

template <typename Visitor>
void visit(Visitor& visitor) {
visitor.visit(m_universe);
visitor.visit(m_high_bits);
visitor.visit(m_high_bits_d1);
visitor.visit(m_high_bits_d0);
visitor.visit(m_low_bits);
visit_impl(visitor, *this);
}

private:
template <typename Visitor, typename T>
static void visit_impl(Visitor& visitor, T&& t) {
visitor.visit(t.m_universe);
visitor.visit(t.m_high_bits);
visitor.visit(t.m_high_bits_d1);
visitor.visit(t.m_high_bits_d0);
visitor.visit(t.m_low_bits);
}
uint64_t m_universe;
pthash::bit_vector m_high_bits;
pthash::darray1 m_high_bits_d1;
Expand Down
Loading

0 comments on commit 2b7eeb0

Please sign in to comment.