Skip to content

Commit

Permalink
count num distinct minimizers during merge
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed May 25, 2022
1 parent 65b523e commit 7d8d5e1
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 11 deletions.
12 changes: 1 addition & 11 deletions include/builder/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ void dictionary::build(std::string const& filename, build_configuration const& b
throw std::runtime_error("l must be <= " + std::to_string(constants::max_l));
}

// TODO: have user input here
std::string tmp_dirname = constants::default_tmp_dirname;

m_k = build_config.k;
m_m = build_config.m;
m_seed = build_config.seed;
Expand Down Expand Up @@ -77,15 +74,8 @@ void dictionary::build(std::string const& filename, build_configuration const& b
{
mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
mm::advice::sequential);

uint64_t num_minimizers = 0;
for (minimizers_tuples_iterator it(input.data(), input.data() + input.size());
it.has_next(); it.next()) {
++num_minimizers;
}

minimizers_tuples_iterator iterator(input.data(), input.data() + input.size());
m_minimizers.build(iterator, num_minimizers);
m_minimizers.build(iterator, data.minimizers.num_minimizers());
input.close();
}
timer.stop();
Expand Down
9 changes: 9 additions & 0 deletions include/builder/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ struct minimizers_tuples {
minimizers_tuples(std::string tmp_dirname = constants::default_tmp_dirname)
: m_buffer_size(0)
, m_num_files_to_merge(0)
, m_num_minimizers(0)
, m_run_identifier(pthash::clock_type::now().time_since_epoch().count())
, m_tmp_dirname(tmp_dirname) {
m_buffer_size = ram_limit / sizeof(minimizer_tuple);
Expand Down Expand Up @@ -263,9 +264,14 @@ struct minimizers_tuples {
if (!out.is_open()) throw std::runtime_error("cannot open file");

uint64_t num_written_tuples = 0;
uint64_t prev_minimizer = constants::invalid;
while (!idx_heap.empty()) {
minimizer_tuple const* begin = iterators[idx_heap.front()].begin;
out.write(reinterpret_cast<char const*>(begin), sizeof(minimizer_tuple));
if ((*begin).minimizer != prev_minimizer) {
prev_minimizer = (*begin).minimizer;
++m_num_minimizers;
}
num_written_tuples += 1;
if (num_written_tuples % 50000000 == 0) {
std::cout << "num_written_tuples = " << num_written_tuples << std::endl;
Expand All @@ -286,11 +292,14 @@ struct minimizers_tuples {
m_num_files_to_merge = 0; // any other call to merge() will do nothing
}

// Accessor for m_num_minimizers, the counter that the merge loop bumps
// each time the minimizer of the tuple being written differs from the
// minimizer of the previously written tuple.
uint64_t num_minimizers() const {
    return m_num_minimizers;
}

// Deletes the file whose path is returned by get_minimizers_filename().
// The status code returned by std::remove is discarded.
void remove_tmp_file() {
    auto const& tmp_path = get_minimizers_filename();
    std::remove(tmp_path.c_str());
}

private:
uint64_t m_buffer_size;
uint64_t m_num_files_to_merge;
uint64_t m_num_minimizers;
uint64_t m_run_identifier;
std::string m_tmp_dirname;
std::vector<minimizer_tuple> m_buffer;
Expand Down

0 comments on commit 7d8d5e1

Please sign in to comment.