From 7d8d5e1bff44594a323acfb02f0c34df6242a2fe Mon Sep 17 00:00:00 2001 From: jermp Date: Wed, 25 May 2022 16:59:05 +0200 Subject: [PATCH] count num distinct minimizers during merge --- include/builder/build.cpp | 12 +----------- include/builder/util.hpp | 9 +++++++++ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index f8e6862..999a4d8 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -30,9 +30,6 @@ void dictionary::build(std::string const& filename, build_configuration const& b throw std::runtime_error("l must be <= " + std::to_string(constants::max_l)); } - // TODO: have user input here - std::string tmp_dirname = constants::default_tmp_dirname; - m_k = build_config.k; m_m = build_config.m; m_seed = build_config.seed; @@ -77,15 +74,8 @@ void dictionary::build(std::string const& filename, build_configuration const& b { mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(), mm::advice::sequential); - - uint64_t num_minimizers = 0; - for (minimizers_tuples_iterator it(input.data(), input.data() + input.size()); - it.has_next(); it.next()) { - ++num_minimizers; - } - minimizers_tuples_iterator iterator(input.data(), input.data() + input.size()); - m_minimizers.build(iterator, num_minimizers); + m_minimizers.build(iterator, data.minimizers.num_minimizers()); input.close(); } timer.stop(); diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 73a49ef..dfc3e16 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -163,6 +163,7 @@ struct minimizers_tuples { minimizers_tuples(std::string tmp_dirname = constants::default_tmp_dirname) : m_buffer_size(0) , m_num_files_to_merge(0) + , m_num_minimizers(0) , m_run_identifier(pthash::clock_type::now().time_since_epoch().count()) , m_tmp_dirname(tmp_dirname) { m_buffer_size = ram_limit / sizeof(minimizer_tuple); @@ -263,9 +264,14 @@ struct minimizers_tuples { if (!out.is_open()) throw 
std::runtime_error("cannot open file"); uint64_t num_written_tuples = 0; + uint64_t prev_minimizer = constants::invalid; while (!idx_heap.empty()) { minimizer_tuple const* begin = iterators[idx_heap.front()].begin; out.write(reinterpret_cast<char const*>(begin), sizeof(minimizer_tuple)); + if ((*begin).minimizer != prev_minimizer) { + prev_minimizer = (*begin).minimizer; + ++m_num_minimizers; + } num_written_tuples += 1; if (num_written_tuples % 50000000 == 0) { std::cout << "num_written_tuples = " << num_written_tuples << std::endl; @@ -286,11 +292,14 @@ struct minimizers_tuples { m_num_files_to_merge = 0; // any other call to merge() will do nothing } + uint64_t num_minimizers() const { return m_num_minimizers; } + void remove_tmp_file() { std::remove(get_minimizers_filename().c_str()); } private: uint64_t m_buffer_size; uint64_t m_num_files_to_merge; + uint64_t m_num_minimizers; uint64_t m_run_identifier; std::string m_tmp_dirname; std::vector<minimizer_tuple> m_buffer;