diff --git a/include/utilities/prepare/parse_bin_paths.hpp b/include/utilities/prepare/parse_bin_paths.hpp
index 1ff38ecd..2a138bf6 100644
--- a/include/utilities/prepare/parse_bin_paths.hpp
+++ b/include/utilities/prepare/parse_bin_paths.hpp
@@ -24,12 +24,16 @@ inline auto parse_bin_paths(build_arguments const & arguments)
     }
     else
     {
-        valik::metadata meta(arguments.ref_meta_path);
         for (size_t bin{0}; bin < arguments.bins; bin++)
         {
-            std::filesystem::path file = arguments.bin_path[0][0];
-            minimiser_files.emplace_back(std::vector<std::filesystem::path>{file.replace_extension(std::to_string(bin) + ".minimiser")});
-            //seqan3::debug_stream << minimiser_files[minimiser_files.size() - 1][0] << '\n';
+            std::filesystem::path const ref_file{arguments.bin_path[0][0]};
+            std::filesystem::path file = arguments.out_dir;
+            file /= ref_file.stem();
+            file += ".";
+            file += std::to_string(bin);
+            file += ".minimiser";
+            minimiser_files.emplace_back(std::vector<std::filesystem::path>{file});
+            seqan3::debug_stream << minimiser_files[minimiser_files.size() - 1][0] << '\n';
         }
     }
 
diff --git a/include/valik/build/index_factory.hpp b/include/valik/build/index_factory.hpp
index c2d2fb6d..24e0ea32 100644
--- a/include/valik/build/index_factory.hpp
+++ b/include/valik/build/index_factory.hpp
@@ -65,7 +65,9 @@ class index_factory
             };
 
             std::vector<std::vector<std::filesystem::path>> file_paths = parse_bin_paths(*arguments);
+            seqan3::debug_stream << "Parsed bin paths\n";
             call_parallel_on_bins(minimiser_worker, file_paths, arguments->threads);
+            seqan3::debug_stream << "Called parallel on bins\n";
         }
         else if (arguments->bin_path.size() > 1)
         {
diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp
index 6c21c3af..47b14cc8 100644
--- a/include/valik/search/local_prefilter.hpp
+++ b/include/valik/search/local_prefilter.hpp
@@ -92,7 +92,7 @@ pattern_bounds make_pattern_bounds(size_t const & begin,
     size_t const minimiser_count = pattern.end_position - pattern.begin_position;
 
     pattern.threshold = thresholder.get(minimiser_count);
-
+    //seqan3::debug_stream << "threshold\t" << pattern.threshold << '\n';
     return pattern;
 }
 
diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp
index 6fc1e8f8..5041ee4d 100644
--- a/include/valik/shared.hpp
+++ b/include/valik/shared.hpp
@@ -97,7 +97,8 @@ struct build_arguments
 
     bool manual_parameters{false};
     bool input_is_minimiser{false};
-    bool kmer_count_cutoff{false};
+    uint8_t kmer_count_min_cutoff{2};
+    uint8_t kmer_count_max_cutoff{64};
     bool use_filesize_dependent_cutoff{false};
 
     std::filesystem::path ref_meta_path{};
diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp
index 3857ea97..4aa2ed99 100644
--- a/src/argument_parsing/build.cpp
+++ b/src/argument_parsing/build.cpp
@@ -66,12 +66,18 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments)
                                   .advanced = true});
 
     parser.add_subsection("Processing options");
-    parser.add_option(arguments.kmer_count_cutoff,
+    parser.add_option(arguments.kmer_count_min_cutoff,
                       sharg::config{.short_id = '\0',
-                                    .long_id = "kmer-count-cutoff",
+                                    .long_id = "kmer-count-min",
                                     .description = "Only store k-mers with at least (>=) x occurrences. "
                                                    "Mutually exclusive with --use-filesize-dependent-cutoff.",
                                     .validator = sharg::arithmetic_range_validator{1, 254}});
+    parser.add_option(arguments.kmer_count_max_cutoff,
+                      sharg::config{.short_id = '\0',
+                                    .long_id = "kmer-count-max",
+                                    .description = "Only store k-mers with no more than (<=) x occurrences. "
+                                                   "Mutually exclusive with --use-filesize-dependent-cutoff.",
+                                    .validator = sharg::arithmetic_range_validator{1, 254}});
     parser.add_flag(arguments.use_filesize_dependent_cutoff,
                     sharg::config{.short_id = '\0',
                                   .long_id = "use-filesize-dependent-cutoff",
@@ -145,7 +151,7 @@ void run_build(sharg::parser & parser)
     // ==========================================
     // Process minimiser parameters for IBF size calculation.
    // ==========================================
-    if (parser.is_option_set("kmer-count-cutoff") && parser.is_option_set("use-filesize-dependent-cutoff"))
-        throw sharg::parser_error{"You cannot use both --kmer-count-cutoff and --use-filesize-dependent-cutoff."};
+    if ((parser.is_option_set("kmer-count-min") || parser.is_option_set("kmer-count-max")) && parser.is_option_set("use-filesize-dependent-cutoff"))
+        throw sharg::parser_error{"You cannot combine --kmer-count-min or --kmer-count-max with --use-filesize-dependent-cutoff."};
 
     arguments.shape = seqan3::shape{seqan3::ungapped{arguments.kmer_size}};
 
@@ -163,6 +169,9 @@ void run_build(sharg::parser & parser)
         arguments.window_size = arguments.kmer_size;
     }
 
+    seqan3::debug_stream << "Computed minimisers\n";
+
+    /*
     try
     {
         sharg::output_file_validator{sharg::output_file_open_options::open_or_create}(arguments.out_path);
@@ -173,6 +182,7 @@ void run_build(sharg::parser & parser)
         std::cerr << "[Error] " << ext.what() << '\n';
         std::exit(-1);
     }
+    */
 
     // ==========================================
     // Find IBF size.
diff --git a/src/prepare/compute_bin_size.cpp b/src/prepare/compute_bin_size.cpp
index db5c32da..fc938139 100644
--- a/src/prepare/compute_bin_size.cpp
+++ b/src/prepare/compute_bin_size.cpp
@@ -116,7 +116,7 @@ void compute_minimiser(valik::build_arguments const & arguments)
         else
             std::ofstream outfile{progress_file, std::ios::binary};
 
-        std::unordered_set<uint64_t> distinct_minimisers{};
+        std::unordered_map<uint64_t, uint8_t> minimiser_table{};
         // The map is (re-)constructed for each segment. The alternative is to construct it once for each thread
         // and clear+reuse it for every file that a thread works on. However, this dramatically increases
         // memory consumption because the map will stay as big as needed for the biggest encountered file.
@@ -130,16 +130,19 @@ void compute_minimiser(valik::build_arguments const & arguments)
 
         for (auto && value : seq | seqan3::views::slice(seg.start, seg.start + seg.len) | hash_view())
         {
-            distinct_minimisers.insert(value);
+            minimiser_table[value] = std::min<uint8_t>(254u, minimiser_table[value] + 1);
         }
 
         uint64_t count{};
         {
             std::ofstream outfile{minimiser_file, std::ios::binary};
-            for (auto && hash : distinct_minimisers)
+            for (auto && [hash, occurrences] : minimiser_table)
             {
-                outfile.write(reinterpret_cast<const char *>(&hash), sizeof(hash));
-                ++count;
+                if (occurrences >= arguments.kmer_count_min_cutoff && occurrences <= arguments.kmer_count_max_cutoff)
+                {
+                    outfile.write(reinterpret_cast<const char *>(&hash), sizeof(hash));
+                    ++count;
+                }
             }
         }
 