Skip to content

Commit

Permalink
Make IBF from minimisers
Browse files: browse the repository at this point in the history
  • Loading branch information
eaasna committed May 4, 2024
1 parent b921738 commit 32d3bf2
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 14 deletions.
12 changes: 8 additions & 4 deletions include/utilities/prepare/parse_bin_paths.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,16 @@ inline auto parse_bin_paths(build_arguments const & arguments)
}
else
{
valik::metadata meta(arguments.ref_meta_path);
for (size_t bin{0}; bin < arguments.bins; bin++)
{
std::filesystem::path file = arguments.bin_path[0][0];
minimiser_files.emplace_back((std::vector<std::string>){file.replace_extension(std::to_string(bin) + ".minimiser")});
//seqan3::debug_stream << minimiser_files[minimiser_files.size() - 1][0] << '\n';
std::filesystem::path const ref_file{arguments.bin_path[0][0]};
std::filesystem::path file = arguments.out_dir;
file /= ref_file.stem();
file += ".";
file += std::to_string(bin);
file += ".minimiser";
minimiser_files.emplace_back((std::vector<std::string>){file});
seqan3::debug_stream << minimiser_files[minimiser_files.size() - 1][0] << '\n';
}
}

Expand Down
2 changes: 2 additions & 0 deletions include/valik/build/index_factory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ class index_factory
};

std::vector<std::vector<std::string>> file_paths = parse_bin_paths(*arguments);
seqan3::debug_stream << "Parsed bin paths\n";
call_parallel_on_bins(minimiser_worker, file_paths, arguments->threads);
seqan3::debug_stream << "Called parallel on bins\n";
}
else if (arguments->bin_path.size() > 1)
{
Expand Down
2 changes: 1 addition & 1 deletion include/valik/search/local_prefilter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ pattern_bounds make_pattern_bounds(size_t const & begin,
size_t const minimiser_count = pattern.end_position - pattern.begin_position;

pattern.threshold = thresholder.get(minimiser_count);

//seqan3::debug_stream << "threshold\t" << pattern.threshold << '\n';
return pattern;
}

Expand Down
3 changes: 2 additions & 1 deletion include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ struct build_arguments
bool manual_parameters{false};
bool input_is_minimiser{false};

bool kmer_count_cutoff{false};
uint8_t kmer_count_min_cutoff{2};
uint8_t kmer_count_max_cutoff{64};
bool use_filesize_dependent_cutoff{false};

std::filesystem::path ref_meta_path{};
Expand Down
16 changes: 13 additions & 3 deletions src/argument_parsing/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,18 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments)
.advanced = true});

parser.add_subsection("Processing options");
parser.add_option(arguments.kmer_count_cutoff,
parser.add_option(arguments.kmer_count_min_cutoff,
sharg::config{.short_id = '\0',
.long_id = "kmer-count-cutoff",
.long_id = "kmer-count-min",
.description = "Only store k-mers with at least (>=) x occurrences. "
"Mutually exclusive with --use-filesize-dependent-cutoff.",
.validator = sharg::arithmetic_range_validator{1, 254}});
parser.add_option(arguments.kmer_count_max_cutoff,
sharg::config{.short_id = '\0',
.long_id = "kmer-count-max",
.description = "Only store k-mers with no more than (<=) x occurrences. "
"Mutually exclusive with --use-filesize-dependent-cutoff.",
.validator = sharg::arithmetic_range_validator{1, 254}});
parser.add_flag(arguments.use_filesize_dependent_cutoff,
sharg::config{.short_id = '\0',
.long_id = "use-filesize-dependent-cutoff",
Expand Down Expand Up @@ -145,7 +151,7 @@ void run_build(sharg::parser & parser)
// ==========================================
// Process minimiser parameters for IBF size calculation.
// ==========================================
if (parser.is_option_set("kmer-count-cutoff") && parser.is_option_set("use-filesize-dependent-cutoff"))
if ((parser.is_option_set("kmer-count-min") || parser.is_option_set("kmer-count-max")) && parser.is_option_set("use-filesize-dependent-cutoff"))
throw sharg::parser_error{"You cannot use both --kmer-count-cutoff and --use-filesize-dependent-cutoff."};

arguments.shape = seqan3::shape{seqan3::ungapped{arguments.kmer_size}};
Expand All @@ -163,6 +169,9 @@ void run_build(sharg::parser & parser)
arguments.window_size = arguments.kmer_size;
}

seqan3::debug_stream << "Computed minimisers\n";

/*
try
{
sharg::output_file_validator{sharg::output_file_open_options::open_or_create}(arguments.out_path);
Expand All @@ -173,6 +182,7 @@ void run_build(sharg::parser & parser)
std::cerr << "[Error] " << ext.what() << '\n';
std::exit(-1);
}
*/

// ==========================================
// Find IBF size.
Expand Down
13 changes: 8 additions & 5 deletions src/prepare/compute_bin_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ void compute_minimiser(valik::build_arguments const & arguments)
else
std::ofstream outfile{progress_file, std::ios::binary};

std::unordered_set<uint64_t> distinct_minimisers{};
std::unordered_map<uint64_t, uint8_t> minimiser_table{};
// The map is (re-)constructed for each segment. The alternative is to construct it once for each thread
// and clear+reuse it for every file that a thread works on. However, this dramatically increases
// memory consumption because the map will stay as big as needed for the biggest encountered file.
Expand All @@ -130,16 +130,19 @@ void compute_minimiser(valik::build_arguments const & arguments)

for (auto && value : seq | seqan3::views::slice(seg.start, seg.start + seg.len) | hash_view())
{
distinct_minimisers.insert(value);
minimiser_table[value] = std::min<uint8_t>(254u, minimiser_table[value] + 1);
}

uint64_t count{};
{
std::ofstream outfile{minimiser_file, std::ios::binary};
for (auto && hash : distinct_minimisers)
for (auto && [hash, occurrences] : minimiser_table)
{
outfile.write(reinterpret_cast<const char *>(&hash), sizeof(hash));
++count;
if (occurrences > arguments.kmer_count_min_cutoff && occurrences < arguments.kmer_count_max_cutoff)
{
outfile.write(reinterpret_cast<const char *>(&hash), sizeof(hash));
++count;
}
}
}

Expand Down

0 comments on commit 32d3bf2

Please sign in to comment.