From 11c314048349c0e0ed996ec948d9a8b51d7c3595 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 21 Sep 2023 15:44:07 +0200 Subject: [PATCH] Split segments for index --- include/valik/shared.hpp | 9 +++++--- include/valik/split/metadata.hpp | 14 +++++------- src/argument_parsing/split.cpp | 2 +- test/cli/cli_test.hpp | 1 + test/cli/valik_test.cpp | 39 ++++++++++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 12 deletions(-) diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index 154633f8..d1615e0d 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -21,17 +21,20 @@ constexpr static uint64_t adjust_seed(uint8_t const kmer_size, uint64_t const se } /** - * @brief Function that rounds the chosen segment count to the next multiple of 64. + * @brief Function that rounds the chosen segment count to the closest multiple of 64 (never fewer than 64). * * @param n Segment count. */ constexpr static size_t adjust_bin_count(size_t const & n) { int remainder = n % 64; + if (remainder == 0) return n; - - return n + 64 - remainder; + else if (remainder < 32) + return std::max(n - remainder, (size_t) 64); // previous multiple of 64, clamped so that n < 32 still yields 64 + else + return n + 64 - remainder; // next multiple of 64 (a remainder of exactly 32 rounds up) } //!\brief Strong type for passing the window size. 
diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index 1a6df433..8a1731ee 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -169,23 +169,21 @@ struct metadata // sequences that are contained in a single segment might not have the exact segment length // dynamically update segment length to divide the rest of the remaining database as equally as possible among the chosen number of segments size_t remaining_seg_count = n - segments.size(); - size_t updated_seg_len = remaining_db_len / remaining_seg_count; - + size_t updated_seg_len = std::round((double) remaining_db_len / remaining_seg_count); size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len); if (segments_per_seq == 1) add_segment(seq.ind, start, seq.len); else { - size_t actual_seg_len = seq.len / segments_per_seq + std::ceil(overlap / 2.0f); + size_t actual_seg_len = std::ceil(((double) seq.len - overlap) / segments_per_seq); // divide database sequence into multiple segments - add_segment(seq.ind, start, actual_seg_len); - start = start + actual_seg_len - overlap; - while (start + actual_seg_len < seq.len) + add_segment(seq.ind, 0, actual_seg_len + overlap); + + for (start += actual_seg_len; start + actual_seg_len + overlap < seq.len - overlap; start += actual_seg_len) { - add_segment(seq.ind, start, actual_seg_len); - start = start + actual_seg_len - overlap; + add_segment(seq.ind, start, actual_seg_len + overlap); } add_segment(seq.ind, start, seq.len - start); } diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index 86b19b4e..b6a9ca65 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -28,7 +28,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) .long_id = "seg-count", .description = "Dividing the database into this many segments.", .validator = sharg::arithmetic_range_validator{1, 29952}}); - 
parser.add_option(arguments.split_index, + parser.add_flag(arguments.split_index, sharg::config{.short_id = '\0', .long_id = "split-index", .description = "Split a reference database before building an Interleaved Bloom Filter where the number of bins should be a multiple of 64."}); diff --git a/test/cli/cli_test.hpp b/test/cli/cli_test.hpp index 176210f7..dc5778e8 100644 --- a/test/cli/cli_test.hpp +++ b/test/cli/cli_test.hpp @@ -526,6 +526,7 @@ struct valik_base : public cli_test } }; +struct valik_split_ref : public valik_base, public testing::WithParamInterface> {}; struct valik_split_various : public valik_base, public testing::WithParamInterface> {}; struct valik_split_short : public valik_base, public testing::WithParamInterface> {}; struct valik_split_long : public valik_base, public testing::WithParamInterface> {}; diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index 0c5fe97c..2e130f55 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -6,6 +6,45 @@ #include "cli_test.hpp" +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////// valik split index bins ///////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +TEST_P(valik_split_ref, split_ref) +{ + auto const [seg_count, overlap] = GetParam(); + + cli_test_result const result = execute_app("valik", "split", + data("ref.fasta"), + "--split-index", + "--out reference_metadata.txt", + "--seg-count ", std::to_string(seg_count), + "--overlap ", std::to_string(overlap)); + + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); + + + valik::metadata meta("reference_metadata.txt"); + if (meta.seg_count < 97) + EXPECT_EQ(meta.seg_count, 64); + else + EXPECT_EQ(meta.seg_count, 128); + 
EXPECT_GE(0.1f, meta.segment_length_cv()); // create segments of roughly equal length +} + +INSTANTIATE_TEST_SUITE_P(split_ref_suite, + valik_split_ref, + testing::Combine(testing::Values(8, 63, 64, 65, 96, 97, 159), testing::Values(0, 1, 9)), + [] (testing::TestParamInfo const & info) + { + std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" + + std::to_string(std::get<1>(info.param)) + "_overlap"; + return name; + }); + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////// valik split equal length //////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////