Skip to content

Commit

Permalink
Split segments for index
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Sep 21, 2023
1 parent 975bd1d commit 11c3140
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 12 deletions.
9 changes: 6 additions & 3 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,20 @@ constexpr static uint64_t adjust_seed(uint8_t const kmer_size, uint64_t const se
}

/**
 * @brief Function that rounds the chosen segment count to the closest multiple of 64.
 *
 * A count that is already a multiple of 64 is returned unchanged. A count that lies exactly
 * halfway between two multiples is rounded up. The result is never rounded down to 0: every
 * count in [1, 63] yields 64.
 *
 * @param n Segment count.
 * @return The multiple of 64 closest to n.
 */
constexpr static size_t adjust_bin_count(size_t const & n)
{
    constexpr size_t bin_multiple = 64;
    size_t const remainder = n % bin_multiple;

    if (remainder == 0)
        return n;

    size_t const previous_multiple = n - remainder;

    // Round down only when that is strictly closer and does not leave zero bins.
    if (remainder < bin_multiple / 2 && previous_multiple >= bin_multiple)
        return previous_multiple;               // previous multiple of 64

    return previous_multiple + bin_multiple;    // next multiple of 64
}

//!\brief Strong type for passing the window size.
Expand Down
14 changes: 6 additions & 8 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,23 +169,21 @@ struct metadata
// sequences that are contained in a single segment might not have the exact segment length
// dynamically update segment length to divide the rest of the remaining database as equally as possible among the chosen number of segments
size_t remaining_seg_count = n - segments.size();
size_t updated_seg_len = remaining_db_len / remaining_seg_count;

size_t updated_seg_len = std::round((float) remaining_db_len / remaining_seg_count);
size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len);

if (segments_per_seq == 1)
add_segment(seq.ind, start, seq.len);
else
{
size_t actual_seg_len = seq.len / segments_per_seq + std::ceil(overlap / 2.0f);
size_t actual_seg_len = std::ceil(((float) seq.len - overlap) / segments_per_seq);

// divide database sequence into multiple segments
add_segment(seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
while (start + actual_seg_len < seq.len)
add_segment(seq.ind, 0, actual_seg_len + overlap);

for (start += actual_seg_len; start + actual_seg_len + overlap < seq.len - overlap; start += actual_seg_len)
{
add_segment(seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
add_segment(seq.ind, start, actual_seg_len + overlap);
}
add_segment(seq.ind, start, seq.len - start);
}
Expand Down
2 changes: 1 addition & 1 deletion src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
.long_id = "seg-count",
.description = "Dividing the database into this many segments.",
.validator = sharg::arithmetic_range_validator{1, 29952}});
parser.add_option(arguments.split_index,
parser.add_flag(arguments.split_index,
sharg::config{.short_id = '\0',
.long_id = "split-index",
.description = "Split a reference database before building an Interleaved Bloom Filter where the number of bins should be a multiple of 64."});
Expand Down
1 change: 1 addition & 0 deletions test/cli/cli_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ struct valik_base : public cli_test
}
};

// Value-parameterised fixtures for the split tests. Each derives from valik_base and takes a
// std::tuple<size_t, size_t> as its test parameter.
// valik_split_ref: the split_ref test unpacks the tuple as (seg_count, overlap).
struct valik_split_ref : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
// NOTE(review): the tuple meaning for the fixtures below is not visible here - presumably also
// (seg_count, overlap); confirm against the tests that use them.
struct valik_split_various : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
struct valik_split_short : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
struct valik_split_long : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
Expand Down
39 changes: 39 additions & 0 deletions test/cli/valik_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,45 @@

#include "cli_test.hpp"

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////// valik split index bins /////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Runs `valik split` on ref.fasta with --split-index for each (seg_count, overlap) parameter pair
// and inspects the metadata written to reference_metadata.txt.
TEST_P(valik_split_ref, split_ref)
{
auto const [seg_count, overlap] = GetParam();

cli_test_result const result = execute_app("valik", "split",
data("ref.fasta"),
"--split-index",
"--out reference_metadata.txt",
"--seg-count ", std::to_string(seg_count),
"--overlap ", std::to_string(overlap));

// The command must succeed without writing anything to stdout or stderr.
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{});


// Load the metadata file that the command was asked to write via --out.
valik::metadata meta("reference_metadata.txt");
// NOTE(review): the branch is taken on the observed meta.seg_count, not on the requested
// seg_count. As written this only asserts that a resulting count below 97 equals 64 and that
// any other resulting count equals 128 - confirm this is the intended check.
if (meta.seg_count < 97)
EXPECT_EQ(meta.seg_count, 64);
else
EXPECT_EQ(meta.seg_count, 128);
EXPECT_GE(0.1f, meta.segment_length_cv()); // create segments of roughly equal length
}

// Instantiates split_ref for every combination of the listed segment counts and overlaps.
// The name generator labels each case "<seg_count>_seg_count_<overlap>_overlap".
INSTANTIATE_TEST_SUITE_P(split_ref_suite,
                         valik_split_ref,
                         testing::Combine(testing::Values(8, 63, 64, 65, 96, 97, 159), testing::Values(0, 1, 9)),
                         // Use the ParamType of the fixture being instantiated (valik_split_ref), not that of
                         // an unrelated fixture that merely happens to share the same tuple type.
                         [] (testing::TestParamInfo<valik_split_ref::ParamType> const & info)
                         {
                             std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" +
                                                std::to_string(std::get<1>(info.param)) + "_overlap";
                             return name;
                         });


////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////// valik split equal length ////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down

0 comments on commit 11c3140

Please sign in to comment.