Skip to content

Commit

Permalink
Split algorithm edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Sep 15, 2023
1 parent 3018884 commit 472286d
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 30 deletions.
66 changes: 54 additions & 12 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ struct metadata
}
};

struct length_order
{
inline bool operator() (sequence_stats const & left, sequence_stats const & right)
{
return (left.len < right.len);
}
};

struct fasta_order
{
inline bool operator() (sequence_stats const & left, sequence_stats const & right)
{
return (left.ind < right.ind);
}
};

/** !\brief a struct that represents a single segment.
*
* All indices and positions are 0-based.
Expand Down Expand Up @@ -99,6 +115,7 @@ struct metadata
sequences.push_back(seq);
fasta_ind++;
}
std::stable_sort(sequences.begin(), sequences.end(), length_order());
}

void add_segment(size_t const i, size_t const ind, size_t const s, size_t const l)
Expand All @@ -115,20 +132,32 @@ struct metadata
*/
void scan_database_sequences(size_t const n, size_t const overlap)
{
assert(n > 0);
default_seg_len = total_len / n + 1;
assert(default_seg_len > overlap);
size_t len_lower_bound = default_seg_len / 10;

// Check how many sequences are discarded for being too short
auto first_long_seq = std::find_if(sequences.begin(), sequences.end(), [&](auto const & seq){return (seq.len >= len_lower_bound);});
size_t discarded_short_sequences = first_long_seq - sequences.begin();
// But do not discard all sequences if they are of uniform size
if (first_long_seq == sequences.end())
discarded_short_sequences = 0;

if (n < (sequences.size() - discarded_short_sequences))
throw std::runtime_error("Can not split " + std::to_string(sequences.size()) + " sequences into " + std::to_string(n) + " segments.");

std::stable_sort(sequences.begin(), sequences.end(), fasta_order());

size_t remaining_db_len = total_len;
for (const auto & seq : sequences)
{
size_t start = 0;
if (seq.len < (default_seg_len / 2))
if (seq.len < len_lower_bound)
{
seqan3::debug_stream << "Sequence: " << seq.id << " is too short and will be skipped.\n";
remaining_db_len -= seq.len;
}
else if ((seq.len >= default_seg_len / 2) & (seq.len <= default_seg_len * 1.5))
else if ((seq.len >= len_lower_bound) & (seq.len <= default_seg_len * 1.5))
{
// database sequence is single segment
add_segment(segments.size(), seq.ind, start, seq.len);
Expand All @@ -142,22 +171,30 @@ struct metadata
size_t updated_seg_len = remaining_db_len / remaining_seg_count;

size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len);
size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down

// divide database sequence into multiple segments
add_segment(segments.size(), seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
while (start + actual_seg_len - overlap < seq.len)
if (segments_per_seq == 1)
add_segment(segments.size(), seq.ind, start, seq.len);
else
{
size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down

// divide database sequence into multiple segments
add_segment(segments.size(), seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
while (start + actual_seg_len - overlap < seq.len)
{
add_segment(segments.size(), seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
}
add_segment(segments.size(), seq.ind, start, seq.len - start);
}
add_segment(segments.size(), seq.ind, start, seq.len - start);

remaining_db_len -= seq.len;
}
}
assert(segments.size() == n);

if (segments.size() != n)
seqan3::debug_stream << "WARNING: Database was split into " << segments.size() <<
" instead of " << n << " segments.";
}

public:
Expand Down Expand Up @@ -252,7 +289,12 @@ struct metadata
if (sequences.size() <= ind)
throw std::runtime_error{"Sequence " + std::to_string(ind) + " index out of range."};

return segments | std::views::filter([&](segment_stats & seg) {return ind == seg.seq_ind;});
std::vector<segment_stats> sequence_segments{};
auto is_sequence_segment = [&](segment_stats & seg) {return ind == seg.seq_ind;};
for (auto & s : segments | std::views::filter(is_sequence_segment))
sequence_segments.push_back(s);

return sequence_segments;
}

/**
Expand Down
2 changes: 2 additions & 0 deletions test/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ target_use_datasources (valik_test FILES bin_5.fasta)
target_use_datasources (valik_test FILES bin_6.fasta)
target_use_datasources (valik_test FILES bin_7.fasta)

target_use_datasources (valik_test FILES query.fastq)
target_use_datasources (valik_test FILES ref.fasta)
target_use_datasources (valik_test FILES various_chromosome_lengths.fasta)
target_use_datasources (valik_test FILES 0overlap4bins.txt)
target_use_datasources (valik_test FILES 20overlap4bins.txt)
Expand Down
7 changes: 6 additions & 1 deletion test/cli/cli_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,12 +526,17 @@ struct valik_base : public cli_test
}
};

struct valik_split : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
struct valik_split_various : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
struct valik_split_short : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};
struct valik_split_long : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t>> {};

struct valik_build_clusters : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t, bool>> {};
struct valik_build_segments : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t, size_t>> {};

struct valik_search_clusters : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t, size_t,
size_t, size_t>> {};
struct valik_search_segments : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t, size_t, size_t,
size_t, size_t>> {};

struct dream_short_search : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t, size_t>> {};
struct dream_split_search : public valik_base, public testing::WithParamInterface<std::tuple<size_t, size_t, size_t>> {};
97 changes: 88 additions & 9 deletions test/cli/valik_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@
///////////////////////////////////////////////// valik split tests ///////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

TEST_P(valik_split, split)
TEST_P(valik_split_various, split_various_lengths)
{
auto const [overlap, seg_count] = GetParam();
auto const [seg_count, overlap] = GetParam();

cli_test_result const result = execute_app("valik", "split",
data("various_chromosome_lengths.fasta"),
"--overlap ", std::to_string(overlap),
"--out reference_metadata.txt",
"--seg-count ", std::to_string(seg_count),
"--out reference_metadata.txt");
"--overlap ", std::to_string(overlap));
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{"Sequence: chr5 is too short and will be skipped.\n"});
Expand All @@ -31,15 +31,94 @@ TEST_P(valik_split, split)


INSTANTIATE_TEST_SUITE_P(split_suite,
valik_split,
testing::Combine(testing::Values(0, 20), testing::Values(4, 16)),
[] (testing::TestParamInfo<valik_split::ParamType> const & info)
valik_split_various,
testing::Combine(testing::Values(4, 16), testing::Values(0, 20)),
[] (testing::TestParamInfo<valik_split_various::ParamType> const & info)
{
std::string name = std::to_string(std::get<0>(info.param)) + "_overlap_" +
std::to_string(std::get<1>(info.param)) + "_bins";
std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" +
std::to_string(std::get<1>(info.param)) + "_overlap";
return name;
});

TEST_P(valik_split_short, split_many_short)
{
auto const [seg_count, overlap] = GetParam();

cli_test_result const result = execute_app("valik", "split",
data("query.fastq"),
"--out query_metadata.txt",
"--seg-count ", std::to_string(seg_count),
"--overlap ", std::to_string(overlap));

EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});

if (seg_count > 45)
{
EXPECT_EQ(result.err, std::string{});
}
// it is not advantageous to split 30 sequences of equal length into e.g 39 segments
else
{
const std::string expected = "WARNING: Database was split into 30 instead of " + std::to_string(seg_count) + " segments.";
EXPECT_EQ(result.err, expected);
}
}

INSTANTIATE_TEST_SUITE_P(split_many_short_suite,
valik_split_short,
testing::Combine(testing::Values(31, 39, 41, 49, 51, 55, 60, 61, 71), testing::Values(0, 1, 9)),
[] (testing::TestParamInfo<valik_split_short::ParamType> const & info)
{
std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" +
std::to_string(std::get<1>(info.param)) + "_overlap";
return name;
});

TEST_P(valik_split_long, split_few_long)
{
auto const [seg_count, overlap] = GetParam();

cli_test_result const result = execute_app("valik", "split",
data("ref.fasta"),
"--out reference_metadata.txt",
"--seg-count ", std::to_string(seg_count),
"--overlap ", std::to_string(overlap));

EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{});
}

INSTANTIATE_TEST_SUITE_P(split_few_long_suite,
valik_split_long,
testing::Combine(testing::Values(3, 12, 19), testing::Values(0, 1, 9)),
[] (testing::TestParamInfo<valik_split_long::ParamType> const & info)
{
std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" +
std::to_string(std::get<1>(info.param)) + "_overlap";
return name;
});

struct split_options : public valik_base {};

TEST_F(split_options, too_few_segments)
{
size_t n = 29;
size_t o = 0;
cli_test_result const result = execute_app("valik", "split", data("query.fastq"), "--seg-count",
std::to_string(n), "--overlap", std::to_string(o),
"--out", "meta.txt");
std::string const expected
{
"[Error] Can not split 30 sequences into " + std::to_string(n) + " segments.\n"
};

EXPECT_NE(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, expected);
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////// valik build clusters /////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
8 changes: 0 additions & 8 deletions test/data/datasources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,6 @@ declare_datasource (FILE 150overlap4bins15window.ibf
declare_datasource (FILE 150overlap4bins.txt
URL ${CMAKE_SOURCE_DIR}/test/data/split/single/150overlap4bins.txt
URL_HASH SHA256=770920e15d8a49dd4cc756fea7b106a160dc2465fe9b3b6a48cb288c59f75f0b)
declare_datasource (FILE reference_metadata.txt
URL ${CMAKE_SOURCE_DIR}/test/data/split/single/reference_metadata.txt
URL_HASH SHA256=b82abe243ea2872d540d3548759997f93db2345351f76c93dd6ab40992abf1cd)


declare_datasource (FILE 0overlap16bins.txt
URL ${CMAKE_SOURCE_DIR}/test/data/split/multi/0overlap16bins.txt
Expand All @@ -63,10 +59,6 @@ declare_datasource (FILE 20overlap16bins.txt
declare_datasource (FILE 20overlap4bins.txt
URL ${CMAKE_SOURCE_DIR}/test/data/split/multi/20overlap4bins.txt
URL_HASH SHA256=c769012bdccd3a918c6e47a1e9bc6f3988d085babc591bfa5461982156cd4188)
declare_datasource (FILE chromosome_metadata.txt
URL ${CMAKE_SOURCE_DIR}/test/data/split/multi/chromosome_metadata.txt
URL_HASH SHA256=368803a8d29419321ba9704bc7cbd52abf6f7b2f528d725ed54a5ecadf5c6ae3)


declare_datasource (FILE write_out_0_16_reference_metadata.txt
URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_16/reference_metadata.txt
Expand Down

0 comments on commit 472286d

Please sign in to comment.