diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index 62c657c0..4ea2adde 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -42,6 +42,22 @@ struct metadata } }; + struct length_order + { + inline bool operator() (sequence_stats const & left, sequence_stats const & right) + { + return (left.len < right.len); + } + }; + + struct fasta_order + { + inline bool operator() (sequence_stats const & left, sequence_stats const & right) + { + return (left.ind < right.ind); + } + }; + /** !\brief a struct that represents a single segment. * * All indices and positions are 0-based. @@ -99,6 +115,7 @@ struct metadata sequences.push_back(seq); fasta_ind++; } + std::stable_sort(sequences.begin(), sequences.end(), length_order()); } void add_segment(size_t const i, size_t const ind, size_t const s, size_t const l) @@ -115,20 +132,32 @@ struct metadata */ void scan_database_sequences(size_t const n, size_t const overlap) { - assert(n > 0); default_seg_len = total_len / n + 1; assert(default_seg_len > overlap); + size_t len_lower_bound = default_seg_len / 10; + + // Check how many sequences are discarded for being too short + auto first_long_seq = std::find_if(sequences.begin(), sequences.end(), [&](auto const & seq){return (seq.len >= len_lower_bound);}); + size_t discarded_short_sequences = first_long_seq - sequences.begin(); + // But do not discard all sequences if they are of uniform size + if (first_long_seq == sequences.end()) + discarded_short_sequences = 0; + + if (n < (sequences.size() - discarded_short_sequences)) + throw std::runtime_error("Can not split " + std::to_string(sequences.size()) + " sequences into " + std::to_string(n) + " segments."); + + std::stable_sort(sequences.begin(), sequences.end(), fasta_order()); size_t remaining_db_len = total_len; for (const auto & seq : sequences) { size_t start = 0; - if (seq.len < (default_seg_len / 2)) + if (seq.len < len_lower_bound) { seqan3::debug_stream << "Sequence: " << seq.id << " is too short and will be skipped.\n"; remaining_db_len -= seq.len; } - else if ((seq.len >= default_seg_len / 2) & (seq.len <= default_seg_len * 1.5)) + else if ((seq.len >= len_lower_bound) & (seq.len <= default_seg_len * 1.5)) { // database sequence is single segment add_segment(segments.size(), seq.ind, start, seq.len); @@ -142,22 +171,30 @@ struct metadata size_t updated_seg_len = remaining_db_len / remaining_seg_count; size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len); - size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down - // divide database sequence into multiple segments - add_segment(segments.size(), seq.ind, start, actual_seg_len); - start = start + actual_seg_len - overlap; - while (start + actual_seg_len - overlap < seq.len) + if (segments_per_seq == 1) + add_segment(segments.size(), seq.ind, start, seq.len); + else { + size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down + + // divide database sequence into multiple segments add_segment(segments.size(), seq.ind, start, actual_seg_len); start = start + actual_seg_len - overlap; + while (start + actual_seg_len - overlap < seq.len) + { + add_segment(segments.size(), seq.ind, start, actual_seg_len); + start = start + actual_seg_len - overlap; + } + add_segment(segments.size(), seq.ind, start, seq.len - start); } - add_segment(segments.size(), seq.ind, start, seq.len - start); - remaining_db_len -= seq.len; } } - assert(segments.size() == n); + + if (segments.size() != n) + seqan3::debug_stream << "WARNING: Database was split into " << segments.size() << + " instead of " << n << " segments."; } public: @@ -252,7 +289,12 @@ struct metadata if (sequences.size() <= ind) throw std::runtime_error{"Sequence " + std::to_string(ind) + " index out of range."}; - return segments | std::views::filter([&](segment_stats & seg) {return ind == seg.seq_ind;}); + std::vector sequence_segments{}; + auto is_sequence_segment = [&](segment_stats & seg) {return ind == seg.seq_ind;}; + for (auto & s : segments | std::views::filter(is_sequence_segment)) + sequence_segments.push_back(s); + + return sequence_segments; } /** diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt index 094ab345..22e1bf03 100644 --- a/test/cli/CMakeLists.txt +++ b/test/cli/CMakeLists.txt @@ -27,6 +27,8 @@ target_use_datasources (valik_test FILES bin_5.fasta) target_use_datasources (valik_test FILES bin_6.fasta) target_use_datasources (valik_test FILES bin_7.fasta) +target_use_datasources (valik_test FILES query.fastq) +target_use_datasources (valik_test FILES ref.fasta) target_use_datasources (valik_test FILES various_chromosome_lengths.fasta) target_use_datasources (valik_test FILES 0overlap4bins.txt) target_use_datasources (valik_test FILES 20overlap4bins.txt) diff --git a/test/cli/cli_test.hpp b/test/cli/cli_test.hpp index 91ed40cd..176210f7 100644 --- a/test/cli/cli_test.hpp +++ b/test/cli/cli_test.hpp @@ -526,12 +526,17 @@ struct valik_base : public cli_test } }; -struct valik_split : public valik_base, public testing::WithParamInterface> {}; +struct valik_split_various : public valik_base, public testing::WithParamInterface> {}; +struct valik_split_short : public valik_base, public testing::WithParamInterface> {}; +struct valik_split_long : public valik_base, public testing::WithParamInterface> {}; + struct valik_build_clusters : public valik_base, public testing::WithParamInterface> {}; struct valik_build_segments : public valik_base, public testing::WithParamInterface> {}; + struct valik_search_clusters : public valik_base, public testing::WithParamInterface> {}; struct valik_search_segments : public valik_base, public testing::WithParamInterface> {}; + struct dream_short_search : public valik_base, public testing::WithParamInterface> {}; struct dream_split_search : public valik_base, public testing::WithParamInterface> {}; diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index 6d089251..ac766fe4 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -10,15 +10,15 @@ ///////////////////////////////////////////////// valik split tests /////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -TEST_P(valik_split, split) +TEST_P(valik_split_various, split_various_lengths) { - auto const [overlap, seg_count] = GetParam(); + auto const [seg_count, overlap] = GetParam(); cli_test_result const result = execute_app("valik", "split", data("various_chromosome_lengths.fasta"), - "--overlap ", std::to_string(overlap), + "--out reference_metadata.txt", "--seg-count ", std::to_string(seg_count), - "--out reference_metadata.txt"); + "--overlap ", std::to_string(overlap)); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{"Sequence: chr5 is too short and will be skipped.\n"}); @@ -31,15 +31,94 @@ TEST_P(valik_split, split) INSTANTIATE_TEST_SUITE_P(split_suite, - valik_split, - testing::Combine(testing::Values(0, 20), testing::Values(4, 16)), - [] (testing::TestParamInfo const & info) + valik_split_various, + testing::Combine(testing::Values(4, 16), testing::Values(0, 20)), + [] (testing::TestParamInfo const & info) { - std::string name = std::to_string(std::get<0>(info.param)) + "_overlap_" + - std::to_string(std::get<1>(info.param)) + "_bins"; + std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" + + std::to_string(std::get<1>(info.param)) + "_overlap"; + return name; + }); + +TEST_P(valik_split_short, split_many_short) +{ + auto const [seg_count, overlap] = GetParam(); + + cli_test_result const result = execute_app("valik", "split", + data("query.fastq"), + "--out query_metadata.txt", + "--seg-count ", std::to_string(seg_count), + "--overlap ", std::to_string(overlap)); + + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + + if (seg_count > 45) + { + EXPECT_EQ(result.err, std::string{}); + } + // it is not advantageous to split 30 sequences of equal length into e.g 39 segments + else + { + const std::string expected = "WARNING: Database was split into 30 instead of " + std::to_string(seg_count) + " segments."; + EXPECT_EQ(result.err, expected); + } +} + +INSTANTIATE_TEST_SUITE_P(split_many_short_suite, + valik_split_short, + testing::Combine(testing::Values(31, 39, 41, 49, 51, 55, 60, 61, 71), testing::Values(0, 1, 9)), + [] (testing::TestParamInfo const & info) + { + std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" + + std::to_string(std::get<1>(info.param)) + "_overlap"; + return name; + }); + +TEST_P(valik_split_long, split_few_long) +{ + auto const [seg_count, overlap] = GetParam(); + + cli_test_result const result = execute_app("valik", "split", + data("ref.fasta"), + "--out reference_metadata.txt", + "--seg-count ", std::to_string(seg_count), + "--overlap ", std::to_string(overlap)); + + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +INSTANTIATE_TEST_SUITE_P(split_few_long_suite, + valik_split_long, + testing::Combine(testing::Values(3, 12, 19), testing::Values(0, 1, 9)), + [] (testing::TestParamInfo const & info) + { + std::string name = std::to_string(std::get<0>(info.param)) + "_seg_count_" + + std::to_string(std::get<1>(info.param)) + "_overlap"; return name; }); +struct split_options : public valik_base {}; + +TEST_F(split_options, too_few_segments) +{ + size_t n = 29; + size_t o = 0; + cli_test_result const result = execute_app("valik", "split", data("query.fastq"), "--seg-count", + std::to_string(n), "--overlap", std::to_string(o), + "--out", "meta.txt"); + std::string const expected + { + "[Error] Can not split 30 sequences into " + std::to_string(n) + " segments.\n" + }; + + EXPECT_NE(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, expected); +} + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////// valik build clusters ///////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index 3872c8ab..3f1ad551 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -46,10 +46,6 @@ declare_datasource (FILE 150overlap4bins15window.ibf declare_datasource (FILE 150overlap4bins.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/single/150overlap4bins.txt URL_HASH SHA256=770920e15d8a49dd4cc756fea7b106a160dc2465fe9b3b6a48cb288c59f75f0b) -declare_datasource (FILE reference_metadata.txt - URL ${CMAKE_SOURCE_DIR}/test/data/split/single/reference_metadata.txt - URL_HASH SHA256=b82abe243ea2872d540d3548759997f93db2345351f76c93dd6ab40992abf1cd) - declare_datasource (FILE 0overlap16bins.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/multi/0overlap16bins.txt @@ -63,10 +59,6 @@ declare_datasource (FILE 20overlap16bins.txt declare_datasource (FILE 20overlap4bins.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/multi/20overlap4bins.txt URL_HASH SHA256=c769012bdccd3a918c6e47a1e9bc6f3988d085babc591bfa5461982156cd4188) -declare_datasource (FILE chromosome_metadata.txt - URL ${CMAKE_SOURCE_DIR}/test/data/split/multi/chromosome_metadata.txt - URL_HASH SHA256=368803a8d29419321ba9704bc7cbd52abf6f7b2f528d725ed54a5ecadf5c6ae3) - declare_datasource (FILE write_out_0_16_reference_metadata.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_16/reference_metadata.txt