diff --git a/include/valik/build/index_factory.hpp b/include/valik/build/index_factory.hpp index 423846a4..d428c1d9 100644 --- a/include/valik/build/index_factory.hpp +++ b/include/valik/build/index_factory.hpp @@ -67,12 +67,10 @@ class index_factory metadata meta(arguments->ref_meta_path); auto & ibf = index.ibf(); - int i = 0; + size_t i = 0; for (auto && [seq] : sequence_file_t{arguments->bin_file}) { - // get the relevant segments for each reference - auto ref_seg = [&](metadata::segment_stats & seg) {return meta.sequences.at(i).ind == seg.seq_ind;}; - for (auto & seg : meta.segments | std::views::filter(ref_seg)) + for (auto & seg : meta.segments_from_ind(i)) { for (auto && value : seq | seqan3::views::slice(seg.start, seg.start + seg.len) | hash_view()) ibf.emplace(value, seqan3::bin_index{seg.id}); diff --git a/include/valik/search/iterate_queries.hpp b/include/valik/search/iterate_queries.hpp index 9e4ddcdf..80581b69 100644 --- a/include/valik/search/iterate_queries.hpp +++ b/include/valik/search/iterate_queries.hpp @@ -157,11 +157,9 @@ void iterate_split_queries(search_arguments const & arguments, auto query_ptr = std::make_shared(seq); - auto segments_from_id = [&](metadata::segment_stats & seg) {return seqCount == seg.seq_ind;}; - for (auto const & seg : meta.segments | std::views::filter(segments_from_id)) + for (auto const & seg : meta.segments_from_ind(seqCount)) { seqan2::Segment inf = seqan2::infixWithLength(*query_ptr, seg.start, seg.len); - std::vector seg_vec{}; for (auto & c : inf) { diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index b7b7a014..f212d02a 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -8,6 +8,7 @@ #include #include +#include namespace valik { @@ -72,195 +73,216 @@ struct metadata }; uint64_t total_len; - std::vector sequences; - size_t default_seg_len; - std::vector segments; + size_t seg_count; - /** - * @brief Function that scans over a sequence file to extract metadata. - * - * @param db_path Path to input file. - */ - void scan_database_file(std::filesystem::path const & db_path) - { - using traits_type = seqan3::sequence_file_input_default_traits_dna; - seqan3::sequence_file_input fin{db_path}; - total_len = 0; - size_t fasta_ind = 0; - for (auto & record : fin) - { - sequence_stats seq(record.id(), fasta_ind, record.sequence().size()); - total_len += seq.len; - sequences.push_back(seq); - fasta_ind++; - } - } - - void add_segment(size_t const i, size_t const ind, size_t const s, size_t const l) - { - segment_stats seg(i, ind, s, l); - segments.push_back(seg); - } + private: - /** - * @brief Function that scans over the collection of sequences and greedily assigns segment positions. - * - * @param seg_count Number of segments. - * @param overlap Length of overlap between adjacent segments. - */ - void scan_database_sequences(size_t const seg_count, size_t const overlap) - { - assert(seg_count > 0); - default_seg_len = total_len / seg_count + 1; - assert(default_seg_len > overlap); + std::vector sequences; + size_t default_seg_len; + std::vector segments; - size_t remaining_db_len = total_len; - for (const auto & seq : sequences) + /** + * @brief Function that scans over a sequence file to extract metadata. + * + * @param db_path Path to input file. + */ + void scan_database_file(std::filesystem::path const & db_path) { - size_t start = 0; - if (seq.len < (default_seg_len / 2)) - { - seqan3::debug_stream << "Sequence: " << seq.id << " is too short and will be skipped.\n"; - remaining_db_len -= seq.len; - } - else if ((seq.len >= default_seg_len / 2) & (seq.len <= default_seg_len * 1.5)) + using traits_type = seqan3::sequence_file_input_default_traits_dna; + seqan3::sequence_file_input fin{db_path}; + total_len = 0; + size_t fasta_ind = 0; + for (auto & record : fin) { - // database sequence is single segment - add_segment(segments.size(), seq.ind, start, seq.len); - remaining_db_len -= seq.len; + sequence_stats seq(record.id(), fasta_ind, record.sequence().size()); + total_len += seq.len; + sequences.push_back(seq); + fasta_ind++; } - else + } + + void add_segment(size_t const i, size_t const ind, size_t const s, size_t const l) + { + segment_stats seg(i, ind, s, l); + segments.push_back(seg); + } + + /** + * @brief Function that scans over the collection of sequences and greedily assigns segment positions. + * + * @param n Chosen number of segments. + * @param overlap Length of overlap between adjacent segments. + */ + void scan_database_sequences(size_t const n, size_t const overlap) + { + assert(n > 0); + default_seg_len = total_len / n + 1; + assert(default_seg_len > overlap); + + size_t remaining_db_len = total_len; + for (const auto & seq : sequences) { - // sequences that are contained in a single segment might not have the exact segment length - // dynamically update segment length to divide the rest of the remaining database as equally as possible among the chosen number of segments - size_t remaining_seg_count = seg_count - segments.size(); - size_t updated_seg_len = remaining_db_len / remaining_seg_count; - - size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len); - size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down - - // divide database sequence into multiple segments - add_segment(segments.size(), seq.ind, start, actual_seg_len); - start = start + actual_seg_len - overlap; - while (start + actual_seg_len - overlap < seq.len) + size_t start = 0; + if (seq.len < (default_seg_len / 2)) + { + seqan3::debug_stream << "Sequence: " << seq.id << " is too short and will be skipped.\n"; + remaining_db_len -= seq.len; + } + else if ((seq.len >= default_seg_len / 2) & (seq.len <= default_seg_len * 1.5)) + { + // database sequence is single segment + add_segment(segments.size(), seq.ind, start, seq.len); + remaining_db_len -= seq.len; + } + else { + // sequences that are contained in a single segment might not have the exact segment length + // dynamically update segment length to divide the rest of the remaining database as equally as possible among the chosen number of segments + size_t remaining_seg_count = n - segments.size(); + size_t updated_seg_len = remaining_db_len / remaining_seg_count; + + size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len); + size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down + + // divide database sequence into multiple segments add_segment(segments.size(), seq.ind, start, actual_seg_len); start = start + actual_seg_len - overlap; + while (start + actual_seg_len - overlap < seq.len) + { + add_segment(segments.size(), seq.ind, start, actual_seg_len); + start = start + actual_seg_len - overlap; + } + add_segment(segments.size(), seq.ind, start, seq.len - start); + + remaining_db_len -= seq.len; } - add_segment(segments.size(), seq.ind, start, seq.len - start); - - remaining_db_len -= seq.len; } + assert(segments.size() == n); } - assert(segments.size() == seg_count); - } - /** - * @brief Constructor that scans a sequence database to create a metadata struct. - */ - metadata(split_arguments const & arguments) - { - scan_database_file(arguments.seq_file); - scan_database_sequences(arguments.seg_count, arguments.overlap); - } - - /** - * @brief Constructor that deserializes a metadata struct from file. - */ - metadata(std::filesystem::path const & filepath) - { - std::ifstream in_file(filepath); - if (in_file.is_open()) + public: + /** + * @brief Constructor that scans a sequence database to create a metadata struct. + */ + metadata(split_arguments const & arguments) { - std::string seq_meta; - std::getline(in_file, seq_meta, '$'); - std::stringstream seq_str(seq_meta); + scan_database_file(arguments.seq_file); + scan_database_sequences(arguments.seg_count, arguments.overlap); + seg_count = segments.size(); + } - std::string seq_id; - size_t fasta_ind, length; - total_len = 0; - while (seq_str >> seq_id) + /** + * @brief Constructor that deserializes a metadata struct from file. + */ + metadata(std::filesystem::path const & filepath) + { + std::ifstream in_file(filepath); + if (in_file.is_open()) { - seq_str >> fasta_ind; - seq_str >> length; - total_len += length; - sequences.push_back(sequence_stats(seq_id, fasta_ind, length)); - } + std::string seq_meta; + std::getline(in_file, seq_meta, '$'); + std::stringstream seq_str(seq_meta); + + std::string seq_id, fasta_ind, length; + total_len = 0; + while(std::getline(seq_str, seq_id, '\t')) + { + std::getline(seq_str, fasta_ind, '\t'); + std::getline(seq_str, length, '\n'); + total_len += stoi(length); + sequences.push_back(sequence_stats(seq_id, stoi(fasta_ind), stoi(length))); + } - std::string seg_meta; - std::getline(in_file, seg_meta); // newline - std::getline(in_file, seg_meta, '$'); - std::stringstream seg_str(seg_meta); + std::string seg_meta; + std::getline(in_file, seg_meta); // newline + std::getline(in_file, seg_meta, '$'); + std::stringstream seg_str(seg_meta); - size_t id, seq_ind, start; - while (seg_str >> id) - { - seg_str >> seq_ind; - seg_str >> start; - seg_str >> length; + size_t id, seq_ind, start; + while (seg_str >> id) + { + seg_str >> seq_ind; + seg_str >> start; + seg_str >> length; - add_segment(id, seq_ind, start, length); + add_segment(id, seq_ind, start, stoi(length)); + } } - } - in_file.close(); - } + in_file.close(); + seg_count = segments.size(); + } - inline size_t ind_from_id(std::string const & string_id) const - { - auto it = std::find_if(sequences.begin(), sequences.end(), [&](const sequence_stats& seq) { return seq.id == string_id;}); - if (it == sequences.end()) - throw seqan3::validation_error{"Sequence metadata does not contain sequence from Stellar output."}; - else - return (*it).ind; - } + /** + * @brief Function that returns the numerical index of a sequence based on its fasta ID. + * + * @param string_id Fasta ID. + */ + inline size_t ind_from_id(std::string const & string_id) const + { + auto it = std::find_if(sequences.begin(), sequences.end(), [&](const sequence_stats& seq) { return seq.id == string_id;}); + if (it == sequences.end()) + throw seqan3::validation_error{"Sequence metadata does not contain sequence from Stellar output."}; + else + return (*it).ind; + } - //!TODO: isn't this just segments[id]??? - /** - * @brief Function that find the segment corresponding to a numerical ID. - * - * @param id - * @return segment - */ - segment_stats segment_from_bin(size_t const id) const - { - /* - for (auto & seg : segments) + /** + * @brief Function that returns the segment corresponding to a numerical ID. + * + * @param id + * @return segment + */ + segment_stats segment_from_bin(size_t const id) const { - if (seg.id == id) - return seg; + if (segments.size() <= id) + throw std::runtime_error{"Segment " + std::to_string(id) + " index out of range."}; + + return segments[id]; } - */ - if (segments.size() - 1 < id) - throw std::runtime_error{"Could not find segment " + std::to_string(id)}; + /** + * @brief Function that returns the slice of segments corresponding to a sequence. + * + * @param ind Index of sequence. + */ + auto segments_from_ind(size_t const ind) + { + if (sequences.size() <= ind) + throw std::runtime_error{"Sequence " + std::to_string(ind) + " index out of range."}; - return segments[id]; - } + auto is_sequence_segment = [&](segment_stats & seg) {return ind == seg.seq_ind;}; + std::vector sequence_segments{}; + for (auto seg : segments | std::views::filter(is_sequence_segment)) + { + sequence_segments.push_back(seg); + } - /** - * @brief Function that serializes the metadata struct. - * - * @param filepath Output file path. - */ - void to_file(std::filesystem::path const & filepath) - { - std::ofstream out_file; - out_file.open(filepath); + return sequence_segments; + } - for (sequence_stats const & seq : sequences) - out_file << seq.id << '\t' << seq.ind << '\t' << seq.len << '\n'; + /** + * @brief Function that serializes the metadata struct. + * + * @param filepath Output file path. + */ + void to_file(std::filesystem::path const & filepath) + { + std::ofstream out_file; + out_file.open(filepath); - out_file << "$\n"; + for (sequence_stats const & seq : sequences) + out_file << seq.id << '\t' << seq.ind << '\t' << seq.len << '\n'; - for (const auto & seg : segments) - out_file << seg.id << '\t' << seg.seq_ind << '\t' << seg.start << '\t' << seg.len << '\n'; + out_file << "$\n"; - out_file << "$\n"; - out_file.close(); - } + for (const auto & seg : segments) + out_file << seg.id << '\t' << seg.seq_ind << '\t' << seg.start << '\t' << seg.len << '\n'; + out_file << "$\n"; + out_file.close(); + } }; } // namespace valik diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp index b83924fa..22dfdf82 100644 --- a/src/argument_parsing/build.cpp +++ b/src/argument_parsing/build.cpp @@ -91,7 +91,7 @@ void run_build(sharg::parser & parser) else { metadata meta(arguments.ref_meta_path); - arguments.bins = meta.segments.size(); + arguments.bins = meta.seg_count; sequence_file_validator(arguments.bin_file); std::string bin_string{arguments.bin_file.string()}; arguments.bin_path.emplace_back(std::vector{bin_string}); diff --git a/src/split/write_seg_sequences.cpp b/src/split/write_seg_sequences.cpp index 881339c2..d8e6bcf3 100644 --- a/src/split/write_seg_sequences.cpp +++ b/src/split/write_seg_sequences.cpp @@ -18,12 +18,10 @@ void write_reference_segments(metadata & reference_metadata, std::ofstream file_paths_out; file_paths_out.open(seg_path_file); - int i = 0; + size_t i = 0; for (auto && [seq] : sequence_file_t{ref_path}) { - // get the underlying sequence - auto is_segment_sequence = [&](metadata::segment_stats & seg) {return reference_metadata.sequences.at(i).ind == seg.seq_ind;}; - for (auto & seg : reference_metadata.segments | std::views::filter(is_segment_sequence)) + for (auto & seg : reference_metadata.segments_from_ind(i)) { std::filesystem::path seg_file = ref_path; std::filesystem::path seg_stem = seg_file.stem(); @@ -55,12 +53,10 @@ void write_query_segments(metadata & query_metadata, seg_out_path.replace_extension("segments.fasta"); seqan3::sequence_file_output seg_out{seg_out_path}; - int i = 0; + size_t i = 0; for (auto && [seq] : sequence_file_t{query_path}) { - // get the underlying sequence - auto is_segment_sequence = [&](metadata::segment_stats & seg) {return query_metadata.sequences.at(i).ind == seg.seq_ind;}; - for (auto & seg : query_metadata.segments | std::views::filter(is_segment_sequence)) + for (auto & seg : query_metadata.segments_from_ind(i)) { std::string id{seg.unique_id()}; seqan3::dna4_vector seg_sequence(&seq[seg.start], &seq[seg.start+seg.len]); diff --git a/test/api/valik/split/write_seg_sequences_test.cpp b/test/api/valik/split/write_seg_sequences_test.cpp index dc581961..3fa34be7 100644 --- a/test/api/valik/split/write_seg_sequences_test.cpp +++ b/test/api/valik/split/write_seg_sequences_test.cpp @@ -40,8 +40,8 @@ static void const test_reference_out(size_t overlap, size_t bins) for (size_t i = 0; i < bins - 1; i++) { - valik::metadata::segment_stats current_seg = meta.segments[i]; - valik::metadata::segment_stats next_seg = meta.segments[i + 1]; + valik::metadata::segment_stats current_seg = meta.segment_from_bin(i); + valik::metadata::segment_stats next_seg = meta.segment_from_bin(i + 1); std::string current_seg_seq = string_from_file(data_path("database_" + std::to_string(i) + ".fasta"), std::ios::binary); std::string next_seg_seq = string_from_file(data_path("database_" + std::to_string(i + 1) + ".fasta"), std::ios::binary); @@ -105,8 +105,8 @@ static void const test_query_out(size_t overlap, size_t bins) { if (i > 1) { - valik::metadata::segment_stats previous_seg = meta.segments[i - 1]; - valik::metadata::segment_stats current_seg = meta.segments[i]; + valik::metadata::segment_stats previous_seg = meta.segment_from_bin(i - 1); + valik::metadata::segment_stats current_seg = meta.segment_from_bin(i); EXPECT_EQ(previous_seg_seq.size(), previous_seg.len); EXPECT_EQ(current_seg_seq.size(), current_seg.len); diff --git a/test/data/split/multi/chromosome_metadata.txt b/test/data/split/multi/chromosome_metadata.txt deleted file mode 100644 index a85e6d7c..00000000 --- a/test/data/split/multi/chromosome_metadata.txt +++ /dev/null @@ -1,5 +0,0 @@ -chr1 0 210 -chr2 1 490 -chr3 2 420 -chr4 3 280 -chr5 4 4 diff --git a/test/data/split/single/reference_metadata.txt b/test/data/split/single/reference_metadata.txt deleted file mode 100644 index 6a5dbd46..00000000 --- a/test/data/split/single/reference_metadata.txt +++ /dev/null @@ -1 +0,0 @@ -1 0 10240