From 8d346a790fd6e306daaad42c95ce293d17ab16aa Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 13 Jul 2023 14:36:24 +0200 Subject: [PATCH 01/11] Shared memory mode: pass query sequences in memory --- .../search/prefilter_queries_parallel.hpp | 4 - src/valik_search.cpp | 76 ++++++++++++------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/include/valik/search/prefilter_queries_parallel.hpp b/include/valik/search/prefilter_queries_parallel.hpp index 565f58ff..6dbb540a 100644 --- a/include/valik/search/prefilter_queries_parallel.hpp +++ b/include/valik/search/prefilter_queries_parallel.hpp @@ -36,10 +36,6 @@ inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter records_slice{&records[start], &records[end]}; - /** This lambda writes the bin_hits into a file - * - * Caution, it creates a `result_string` of type `std::string` which it reuses for more efficiency - */ auto result_cb = [&queue](query_record const& record, std::unordered_set const& bin_hits) { for (size_t const bin : bin_hits) diff --git a/src/valik_search.cpp b/src/valik_search.cpp index 59abb003..0fd1b83b 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -24,6 +24,51 @@ namespace valik::app { +using fields = seqan3::fields; +using types = seqan3::type_list>; +using sequence_record_type = seqan3::sequence_record; + +template +inline bool get_cart_queries(rec_vec_t & records, + seqan2::StringSet & seqs, + seqan2::StringSet & ids, + TStream & strOut, + TStream & strErr) +{ + std::set uniqueIds; // set of short IDs (cut at first whitespace) + bool idsUnique = true; + + size_t seqCount{0}; + for (auto & record : records) + { + TSequence seq; + for (auto c : record.sequence) + seq += c.to_char(); + + seqan2::appendValue(seqs, seq, seqan2::Generous()); + seqan2::appendValue(ids, (seqan2::String) record.sequence_id, seqan2::Generous()); + seqCount++; + idsUnique &= stellar::_checkUniqueId(uniqueIds, (seqan2::String) record.sequence_id); + } + + strOut << "Loaded " << seqCount << " query sequence" << ((seqCount > 1) ? "s." : ".") << std::endl; + if (!idsUnique) + strErr << "WARNING: Non-unique query ids. Output can be ambiguous.\n"; + return true; +} + +template +void write_cart_queries(rec_vec_t & records, std::filesystem::path const & cart_queries_path) +{ + seqan3::sequence_file_output fout{cart_queries_path, fields{}}; + + for (auto & record : records) + { + sequence_record_type sequence_record{std::move(record.sequence_id), std::move(record.sequence)}; + fout.push_back(sequence_record); + } +} + //----------------------------- // // Setup IBF and launch multithreaded search. @@ -44,10 +89,6 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim time_statistics.index_io_time += std::chrono::duration_cast>(end - start).count(); } - using fields = seqan3::fields; - using types = seqan3::type_list>; - using sequence_record_type = seqan3::sequence_record; - seqan3::sequence_file_input fin{arguments.query_file}; std::vector query_records{}; @@ -93,6 +134,7 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim if (arguments.shared_memory) { + //!TODO: add this to runtime output stellar::stellar_app_runtime stellarTime{}; for (auto bin_paths : index.bin_path()) @@ -140,16 +182,6 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim ld.output_files.push_back(cart_queries_path.string() + ".gff"); - { - seqan3::sequence_file_output fout{cart_queries_path, fields{}}; - - for (auto & record : records) - { - sequence_record_type sequence_record{std::move(record.sequence_id), std::move(record.sequence)}; - fout.push_back(sequence_record); - } - } - if (arguments.shared_memory) { stellar::StellarOptions threadOptions{}; @@ -159,19 +191,9 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim // import query sequences seqan2::StringSet queries; seqan2::StringSet queryIDs; - - using TSize = decltype(length(queries[0])); - TSize queryLen{0}; // does not get populated currently //!TODO: split query sequence - bool const queriesSuccess = stellarThreadTime.input_queries_time.measure_time([&]() - { - return stellar::_importAllSequences(cart_queries_path.c_str(), "query", queries, queryIDs, queryLen, ld.text_out, ld.text_out); - }); - if (!queriesSuccess) - { - std::cerr << "Error importing queries\n"; - error_triggered = true; - } + get_cart_queries(records, queries, queryIDs, ld.text_out, ld.text_out); + using TSize = decltype(length(queries[0])); threadOptions.alphabet = "dna"; // Possible values: dna, rna, protein, char threadOptions.queryFile = cart_queries_path.string(); @@ -348,6 +370,8 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim } else { + write_cart_queries(records, cart_queries_path); + std::vector process_args{}; process_args.insert(process_args.end(), {var_pack.stellar_exec, "--version-check", "0", "-a", "dna"}); From 65ba15a61a96c461c7d2920ce47fba12b0f1b7d8 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 13 Jul 2023 15:08:43 +0200 Subject: [PATCH 02/11] Measure reference I/O time --- include/valik/search/search_time_statistics.hpp | 4 +++- src/valik_search.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/valik/search/search_time_statistics.hpp b/include/valik/search/search_time_statistics.hpp index 98f7edf8..57cfda93 100644 --- a/include/valik/search/search_time_statistics.hpp +++ b/include/valik/search/search_time_statistics.hpp @@ -9,6 +9,7 @@ namespace valik::app struct search_time_statistics { + double ref_io_time{0.0}; double index_io_time{0.0}; std::vector cart_processing_times; double reads_io_time{0.0}; @@ -40,9 +41,10 @@ inline void write_time_statistics(search_time_statistics const & time_statistics std::filesystem::path file_path{time_file}; std::ofstream file_handle(file_path, std::ofstream::app); - file_handle << "IBF I/O\tReads I/O\tPrefilter\tMin cart time\tAvg cart time\tMax cart time\tNr carts\n"; + file_handle << "Ref I/O\tIBF I/O\tReads I/O\tPrefilter\tMin cart time\tAvg cart time\tMax cart time\tNr carts\n"; file_handle << std::fixed << std::setprecision(2) + << time_statistics.ref_io_time << '\t' << time_statistics.index_io_time << '\t' << time_statistics.reads_io_time << '\t' << time_statistics.prefilter_time << '\t'; diff --git a/src/valik_search.cpp b/src/valik_search.cpp index 0fd1b83b..f18ea203 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -134,14 +134,12 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim if (arguments.shared_memory) { - //!TODO: add this to runtime output - stellar::stellar_app_runtime stellarTime{}; - + stellar::stellar_runtime input_databases_time{}; for (auto bin_paths : index.bin_path()) { for (auto path : bin_paths) { - bool const databasesSuccess = stellarTime.input_databases_time.measure_time([&]() + bool const databasesSuccess = input_databases_time.measure_time([&]() { std::cout << "Launching stellar search on a shared memory machine...\n"; return stellar::_importAllSequences(path.c_str(), "database", databases, databaseIDs, refLen, std::cout, std::cerr); @@ -159,6 +157,8 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim seqan2::appendValue(reverseDatabases, database, seqan2::Generous()); } } + + time_statistics.ref_io_time += input_databases_time.milliseconds() / 1000; } stellar::DatabaseIDMap databaseIDMap{databases, databaseIDs}; stellar::DatabaseIDMap reverseDatabaseIDMap{reverseDatabases, databaseIDs}; From fbc74b8fb65977168f748fc9cb0701d0a433c2c3 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 13 Jul 2023 15:55:54 +0200 Subject: [PATCH 03/11] Process disabled queries --- include/valik/shared.hpp | 2 +- src/argument_parsing/search.cpp | 17 ++++++----------- src/valik_search.cpp | 29 ++++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index 1976a0f8..f5d79e7e 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -91,7 +91,7 @@ struct minimiser_threshold_arguments inline minimiser_threshold_arguments::~minimiser_threshold_arguments() = default; -struct search_arguments final : public minimiser_threshold_arguments +struct search_arguments final : public minimiser_threshold_arguments, public stellar::StellarOptions { ~search_arguments() override = default; search_arguments() = default; diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index 5947a83f..5a393b70 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -101,6 +101,12 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) .long_id = "threads", .description = "Choose the number of threads.", .validator = positive_integer_validator{}}); + parser.add_option(arguments.disableThresh, + sharg::config{.short_id = '\0', + .long_id = "disableThresh", + .description = "Maximal number of verified matches before disabling verification for one query sequence.", + .advanced = true, + .validator = sharg::arithmetic_range_validator{1, 10000}}); ///////////////////////////////////////// // Stellar options @@ -138,11 +144,6 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) .long_id = "verification", .description = "Verification strategy: exact or bestLocal or bandedGlobal.", .validator = sharg::value_list_validator{"exact", "bestLocal", "bandedGlobal", "bandedGlobalExtend"}}); - parser.add_option(options.disableThresh, - sharg::config{.short_id = '\0', - .long_id = "disableThresh", - .description = "Maximal number of verified matches before disabling verification for one query sequence (default infinity).", - .validator = sharg::arithmetic_range_validator{1, 10000}}); parser.add_option(options.numMatches, sharg::config{.short_id = 'n', .long_id = "numMatches", @@ -151,12 +152,6 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) sharg::config{.short_id = 's', .long_id = "sortThresh", .description = "Number of matches triggering removal of duplicates. Choose a smaller value for saving space."}); - - parser.add_option(options.disabledQueriesFile, - sharg::config{.short_id = '\0', - .long_id = "disabledQueriesFile", - .description = "Name of output file for disabled query sequences.", - .validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create, {"fa", "fasta"}}}); */ } diff --git a/src/valik_search.cpp b/src/valik_search.cpp index f18ea203..a64999c1 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -127,13 +127,26 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim // negative (reverse complemented) database strand bool const reverse = true /*threadOptions.reverse && threadOptions.alphabet != "protein" && threadOptions.alphabet != "char" */; seqan2::StringSet databases; + using TSize = decltype(length(databases[0])); seqan2::StringSet reverseDatabases; seqan2::StringSet databaseIDs; - using TSize = decltype(length(databases[0])); + std::ofstream disabledQueriesFile; TSize refLen; if (arguments.shared_memory) { + if (arguments.disableThresh != std::numeric_limits::max()) + { + std::filesystem::path disabledPath = arguments.out_file; + disabledPath.replace_extension(".disabled.fa"); + disabledQueriesFile.open(((std::string) disabledPath).c_str(), ::std::ios_base::out | ::std::ios_base::app); + if (!disabledQueriesFile.is_open()) + { + std::cerr << "Could not open file for disabled queries." << std::endl; + return false; + } + } + stellar::stellar_runtime input_databases_time{}; for (auto bin_paths : index.bin_path()) { @@ -218,6 +231,7 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim threadOptions.numEpsilon = er_rate; threadOptions.epsilon = stellar::utils::fraction::from_double(threadOptions.numEpsilon).limit_denominator(); threadOptions.minLength = arguments.pattern_size; + threadOptions.disableThresh = arguments.disableThresh; threadOptions.outputFile = cart_queries_path.string() + ".gff"; stellar::_writeFileNames(threadOptions, ld.text_out); stellar::_writeSpecifiedParams(threadOptions, ld.text_out); @@ -236,7 +250,6 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim ld.text_out << std::endl; stellarThreadTime.swift_index_construction_time.manual_timing(current_time); - //!TODO: process disabled queries std::vector disabledQueryIDs{}; stellar::StellarOutputStatistics outputStatistics{}; @@ -358,7 +371,17 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim }); // measure_time } - stellar::_writeOutputStatistics(outputStatistics, threadOptions.verbose, false /* disabledQueriesFile.is_open() */, ld.text_out); + // Writes disabled query sequences to disabledFile. + if (disabledQueriesFile.is_open()) + { + stellarThreadTime.output_disabled_queries_time.measure_time([&]() + { + // write disabled query file + stellar::_writeDisabledQueriesToFastaFile(disabledQueryIDs, queryIDs, queries, disabledQueriesFile); + }); // measure_time + } + + stellar::_writeOutputStatistics(outputStatistics, threadOptions.verbose, disabledQueriesFile.is_open(), ld.text_out); ld.timeStatistics.emplace_back(stellarThreadTime.milliseconds()); if (arguments.write_time) From d97a79aad6d367da1bcce6ea3c499d07a9ca08fa Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Thu, 13 Jul 2023 15:56:39 +0200 Subject: [PATCH 04/11] Index build can also not have multiple files per bin --- src/valik_search.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/valik_search.cpp b/src/valik_search.cpp index a64999c1..f28f7c4f 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -226,7 +226,7 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim throw std::runtime_error("Could not find reference file with index " + std::to_string(bin_id) + ". Did you forget to provide metadata to search segments in a single reference file instead?"); } - threadOptions.binSequences.push_back(bin_id); //!TODO: what if mutliple sequence files per bin + threadOptions.binSequences.push_back(bin_id); } threadOptions.numEpsilon = er_rate; threadOptions.epsilon = stellar::utils::fraction::from_double(threadOptions.numEpsilon).limit_denominator(); From eaa9e38df1213bb1f3f6291b7fb98a96f5cb3eaa Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Fri, 4 Aug 2023 09:59:28 +0200 Subject: [PATCH 05/11] Rename reference_metadata -> sequence_metadata --- include/utilities/consolidate/io.hpp | 4 +- .../utilities/consolidate/stellar_match.hpp | 4 +- include/valik/build/index_factory.hpp | 4 +- include/valik/split/reference_metadata.hpp | 104 ------------------ include/valik/split/reference_segments.hpp | 6 +- include/valik/split/write_seg_sequences.hpp | 4 +- src/consolidate/io.cpp | 2 +- src/valik_consolidate.cpp | 2 +- src/valik_search.cpp | 6 +- src/valik_split.cpp | 4 +- .../valik/split/write_seg_sequences_test.cpp | 2 +- test/cli/dream_test.cpp | 2 +- test/cli/valik_test.cpp | 2 +- 13 files changed, 21 insertions(+), 125 deletions(-) delete mode 100644 include/valik/split/reference_metadata.hpp diff --git a/include/utilities/consolidate/io.hpp b/include/utilities/consolidate/io.hpp index c589d0a6..658706d1 100644 --- a/include/utilities/consolidate/io.hpp +++ b/include/utilities/consolidate/io.hpp @@ -2,7 +2,7 @@ #include -#include +#include #include @@ -10,7 +10,7 @@ namespace valik { std::vector read_stellar_output(std::filesystem::path const & match_path, - reference_metadata const & reference, + sequence_metadata const & reference, std::ios_base::openmode const mode = std::ios_base::in); void write_stellar_output(std::filesystem::path const & out_path, diff --git a/include/utilities/consolidate/stellar_match.hpp b/include/utilities/consolidate/stellar_match.hpp index 7f7fdec7..5e94fbbd 100644 --- a/include/utilities/consolidate/stellar_match.hpp +++ b/include/utilities/consolidate/stellar_match.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace valik { @@ -19,7 +19,7 @@ struct stellar_match // 1;seq2Range=1280,1378;cigar=97M1D2M;mutations=14A,45G,58T,92C std::string attributes{}; - stellar_match(std::vector const match_vec, reference_metadata const & reference) + stellar_match(std::vector const match_vec, sequence_metadata const & reference) { dname = match_vec[0]; diff --git a/include/valik/build/index_factory.hpp b/include/valik/build/index_factory.hpp index 515b290b..b10fb154 100644 --- a/include/valik/build/index_factory.hpp +++ b/include/valik/build/index_factory.hpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -66,7 +66,7 @@ class index_factory if (arguments->from_segments) { reference_segments segments(arguments->seg_path); - reference_metadata reference(arguments->ref_meta_path, false); + sequence_metadata reference(arguments->ref_meta_path, false); auto & ibf = index.ibf(); int i = 0; diff --git a/include/valik/split/reference_metadata.hpp b/include/valik/split/reference_metadata.hpp deleted file mode 100644 index cccf7e17..00000000 --- a/include/valik/split/reference_metadata.hpp +++ /dev/null @@ -1,104 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include - -namespace valik -{ - -class reference_metadata -{ - public: - struct sequence_stats - { - std::string id; - size_t ind; - size_t len; - - sequence_stats(std::string const ref_id, size_t const ref_ind, size_t const ref_length) - { - id = ref_id; - ind = ref_ind; - len = ref_length; - } - }; - - uint64_t total_len; - std::vector sequences; - - void construct_by_sequence_file(std::filesystem::path const & filepath) - { - using traits_type = seqan3::sequence_file_input_default_traits_dna; - seqan3::sequence_file_input fin{filepath}; - - total_len = 0; - size_t ref_ind = 0; - for (auto & record : fin) - { - sequence_stats seq(record.id(), ref_ind, record.sequence().size()); - total_len += seq.len; - sequences.push_back(seq); - ref_ind++; - } - } - - void construct_by_metadata_file(std::filesystem::path const & filepath) - { - std::ifstream in_file(filepath); - - std::string seq_id; - size_t ref_ind, length; - total_len = 0; - if (in_file.is_open()) - { - while (in_file >> seq_id) - { - in_file >> ref_ind; - in_file >> length; - total_len += length; - sequences.push_back(sequence_stats(seq_id, ref_ind, length)); - } - } - in_file.close(); - } - - reference_metadata(std::filesystem::path const & filepath, bool const from_sequence) - { - if (from_sequence) - { - construct_by_sequence_file(filepath); - } - else - { - construct_by_metadata_file(filepath); // deserialize - } - } - - inline size_t ind_from_id(std::string const & string_id) const - { - auto it = std::find_if(sequences.begin(), sequences.end(), [&](const sequence_stats& seq) { return seq.id == string_id;}); - if (it == sequences.end()) - throw seqan3::validation_error{"Reference metadata does not contain sequence from Stellar output."}; - else - return (*it).ind; - } - - // serialize - void to_file(std::filesystem::path filepath) - { - std::ofstream out_file; - out_file.open(filepath); - for (sequence_stats const & seq : sequences) - { - out_file << seq.id << '\t' << seq.ind << '\t' << seq.len << '\n'; - } - out_file.close(); - } - -}; - -} // namespace valik diff --git a/include/valik/split/reference_segments.hpp b/include/valik/split/reference_segments.hpp index ef726162..794cc556 100644 --- a/include/valik/split/reference_segments.hpp +++ b/include/valik/split/reference_segments.hpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -43,7 +43,7 @@ class reference_segments { members.push_back(seg); } - void construct_by_linear_scan(size_t const bins, size_t const overlap, reference_metadata const & reference) + void construct_by_linear_scan(size_t const bins, size_t const overlap, sequence_metadata const & reference) { assert(bins > 0); default_len = reference.total_len / bins + 1; @@ -90,7 +90,7 @@ class reference_segments { assert(members.size() == bins); } - reference_segments(size_t bins, size_t overlap, reference_metadata const & reference) + reference_segments(size_t bins, size_t overlap, sequence_metadata const & reference) { construct_by_linear_scan(bins, overlap, reference); } diff --git a/include/valik/split/write_seg_sequences.hpp b/include/valik/split/write_seg_sequences.hpp index 57d9e937..4cde1257 100644 --- a/include/valik/split/write_seg_sequences.hpp +++ b/include/valik/split/write_seg_sequences.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include namespace valik @@ -9,7 +9,7 @@ namespace valik // Note: template functions should live in .cpp files. This solution to avoid linker errors may cause code bloat. template -void write_seg_sequences(reference_metadata const & reference, reference_segments & segments, std::filesystem::path const & ref_path) +void write_seg_sequences(sequence_metadata const & reference, reference_segments & segments, std::filesystem::path const & ref_path) { using sequence_file_t = seqan3::sequence_file_input>; diff --git a/src/consolidate/io.cpp b/src/consolidate/io.cpp index a1ccc76d..66641725 100644 --- a/src/consolidate/io.cpp +++ b/src/consolidate/io.cpp @@ -4,7 +4,7 @@ namespace valik { std::vector read_stellar_output(std::filesystem::path const & match_path, - reference_metadata const & reference, + sequence_metadata const & reference, std::ios_base::openmode const mode /* = std::ios_base::in */) { std::vector matches; diff --git a/src/valik_consolidate.cpp b/src/valik_consolidate.cpp index 3da02bf9..8012d77a 100644 --- a/src/valik_consolidate.cpp +++ b/src/valik_consolidate.cpp @@ -5,7 +5,7 @@ namespace valik::app void valik_consolidate(consolidation_arguments const & arguments) { - reference_metadata reference(arguments.ref_meta_path, false); + sequence_metadata reference(arguments.ref_meta_path, false); auto matches = read_stellar_output(arguments.matches_in, reference); diff --git a/src/valik_search.cpp b/src/valik_search.cpp index f28f7c4f..07701b81 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include @@ -102,9 +102,9 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim if (!arguments.seg_path.empty()) segments = reference_segments(arguments.seg_path); - std::optional ref_meta; + std::optional ref_meta; if (!arguments.ref_meta_path.empty()) - ref_meta = reference_metadata(arguments.ref_meta_path, false); + ref_meta = sequence_metadata(arguments.ref_meta_path, false); //!WORKAROUND: Stellar does not allow smaller error rates double er_rate = std::max((double) arguments.errors / (double) arguments.pattern_size, 0.00001); diff --git a/src/valik_split.cpp b/src/valik_split.cpp index 7eba2f0e..287eda52 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include @@ -15,7 +15,7 @@ namespace valik::app void valik_split(split_arguments const & arguments) { // Linear scan over reference file to extract metadata - reference_metadata reference(arguments.ref_file, true); + sequence_metadata reference(arguments.ref_file, true); reference.to_file(arguments.ref_out); // For each segment assign start, length and bin number diff --git a/test/api/valik/split/write_seg_sequences_test.cpp b/test/api/valik/split/write_seg_sequences_test.cpp index 309ea070..5922701d 100644 --- a/test/api/valik/split/write_seg_sequences_test.cpp +++ b/test/api/valik/split/write_seg_sequences_test.cpp @@ -34,7 +34,7 @@ static void const test_write_out(size_t overlap, size_t bins) { std::string path_prefix = "write_out_" + std::to_string(overlap) + "_" + std::to_string(bins); - valik::reference_metadata reference(data(path_prefix + "_reference_metadata.txt"), false); + valik::sequence_metadata reference(data(path_prefix + "_reference_metadata.txt"), false); valik::reference_segments segments(data(path_prefix + "_reference_segments.txt")); valik::write_seg_sequences(reference, segments, data(path_prefix + "_ref.fasta")); diff --git a/test/cli/dream_test.cpp b/test/cli/dream_test.cpp index efd7a0a0..06051bdf 100644 --- a/test/cli/dream_test.cpp +++ b/test/cli/dream_test.cpp @@ -18,7 +18,7 @@ TEST_P(dream_search, shared_mem) setenv("VALIK_MERGE", "cat", true); std::filesystem::path ref_meta_path = data("ref_meta.txt"); - valik::reference_metadata reference(ref_meta_path, false); + valik::sequence_metadata reference(ref_meta_path, false); std::filesystem::path seg_meta_path = data("seg_meta150overlap" + std::to_string(number_of_bins) + "bins.txt"); std::filesystem::path index_path = ibf_path(number_of_bins, window_size); diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index 689f370e..defedfb3 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -240,7 +240,7 @@ TEST_P(valik_consolidate, consolidation) auto const [number_of_bins, segment_overlap] = GetParam(); std::filesystem::path ref_meta_path = consolidation_meta_path(number_of_bins, segment_overlap); - valik::reference_metadata reference(ref_meta_path, false); + valik::sequence_metadata reference(ref_meta_path, false); cli_test_result const result = execute_app("valik", "consolidate", "--input ", consolidation_input_path(number_of_bins, segment_overlap), From 1b30a77f0459ce7797bf5ac0402ed3ca88dd92b0 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Sat, 5 Aug 2023 12:16:05 +0200 Subject: [PATCH 06/11] Make split metadata generic for reference and query databases --- include/utilities/consolidate/consolidate.hpp | 2 +- include/utilities/consolidate/io.hpp | 4 +- .../utilities/consolidate/stellar_match.hpp | 4 +- include/valik/build/index_factory.hpp | 12 +- include/valik/split/database_metadata.hpp | 114 +++++++++++++ include/valik/split/database_segments.hpp | 156 ++++++++++++++++++ include/valik/split/reference_segments.hpp | 143 ---------------- include/valik/split/write_seg_sequences.hpp | 10 +- src/argument_parsing/build.cpp | 4 +- src/consolidate/io.cpp | 2 +- src/valik_consolidate.cpp | 2 +- src/valik_search.cpp | 16 +- src/valik_split.cpp | 8 +- .../valik/split/write_seg_sequences_test.cpp | 10 +- test/cli/cli_test.hpp | 2 +- test/cli/dream_test.cpp | 2 +- test/cli/valik_test.cpp | 4 +- 17 files changed, 311 insertions(+), 184 deletions(-) create mode 100644 include/valik/split/database_metadata.hpp create mode 100644 include/valik/split/database_segments.hpp delete mode 100644 include/valik/split/reference_segments.hpp diff --git a/include/utilities/consolidate/consolidate.hpp b/include/utilities/consolidate/consolidate.hpp index 3e25505b..eb6bf11c 100644 --- a/include/utilities/consolidate/consolidate.hpp +++ b/include/utilities/consolidate/consolidate.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include diff --git a/include/utilities/consolidate/io.hpp b/include/utilities/consolidate/io.hpp index 658706d1..233bb541 100644 --- a/include/utilities/consolidate/io.hpp +++ b/include/utilities/consolidate/io.hpp @@ -2,7 +2,7 @@ #include -#include +#include #include @@ -10,7 +10,7 @@ namespace valik { std::vector read_stellar_output(std::filesystem::path const & match_path, - sequence_metadata const & reference, + database_metadata const & reference, std::ios_base::openmode const mode = std::ios_base::in); void write_stellar_output(std::filesystem::path const & out_path, diff --git a/include/utilities/consolidate/stellar_match.hpp b/include/utilities/consolidate/stellar_match.hpp index 5e94fbbd..9cb141a3 100644 --- a/include/utilities/consolidate/stellar_match.hpp +++ b/include/utilities/consolidate/stellar_match.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace valik { @@ -19,7 +19,7 @@ struct stellar_match // 1;seq2Range=1280,1378;cigar=97M1D2M;mutations=14A,45G,58T,92C std::string attributes{}; - stellar_match(std::vector const match_vec, sequence_metadata const & reference) + stellar_match(std::vector const match_vec, database_metadata const & reference) { dname = match_vec[0]; diff --git a/include/valik/build/index_factory.hpp b/include/valik/build/index_factory.hpp index b10fb154..70c57801 100644 --- a/include/valik/build/index_factory.hpp +++ b/include/valik/build/index_factory.hpp @@ -2,8 +2,8 @@ #include #include -#include -#include +#include +#include #include @@ -65,19 +65,19 @@ class index_factory if (arguments->from_segments) { - reference_segments segments(arguments->seg_path); - sequence_metadata reference(arguments->ref_meta_path, false); + database_segments segments(arguments->seg_path); + database_metadata reference(arguments->ref_meta_path, false); auto & ibf = index.ibf(); int i = 0; for (auto && [seq] : sequence_file_t{arguments->bin_file}) { // get the relevant segments for each reference - auto ref_seg = [&](reference_segments::segment & seg) {return reference.sequences.at(i).ind == seg.ref_ind;}; + auto ref_seg = [&](database_segments::segment & seg) {return reference.sequences.at(i).ind == seg.seq_ind;}; for (auto & seg : segments.members | std::views::filter(ref_seg)) { for (auto && value : seq | seqan3::views::slice(seg.start, seg.start + seg.len) | hash_view()) - ibf.emplace(value, seqan3::bin_index{seg.bin}); + ibf.emplace(value, seqan3::bin_index{seg.id}); } i++; } diff --git a/include/valik/split/database_metadata.hpp b/include/valik/split/database_metadata.hpp new file mode 100644 index 00000000..a0868d3a --- /dev/null +++ b/include/valik/split/database_metadata.hpp @@ -0,0 +1,114 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace valik +{ + +/** !\brief a metadata struct that represents a reference or query database. + * + * \param total_len Total database length. + * \param sequences Collection of database sequences. + */ +struct database_metadata +{ + /** !\brief a metadata struct that represents a single sequence. + * + * \param id The FASTA id. + * \param ind 0-based index in the input FASTA file. + * \param len Sequence length. + */ + struct sequence_stats + { + std::string id; + size_t ind; + size_t len; + + sequence_stats(std::string const fasta_id, size_t const fasta_ind, size_t const seq_length) + { + id = fasta_id; + ind = fasta_ind; + len = seq_length; + } + }; + + uint64_t total_len; + std::vector sequences; + + void construct_by_sequence_file(std::filesystem::path const & filepath) + { + using traits_type = seqan3::sequence_file_input_default_traits_dna; + seqan3::sequence_file_input fin{filepath}; + + total_len = 0; + size_t fasta_ind = 0; + for (auto & record : fin) + { + sequence_stats seq(record.id(), fasta_ind, record.sequence().size()); + total_len += seq.len; + sequences.push_back(seq); + fasta_ind++; + } + } + + void construct_by_metadata_file(std::filesystem::path const & filepath) + { + std::ifstream in_file(filepath); + + std::string seq_id; + size_t fasta_ind, length; + total_len = 0; + if (in_file.is_open()) + { + while (in_file >> seq_id) + { + in_file >> fasta_ind; + in_file >> length; + total_len += length; + sequences.push_back(sequence_stats(seq_id, fasta_ind, length)); + } + } + in_file.close(); + } + + database_metadata(std::filesystem::path const & filepath, bool const from_sequence) + { + if (from_sequence) + { + construct_by_sequence_file(filepath); + } + else + { + construct_by_metadata_file(filepath); // deserialize + } + } + + inline size_t ind_from_id(std::string const & string_id) const + { + auto it = std::find_if(sequences.begin(), sequences.end(), [&](const sequence_stats& seq) { return seq.id == string_id;}); + if (it == sequences.end()) + throw seqan3::validation_error{"Sequence metadata does not contain sequence from Stellar output."}; + else + return (*it).ind; + } + + // serialize + void to_file(std::filesystem::path filepath) + { + std::ofstream out_file; + out_file.open(filepath); + for (sequence_stats const & seq : sequences) + { + out_file << seq.id << '\t' << seq.ind << '\t' << seq.len << '\n'; + } + out_file.close(); + } + +}; + +} // namespace valik diff --git a/include/valik/split/database_segments.hpp b/include/valik/split/database_segments.hpp new file mode 100644 index 00000000..0e8b8ab5 --- /dev/null +++ b/include/valik/split/database_segments.hpp @@ -0,0 +1,156 @@ +#pragma once + +#include +#include + +#include + +#include +#include + +namespace valik +{ + +/** !\brief a struct that represents all segments of a reference or query database. + * + * \param members Collection of database segments. + * \param default_len Default length of a segment that is dynamically updated. + */ +struct database_segments { + + /** !\brief a struct that represents a single segment. + * + * All indices and positions are 0-based. + * + * \param id Index of the segment in the vector of segments. + * \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to database_metadata::sequence_stats::id. + * \param start Segment start position in sequence. + * \param len Segment length. + */ + struct segment { + size_t id; + size_t seq_ind; + size_t start; + size_t len; + + segment(size_t i, size_t ind, size_t s, size_t l) + { + id = i; + seq_ind = ind; + start = s; + len = l; + } + + std::string unique_id() + { + return std::to_string(seq_ind) + "_" + std::to_string(start) + "_" + std::to_string(len); + } + }; + + std::vector members; + size_t default_len; + + void add_segment(size_t const i, size_t const ind, size_t const s, size_t const l) + { + segment seg(i, ind, s, l); + members.push_back(seg); + } + + void construct_by_linear_scan(size_t const seg_count, size_t const overlap, database_metadata const & database) + { + assert(seg_count > 0); + default_len = database.total_len / seg_count + 1; + assert(default_len > overlap); + + size_t remaining_db_len = database.total_len; + for (const auto & seq : database.sequences) + { + size_t start = 0; + if (seq.len < (default_len / 2)) + { + seqan3::debug_stream << "Sequence: " << seq.id << " is too short and will be skipped.\n"; + remaining_db_len -= seq.len; + } + else if ((seq.len >= default_len / 2) & (seq.len <= default_len * 1.5)) + { + // database sequence is single segment + add_segment(members.size(), seq.ind, start, seq.len); + remaining_db_len -= seq.len; + } + else + { + // sequences that are contained in a single segment might not have the exact segment length + // dynamically update segment length to divide the rest of the remaining database as equally as possible among the chosen number of segments + size_t remaining_seg_count = seg_count - members.size(); + size_t updated_seg_len = remaining_db_len / remaining_seg_count; + + size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len); + size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down + + // divide database sequence into multiple segments + add_segment(members.size(), seq.ind, start, actual_seg_len); + start = start + actual_seg_len - overlap; + while (start + actual_seg_len - overlap < seq.len) + { + add_segment(members.size(), seq.ind, start, actual_seg_len); + start = start + actual_seg_len - overlap; + } + add_segment(members.size(), seq.ind, start, seq.len - start); + + remaining_db_len -= seq.len; + } + } + assert(members.size() == seg_count); + } + + database_segments(size_t seg_count, size_t overlap, database_metadata const & database) + { + construct_by_linear_scan(seg_count, overlap, database); + } + + // deserialize + database_segments(std::filesystem::path filepath) + { + std::ifstream in_file(filepath); + + size_t id, seq_ind, start, length; + if (in_file.is_open()) + { + while (in_file >> id) + { + in_file >> seq_ind; + in_file >> start; + in_file >> length; + + add_segment(id, seq_ind, start, length); + } + } + in_file.close(); + } + + segment segment_from_bin(size_t const id) const + { + for (auto & seg : members) + { + if (seg.id == id) + return seg; + } + + throw std::runtime_error{"Could not find segment"}; + } + + // serialize + void to_file(std::filesystem::path filepath) + { + std::ofstream out_file; + out_file.open(filepath); + + for (const auto & seg : members) + { + out_file << seg.id << '\t' << seg.seq_ind << '\t' << seg.start << '\t' << seg.len << '\n'; + } + out_file.close(); + } +}; + +} // namespace valik diff --git a/include/valik/split/reference_segments.hpp b/include/valik/split/reference_segments.hpp deleted file mode 100644 index 794cc556..00000000 --- a/include/valik/split/reference_segments.hpp +++ /dev/null @@ -1,143 +0,0 @@ -#pragma once - -#include -#include - -#include - -#include -#include - -namespace valik -{ - -class reference_segments { - public: - class segment { - public: - size_t bin; - size_t ref_ind; - size_t start; - size_t len; - - segment(size_t b, size_t ind, size_t s, size_t l) - { - bin = b; - ref_ind = ind; - start = s; - len = l; - } - - std::string unique_id() - { - return std::to_string(ref_ind) + "_" + std::to_string(start) + "_" + std::to_string(len); - } - }; - - std::vector members; - size_t default_len; - - void add_segment(size_t const b, size_t const ind, size_t const s, size_t const l) - { - segment seg(b, ind, s, l); - members.push_back(seg); - } - - void construct_by_linear_scan(size_t const bins, size_t const overlap, sequence_metadata const & reference) - { - assert(bins > 0); - default_len = reference.total_len / bins + 1; - assert(default_len > overlap); - - size_t remaining_ref_len = reference.total_len; - for (const auto & seq : reference.sequences) - { - size_t start = 0; - if (seq.len < (default_len / 2)) - { - seqan3::debug_stream << "Reference sequence: " << seq.id << " is too short and will be skipped.\n"; - remaining_ref_len -= seq.len; - } - else if ((seq.len >= default_len / 2) & (seq.len <= default_len * 1.5)) - { - // reference sequence is single segment - add_segment(members.size(), seq.ind, start, seq.len); - remaining_ref_len -= seq.len; - } - else - { - // sequences that are contained in a single segment might not have the exact segment length - // dynamically update segment length to divide the rest of the remaining reference as equally as possible along bins - size_t remaining_bins = bins - members.size(); - size_t updated_seg_len = remaining_ref_len / remaining_bins; - - size_t bins_per_seq = std::round( (double) seq.len / (double) updated_seg_len); - size_t actual_seg_len = seq.len / bins_per_seq + overlap + 1; // + 1 because integer division always rounded down - - // divide reference sequence into multiple segments - add_segment(members.size(), seq.ind, start, actual_seg_len); - start = start + actual_seg_len - overlap; - while (start + actual_seg_len - overlap < seq.len) - { - add_segment(members.size(), seq.ind, start, actual_seg_len); - start = start + actual_seg_len - overlap; - } - add_segment(members.size(), seq.ind, start, seq.len - start); - - remaining_ref_len -= seq.len; - } - } - assert(members.size() == bins); - } - - reference_segments(size_t bins, size_t overlap, sequence_metadata const & reference) - { - construct_by_linear_scan(bins, overlap, reference); - } - - // deserialize - reference_segments(std::filesystem::path filepath) - { - std::ifstream in_file(filepath); - - size_t bin, ref_ind, start, length; - if (in_file.is_open()) - { - while (in_file >> bin) - { - in_file >> ref_ind; - in_file >> start; - in_file >> length; - - add_segment(bin, ref_ind, start, length); - } - } - in_file.close(); - } - - segment segment_from_bin(size_t const bin) const - { - for (auto & seg : members) - { - if (seg.bin == bin) - return seg; - } - - throw std::runtime_error{"Could not find segment"}; - } - - // serialize - void to_file(std::filesystem::path filepath) - { - std::ofstream out_file; - out_file.open(filepath); - - for (const auto & seg : members) - { - out_file << seg.bin << '\t' << seg.ref_ind << '\t' << seg.start << '\t' << seg.len << '\n'; - } - out_file.close(); - } -}; - -} // namespace valik diff --git a/include/valik/split/write_seg_sequences.hpp b/include/valik/split/write_seg_sequences.hpp index 4cde1257..77a4e2af 100644 --- a/include/valik/split/write_seg_sequences.hpp +++ b/include/valik/split/write_seg_sequences.hpp @@ -1,15 +1,15 @@ #pragma once #include -#include -#include +#include +#include namespace valik { // Note: template functions should live in .cpp files. This solution to avoid linker errors may cause code bloat. template -void write_seg_sequences(sequence_metadata const & reference, reference_segments & segments, std::filesystem::path const & ref_path) +void write_seg_sequences(database_metadata const & reference, database_segments & segments, std::filesystem::path const & ref_path) { using sequence_file_t = seqan3::sequence_file_input>; @@ -27,13 +27,13 @@ void write_seg_sequences(sequence_metadata const & reference, reference_segments for (auto && [seq] : sequence_file_t{ref_path}) { // get the relevant segments for each reference - auto ref_seg = [&](reference_segments::segment & seg) {return reference.sequences.at(i).ind == seg.ref_ind;}; + auto ref_seg = [&](database_segments::segment & seg) {return reference.sequences.at(i).ind == seg.seq_ind;}; for (auto & seg : segments.members | std::views::filter(ref_seg)) { std::filesystem::path seg_file = ref_path; std::filesystem::path seg_stem = seg_file.stem(); seg_stem += "_"; - seg_stem += std::to_string(seg.bin); + seg_stem += std::to_string(seg.id); seg_stem += seg_file.extension(); seg_file.replace_filename(seg_stem); file_paths_out << seg_file.string() << '\n'; diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp index de75f374..6f3b8d8e 100644 --- a/src/argument_parsing/build.cpp +++ b/src/argument_parsing/build.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace valik::app { @@ -95,7 +95,7 @@ void run_build(sharg::parser & parser) } else { - reference_segments seg(arguments.seg_path); + database_segments seg(arguments.seg_path); arguments.bins = seg.members.size(); sequence_file_validator(arguments.bin_file); std::string bin_string{arguments.bin_file.string()}; diff --git a/src/consolidate/io.cpp b/src/consolidate/io.cpp index 66641725..8959f852 100644 --- a/src/consolidate/io.cpp +++ b/src/consolidate/io.cpp @@ -4,7 +4,7 @@ namespace valik { std::vector read_stellar_output(std::filesystem::path const & match_path, - sequence_metadata const & reference, + database_metadata const & reference, std::ios_base::openmode const mode /* = std::ios_base::in */) { std::vector matches; diff --git a/src/valik_consolidate.cpp b/src/valik_consolidate.cpp index 8012d77a..ed9ee849 100644 --- a/src/valik_consolidate.cpp +++ b/src/valik_consolidate.cpp @@ -5,7 +5,7 @@ namespace valik::app void valik_consolidate(consolidation_arguments const & arguments) { - sequence_metadata reference(arguments.ref_meta_path, false); + database_metadata reference(arguments.ref_meta_path, false); auto matches = read_stellar_output(arguments.matches_in, reference); diff --git a/src/valik_search.cpp b/src/valik_search.cpp index 07701b81..dbcde837 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -5,8 +5,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -98,13 +98,13 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim sync_out synced_out{arguments.out_file}; auto queue = cart_queue{index.ibf().bin_count(), arguments.cart_max_capacity, arguments.max_queued_carts}; - std::optional segments; + std::optional segments; if (!arguments.seg_path.empty()) - segments = reference_segments(arguments.seg_path); + segments = database_segments(arguments.seg_path); - std::optional ref_meta; + std::optional ref_meta; if (!arguments.ref_meta_path.empty()) - ref_meta = sequence_metadata(arguments.ref_meta_path, false); + ref_meta = database_metadata(arguments.ref_meta_path, false); //!WORKAROUND: Stellar does not allow smaller error rates double er_rate = std::max((double) arguments.errors / (double) arguments.pattern_size, 0.00001); @@ -216,7 +216,7 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim { threadOptions.searchSegment = true; auto seg = segments->segment_from_bin(bin_id); - threadOptions.binSequences.emplace_back(seg.ref_ind); + threadOptions.binSequences.emplace_back(seg.seq_ind); threadOptions.segmentBegin = seg.start; threadOptions.segmentEnd = seg.start + seg.len; } @@ -405,7 +405,7 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim auto seg = segments->segment_from_bin(bin_id); process_args.insert(process_args.end(), {index.bin_path()[0][0], std::string(cart_queries_path), "--referenceLength", std::to_string(ref_len), - "--sequenceOfInterest", std::to_string(seg.ref_ind), + "--sequenceOfInterest", std::to_string(seg.seq_ind), "--segmentBegin", std::to_string(seg.start), "--segmentEnd", std::to_string(seg.start + seg.len)}); } diff --git a/src/valik_split.cpp b/src/valik_split.cpp index 287eda52..adab7f9e 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -1,7 +1,7 @@ #include #include -#include -#include +#include +#include #include namespace valik::app @@ -15,11 +15,11 @@ namespace valik::app void valik_split(split_arguments const & arguments) { // Linear scan over reference file to extract metadata - sequence_metadata reference(arguments.ref_file, true); + database_metadata reference(arguments.ref_file, true); reference.to_file(arguments.ref_out); // For each segment assign start, length and bin number - reference_segments segments(arguments.bins, arguments.overlap, reference); + database_segments segments(arguments.bins, arguments.overlap, reference); segments.to_file(arguments.seg_out); if (arguments.write_seg) diff --git a/test/api/valik/split/write_seg_sequences_test.cpp b/test/api/valik/split/write_seg_sequences_test.cpp index 5922701d..2445b91d 100644 --- a/test/api/valik/split/write_seg_sequences_test.cpp +++ b/test/api/valik/split/write_seg_sequences_test.cpp @@ -34,14 +34,14 @@ static void const test_write_out(size_t overlap, size_t bins) { std::string path_prefix = "write_out_" + std::to_string(overlap) + "_" + std::to_string(bins); - valik::sequence_metadata reference(data(path_prefix + "_reference_metadata.txt"), false); - valik::reference_segments segments(data(path_prefix + "_reference_segments.txt")); + valik::database_metadata reference(data(path_prefix + "_reference_metadata.txt"), false); + valik::database_segments segments(data(path_prefix + "_reference_segments.txt")); valik::write_seg_sequences(reference, segments, data(path_prefix + "_ref.fasta")); for (size_t i = 0; i < bins - 1; i++) { - valik::reference_segments::segment current_seg = segments.members[i]; - valik::reference_segments::segment next_seg = segments.members[i + 1]; + valik::database_segments::segment current_seg = segments.members[i]; + valik::database_segments::segment next_seg = segments.members[i + 1]; std::string current_seg_seq = string_from_file(data(path_prefix + "_ref_" + std::to_string(i) + ".fasta"), std::ios::binary); std::string next_seg_seq = string_from_file(data(path_prefix + "_ref_" + std::to_string(i + 1) + ".fasta"), std::ios::binary); @@ -51,7 +51,7 @@ static void const test_write_out(size_t overlap, size_t bins) EXPECT_EQ(current_seg_seq.size(), current_seg.len); EXPECT_EQ(next_seg_seq.size(), next_seg.len); - if (current_seg.ref_ind == next_seg.ref_ind) + if (current_seg.seq_ind == next_seg.seq_ind) { EXPECT_EQ(current_seg_seq.substr(current_seg_seq.size() - overlap), next_seg_seq.substr(0, overlap)); } diff --git a/test/cli/cli_test.hpp b/test/cli/cli_test.hpp index 70a7f4f5..f2ac68f6 100644 --- a/test/cli/cli_test.hpp +++ b/test/cli/cli_test.hpp @@ -7,7 +7,7 @@ #include // strings #include -#include +#include #include #include diff --git a/test/cli/dream_test.cpp b/test/cli/dream_test.cpp index 06051bdf..71c9112e 100644 --- a/test/cli/dream_test.cpp +++ b/test/cli/dream_test.cpp @@ -18,7 +18,7 @@ TEST_P(dream_search, shared_mem) setenv("VALIK_MERGE", "cat", true); std::filesystem::path ref_meta_path = data("ref_meta.txt"); - valik::sequence_metadata reference(ref_meta_path, false); + valik::database_metadata reference(ref_meta_path, false); std::filesystem::path seg_meta_path = data("seg_meta150overlap" + std::to_string(number_of_bins) + "bins.txt"); std::filesystem::path index_path = ibf_path(number_of_bins, window_size); diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index defedfb3..8dc24cba 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -22,7 +22,7 @@ TEST_P(valik_split, split) "--seg-meta reference_segments.txt"); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{"Reference sequence: chr5 is too short and will be skipped.\n"}); + EXPECT_EQ(result.err, std::string{"Sequence: chr5 is too short and will be skipped.\n"}); std::string const expected_metadata = string_from_file(data("chromosome_metadata.txt"), std::ios::binary); std::string const actual_metadata = string_from_file("reference_metadata.txt", std::ios::binary); @@ -240,7 +240,7 @@ TEST_P(valik_consolidate, consolidation) auto const [number_of_bins, segment_overlap] = GetParam(); std::filesystem::path ref_meta_path = consolidation_meta_path(number_of_bins, segment_overlap); - valik::sequence_metadata reference(ref_meta_path, false); + valik::database_metadata reference(ref_meta_path, false); cli_test_result const result = execute_app("valik", "consolidate", "--input ", consolidation_input_path(number_of_bins, segment_overlap), From 61324b66f417905bc90891ebedd197ffd9206149 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Mon, 7 Aug 2023 15:32:15 +0200 Subject: [PATCH 07/11] Write out query segments --- include/valik/shared.hpp | 9 +- include/valik/split/write_seg_sequences.hpp | 54 ++--------- src/CMakeLists.txt | 4 + src/argument_parsing/split.cpp | 25 +++-- src/split/write_seg_sequences.cpp | 78 ++++++++++++++++ src/valik_split.cpp | 12 ++- test/api/valik/split/CMakeLists.txt | 5 +- .../valik/split/write_seg_sequences_test.cpp | 91 ++++++++++++++++--- test/cli/valik_options_test.cpp | 6 +- test/cli/valik_test.cpp | 2 +- test/data/create_output.sh | 4 +- test/data/datasources.cmake | 13 +-- test/data/split/api_test_input.sh | 29 +++--- test/data/split/cli_test_output.sh | 4 +- .../ref.fasta => database.fasta} | 0 test/data/split/write_out_0_4/ref.fasta | 26 ------ test/data/split/write_out_20_16/ref.fasta | 26 ------ test/data/split/write_out_20_4/ref.fasta | 26 ------ 18 files changed, 223 insertions(+), 191 deletions(-) create mode 100644 src/split/write_seg_sequences.cpp rename test/data/split/{write_out_0_16/ref.fasta => database.fasta} (100%) delete mode 100644 test/data/split/write_out_0_4/ref.fasta delete mode 100644 test/data/split/write_out_20_16/ref.fasta delete mode 100644 test/data/split/write_out_20_4/ref.fasta diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index f5d79e7e..22b55071 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -36,13 +36,14 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna struct split_arguments { - std::filesystem::path ref_file{}; - std::filesystem::path ref_out{"reference_metadata.txt"}; + std::filesystem::path db_file{}; + std::filesystem::path db_out{"reference_metadata.txt"}; std::filesystem::path seg_out{"reference_segments.txt"}; size_t overlap{150}; - size_t bins{64}; - bool write_seg{false}; + size_t seg_count{64}; + bool write_ref{false}; + bool write_query{false}; }; struct build_arguments diff --git a/include/valik/split/write_seg_sequences.hpp b/include/valik/split/write_seg_sequences.hpp index 77a4e2af..0b2809bc 100644 --- a/include/valik/split/write_seg_sequences.hpp +++ b/include/valik/split/write_seg_sequences.hpp @@ -7,50 +7,14 @@ namespace valik { -// Note: template functions should live in .cpp files. This solution to avoid linker errors may cause code bloat. -template -void write_seg_sequences(database_metadata const & reference, database_segments & segments, std::filesystem::path const & ref_path) -{ - using sequence_file_t = seqan3::sequence_file_input>; - - using types = seqan3::type_list, std::string>; - using fields = seqan3::fields; - using sequence_record_type = seqan3::sequence_record; - - std::filesystem::path seg_path_file = ref_path; - seg_path_file.replace_filename("seg_files.txt"); - - std::ofstream file_paths_out; - file_paths_out.open(seg_path_file); - - int i = 0; - for (auto && [seq] : sequence_file_t{ref_path}) - { - // get the relevant segments for each reference - auto ref_seg = [&](database_segments::segment & seg) {return reference.sequences.at(i).ind == seg.seq_ind;}; - for (auto & seg : segments.members | std::views::filter(ref_seg)) - { - std::filesystem::path seg_file = ref_path; - std::filesystem::path seg_stem = seg_file.stem(); - seg_stem += "_"; - seg_stem += std::to_string(seg.id); - seg_stem += seg_file.extension(); - seg_file.replace_filename(seg_stem); - file_paths_out << seg_file.string() << '\n'; - - std::string id{seg.unique_id()}; - seqan3::dna4_vector seg_sequence(&seq[seg.start], &seq[seg.start+seg.len]); - assert(seg_sequence.size() == seg.len); - - seqan3::sequence_file_output seg_out{seg_file}; - - sequence_record_type record{std::move(seg_sequence), std::move(id)}; - seg_out.push_back(record); - } - i++; - } - - file_paths_out.close(); -} +/** !\brief Function that writes an output FASTA file for each segment sequence. */ +void write_reference_segments(database_metadata const & reference_metadata, + database_segments & segments, + std::filesystem::path const & ref_path); + +/** !\brief Function that writes segment sequences into a single FASTA file. */ +void write_query_segments(database_metadata const & query_metadata, + database_segments & segments, + std::filesystem::path const & query_path); } // namespace valik diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 941d4835..f079bca3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,9 @@ target_compile_options ("${PROJECT_NAME}_interface" INTERFACE "-pedantic" "-Wall add_library ("${PROJECT_NAME}_split_lib" STATIC valik_split.cpp) target_link_libraries ("${PROJECT_NAME}_split_lib" PUBLIC "${PROJECT_NAME}_interface") +add_library ("${PROJECT_NAME}_split_write_lib" STATIC split/write_seg_sequences.cpp) +target_link_libraries ("${PROJECT_NAME}_split_write_lib" PUBLIC "${PROJECT_NAME}_interface") + # Valik build add_library ("${PROJECT_NAME}_build_lib" STATIC valik_build.cpp) target_link_libraries ("${PROJECT_NAME}_build_lib" PUBLIC "${PROJECT_NAME}_interface") @@ -71,6 +74,7 @@ target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_shared_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_argument_parsing_top_level_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_split_lib") +target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_split_write_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_build_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_search_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_consolidation_lib") diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index 11beb69e..bdad2cce 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -9,13 +9,13 @@ namespace valik::app void init_split_parser(sharg::parser & parser, split_arguments & arguments) { init_shared_meta(parser); - parser.add_positional_option(arguments.ref_file, - sharg::config{.description = "File containing reference sequences.", + parser.add_positional_option(arguments.db_file, + sharg::config{.description = "File containing database sequences.", .validator = sharg::input_file_validator{}}); - parser.add_option(arguments.ref_out, + parser.add_option(arguments.db_out, sharg::config{.short_id = '\0', - .long_id = "ref-meta", - .description = "Please provide a valid path to the reference metadata output.", + .long_id = "db-meta", + .description = "Please provide a valid path to the database metadata output.", .required = true, .validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create}}); parser.add_option(arguments.seg_out, @@ -29,15 +29,20 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) .long_id = "overlap", .description = "Choose how much consecutive segments overlap.", .validator = positive_integer_validator{true}}); - parser.add_option(arguments.bins, + parser.add_option(arguments.seg_count, sharg::config{.short_id = '\0', .long_id = "bins", - .description = "Number of bins in the IBF. Multiples of 64 lead to better performance.", + .description = "Dividing the database into this many segments. Corresponds to IBF bin count for the reference sequence so that multiples of 64 lead to better performance.", .validator = sharg::arithmetic_range_validator{1, 29952}}); - parser.add_flag(arguments.write_seg, + parser.add_flag(arguments.write_ref, sharg::config{.short_id = '\0', - .long_id = "write-out", - .description = "Write segment sequences to disk."}); + .long_id = "write-ref", + .description = "Write an output FASTA file for each segment."}); + parser.add_flag(arguments.write_query, + sharg::config{.short_id = '\0', + .long_id = "write-query", + .description = "Write segments into a single output FASTA file."}); + } void run_split(sharg::parser & parser) diff --git a/src/split/write_seg_sequences.cpp b/src/split/write_seg_sequences.cpp new file mode 100644 index 00000000..671e601c --- /dev/null +++ b/src/split/write_seg_sequences.cpp @@ -0,0 +1,78 @@ +#include + +namespace valik +{ + +using sequence_file_t = seqan3::sequence_file_input>; + +using types = seqan3::type_list, std::string>; +using fields = seqan3::fields; +using sequence_record_type = seqan3::sequence_record; + +void write_reference_segments(database_metadata const & reference_metadata, + database_segments & segments, + std::filesystem::path const & ref_path) +{ + std::filesystem::path seg_path_file = ref_path; + seg_path_file.replace_filename("seg_files.txt"); + + std::ofstream file_paths_out; + file_paths_out.open(seg_path_file); + + int i = 0; + for (auto && [seq] : sequence_file_t{ref_path}) + { + // get the underlying sequence + auto is_segment_sequence = [&](database_segments::segment & seg) {return reference_metadata.sequences.at(i).ind == seg.seq_ind;}; + for (auto & seg : segments.members | std::views::filter(is_segment_sequence)) + { + std::filesystem::path seg_file = ref_path; + std::filesystem::path seg_stem = seg_file.stem(); + seg_stem += "_"; + seg_stem += std::to_string(seg.id); + seg_stem += seg_file.extension(); + seg_file.replace_filename(seg_stem); + file_paths_out << seg_file.string() << '\n'; + + std::string id{seg.unique_id()}; + seqan3::dna4_vector seg_sequence(&seq[seg.start], &seq[seg.start+seg.len]); + assert(seg_sequence.size() == seg.len); + + seqan3::sequence_file_output seg_out{seg_file}; + + sequence_record_type record{std::move(seg_sequence), std::move(id)}; + seg_out.push_back(record); + } + i++; + } + + file_paths_out.close(); +} + +void write_query_segments(database_metadata const & query_metadata, + database_segments & segments, + std::filesystem::path const & query_path) +{ + std::filesystem::path seg_out_path = query_path; + seg_out_path.replace_extension("segments.fasta"); + seqan3::sequence_file_output seg_out{seg_out_path}; + + int i = 0; + for (auto && [seq] : sequence_file_t{query_path}) + { + // get the underlying sequence + auto is_segment_sequence = [&](database_segments::segment & seg) {return query_metadata.sequences.at(i).ind == seg.seq_ind;}; + for (auto & seg : segments.members | std::views::filter(is_segment_sequence)) + { + std::string id{seg.unique_id()}; + seqan3::dna4_vector seg_sequence(&seq[seg.start], &seq[seg.start+seg.len]); + assert(seg_sequence.size() == seg.len); + + sequence_record_type record{std::move(seg_sequence), std::move(id)}; + seg_out.push_back(record); + } + i++; + } +} + +} // namespace valik diff --git a/src/valik_split.cpp b/src/valik_split.cpp index adab7f9e..7ec4da88 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -15,15 +15,17 @@ namespace valik::app void valik_split(split_arguments const & arguments) { // Linear scan over reference file to extract metadata - database_metadata reference(arguments.ref_file, true); - reference.to_file(arguments.ref_out); + database_metadata database(arguments.db_file, true); + database.to_file(arguments.db_out); // For each segment assign start, length and bin number - database_segments segments(arguments.bins, arguments.overlap, reference); + database_segments segments(arguments.seg_count, arguments.overlap, database); segments.to_file(arguments.seg_out); - if (arguments.write_seg) - write_seg_sequences(reference, segments, arguments.ref_file); + if (arguments.write_ref) + write_reference_segments(database, segments, arguments.db_file); + if (arguments.write_query) + write_query_segments(database, segments, arguments.db_file); } } // namespace valik::app diff --git a/test/api/valik/split/CMakeLists.txt b/test/api/valik/split/CMakeLists.txt index 057db795..376f4f37 100644 --- a/test/api/valik/split/CMakeLists.txt +++ b/test/api/valik/split/CMakeLists.txt @@ -1,15 +1,12 @@ add_api_test (write_seg_sequences_test.cpp) +target_use_datasources (write_seg_sequences_test FILES database.fasta) target_use_datasources (write_seg_sequences_test FILES write_out_0_16_reference_metadata.txt) target_use_datasources (write_seg_sequences_test FILES write_out_0_16_reference_segments.txt) -target_use_datasources (write_seg_sequences_test FILES write_out_0_16_ref.fasta) target_use_datasources (write_seg_sequences_test FILES write_out_0_4_reference_metadata.txt) target_use_datasources (write_seg_sequences_test FILES write_out_0_4_reference_segments.txt) -target_use_datasources (write_seg_sequences_test FILES write_out_0_4_ref.fasta) target_use_datasources (write_seg_sequences_test FILES write_out_20_16_reference_metadata.txt) target_use_datasources (write_seg_sequences_test FILES write_out_20_16_reference_segments.txt) target_use_datasources (write_seg_sequences_test FILES write_out_20_16_reference_segments.txt) -target_use_datasources (write_seg_sequences_test FILES write_out_20_16_ref.fasta) target_use_datasources (write_seg_sequences_test FILES write_out_20_4_reference_metadata.txt) target_use_datasources (write_seg_sequences_test FILES write_out_20_4_reference_segments.txt) -target_use_datasources (write_seg_sequences_test FILES write_out_20_4_ref.fasta) diff --git a/test/api/valik/split/write_seg_sequences_test.cpp b/test/api/valik/split/write_seg_sequences_test.cpp index 2445b91d..56e767c7 100644 --- a/test/api/valik/split/write_seg_sequences_test.cpp +++ b/test/api/valik/split/write_seg_sequences_test.cpp @@ -3,6 +3,7 @@ #include #include +#include // Generate the full path of a test input file that is provided in the data directory. static std::filesystem::path data(std::string const & filename) @@ -30,21 +31,21 @@ static void trim_fasta_id(std::string & seg_seq) seg_seq.end()); } -static void const test_write_out(size_t overlap, size_t bins) +static void const test_reference_out(size_t overlap, size_t bins) { std::string path_prefix = "write_out_" + std::to_string(overlap) + "_" + std::to_string(bins); valik::database_metadata reference(data(path_prefix + "_reference_metadata.txt"), false); valik::database_segments segments(data(path_prefix + "_reference_segments.txt")); - valik::write_seg_sequences(reference, segments, data(path_prefix + "_ref.fasta")); + valik::write_reference_segments(reference, segments, data("database.fasta")); for (size_t i = 0; i < bins - 1; i++) { valik::database_segments::segment current_seg = segments.members[i]; valik::database_segments::segment next_seg = segments.members[i + 1]; - std::string current_seg_seq = string_from_file(data(path_prefix + "_ref_" + std::to_string(i) + ".fasta"), std::ios::binary); - std::string next_seg_seq = string_from_file(data(path_prefix + "_ref_" + std::to_string(i + 1) + ".fasta"), std::ios::binary); + std::string current_seg_seq = string_from_file(data("database_" + std::to_string(i) + ".fasta"), std::ios::binary); + std::string next_seg_seq = string_from_file(data("database_" + std::to_string(i + 1) + ".fasta"), std::ios::binary); trim_fasta_id(current_seg_seq); trim_fasta_id(next_seg_seq); @@ -58,34 +59,100 @@ static void const test_write_out(size_t overlap, size_t bins) } } -TEST(write_seg_sequences, o0_b4) +TEST(write_ref_sequences, o0_b4) { size_t overlap = 0; size_t bins = 4; - test_write_out(overlap, bins); + test_reference_out(overlap, bins); } -TEST(write_seg_sequences, o20_b4) +TEST(write_ref_sequences, o20_b4) { size_t overlap = 20; size_t bins = 4; - test_write_out(overlap, bins); + test_reference_out(overlap, bins); } -TEST(write_seg_sequences, o0_b16) +TEST(write_ref_sequences, o0_b16) { size_t overlap = 0; size_t bins = 16; - test_write_out(overlap, bins); + test_reference_out(overlap, bins); } -TEST(write_seg_sequences, o20_b16) +TEST(write_ref_sequences, o20_b16) { size_t overlap = 20; size_t bins = 16; - test_write_out(overlap, bins); + test_reference_out(overlap, bins); +} + +static void const test_query_out(size_t overlap, size_t bins) +{ + std::string path_prefix = "write_out_" + std::to_string(overlap) + "_" + std::to_string(bins); + + valik::database_metadata reference(data(path_prefix + "_reference_metadata.txt"), false); + valik::database_segments segments(data(path_prefix + "_reference_segments.txt")); + valik::write_query_segments(reference, segments, data("database.fasta")); + + using sequence_file_t = seqan3::sequence_file_input>; + + size_t i{0}; + seqan3::dna4_vector previous_seg_seq; + for (auto && [current_seg_seq] : sequence_file_t{data("database.segments.fasta")}) + { + if (i > 1) + { + valik::database_segments::segment previous_seg = segments.members[i - 1]; + valik::database_segments::segment current_seg = segments.members[i]; + + EXPECT_EQ(previous_seg_seq.size(), previous_seg.len); + EXPECT_EQ(current_seg_seq.size(), current_seg.len); + + if (previous_seg.seq_ind == current_seg.seq_ind) + { + EXPECT_RANGE_EQ(std::vector(previous_seg_seq.end() - overlap, previous_seg_seq.end()), + std::vector(current_seg_seq.begin(), current_seg_seq.begin()+ overlap)); + } + } + + i++; + previous_seg_seq = current_seg_seq; + } +} + +TEST(write_query_sequences, o0_b4) +{ + size_t overlap = 0; + size_t bins = 4; + + test_query_out(overlap, bins); +} + +TEST(write_query_sequences, o20_b4) +{ + size_t overlap = 20; + size_t bins = 4; + + test_query_out(overlap, bins); +} + +TEST(write_query_sequences, o0_b16) +{ + size_t overlap = 0; + size_t bins = 16; + + test_query_out(overlap, bins); +} + +TEST(write_query_sequences, o20_b16) +{ + size_t overlap = 20; + size_t bins = 16; + + test_query_out(overlap, bins); } diff --git a/test/cli/valik_options_test.cpp b/test/cli/valik_options_test.cpp index 0a094517..ec24fb16 100644 --- a/test/cli/valik_options_test.cpp +++ b/test/cli/valik_options_test.cpp @@ -101,7 +101,7 @@ TEST_F(argparse_split, input_missing) { cli_test_result const result = execute_app("valik", "split", "--seg-meta seg", - "--ref-meta ref"); + "--db-meta ref"); EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{"[Error] Not enough positional arguments provided (Need at least 1). See -h/--help for more information.\n"}); @@ -112,7 +112,7 @@ TEST_F(argparse_split, input_invalid) cli_test_result const result = execute_app("valik", "split", "nonexistent", "--seg-meta seg", - "--ref-meta ref"); + "--db-meta ref"); EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{"[Error] Validation failed for positional option 1: The file \"nonexistent\" does not exist!\n"}); @@ -123,7 +123,7 @@ TEST_F(argparse_split, no_bins) cli_test_result const result = execute_app("valik", "split", "dummy.fasta", "--seg-meta seg", - "--ref-meta ref", + "--db-meta ref", "--bins 0"); EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index 8dc24cba..a27d4500 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -18,7 +18,7 @@ TEST_P(valik_split, split) data("various_chromosome_lengths.fasta"), "--overlap ", std::to_string(overlap), "--bins ", std::to_string(bins), - "--ref-meta reference_metadata.txt", + "--db-meta reference_metadata.txt", "--seg-meta reference_segments.txt"); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); diff --git a/test/data/create_output.sh b/test/data/create_output.sh index ce89ba53..fc06b114 100755 --- a/test/data/create_output.sh +++ b/test/data/create_output.sh @@ -3,7 +3,7 @@ cd "$(dirname "$0")" set -Eeuo pipefail if ! which valik 2> /dev/null; then - echo "valik not found. please compile it and add it the path to it to the PATH variable" + echo "valik not found. please compile it and add its path to the PATH variable" exit 255 fi @@ -29,4 +29,4 @@ export VALIK_MERGE=cat echo "### Running distributed DREAM-Stellar ###" ./dream/cli_test_output.sh -echo "### Finished ###" +echo "### Finished ###" \ No newline at end of file diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index 5f32506f..99f58529 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -74,8 +74,8 @@ declare_datasource (FILE write_out_0_16_reference_metadata.txt declare_datasource (FILE write_out_0_16_reference_segments.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_16/reference_segments.txt URL_HASH SHA256=f41420e802d66d96bb3d2246d459791ebe5796da669600552a0c3eb43082345f) -declare_datasource (FILE write_out_0_16_ref.fasta - URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_16/ref.fasta +declare_datasource (FILE database.fasta + URL ${CMAKE_SOURCE_DIR}/test/data/split/database.fasta URL_HASH SHA256=7c7a8fcdd52a932cda76219f24024c1624292377103d9fd5a55abd288c6072be) declare_datasource (FILE write_out_0_4_reference_metadata.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_4/reference_metadata.txt @@ -83,27 +83,18 @@ declare_datasource (FILE write_out_0_4_reference_metadata.txt declare_datasource (FILE write_out_0_4_reference_segments.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_4/reference_segments.txt URL_HASH SHA256=171e201e20f2dc9d5d855b753fc7bf6abe7297ded613e727c0ed1dd69bbaf02f) -declare_datasource (FILE write_out_0_4_ref.fasta - URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_0_4/ref.fasta - URL_HASH SHA256=7c7a8fcdd52a932cda76219f24024c1624292377103d9fd5a55abd288c6072be) declare_datasource (FILE write_out_20_16_reference_metadata.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_20_16/reference_metadata.txt URL_HASH SHA256=368803a8d29419321ba9704bc7cbd52abf6f7b2f528d725ed54a5ecadf5c6ae3) declare_datasource (FILE write_out_20_16_reference_segments.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_20_16/reference_segments.txt URL_HASH SHA256=32a40e7211cd45832506f16d0b3a613a222efd0783463978a50aa85ecb302837) -declare_datasource (FILE write_out_20_16_ref.fasta - URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_20_16/ref.fasta - URL_HASH SHA256=7c7a8fcdd52a932cda76219f24024c1624292377103d9fd5a55abd288c6072be) declare_datasource (FILE write_out_20_4_reference_metadata.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_20_4/reference_metadata.txt URL_HASH SHA256=368803a8d29419321ba9704bc7cbd52abf6f7b2f528d725ed54a5ecadf5c6ae3) declare_datasource (FILE write_out_20_4_reference_segments.txt URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_20_4/reference_segments.txt URL_HASH SHA256=171e201e20f2dc9d5d855b753fc7bf6abe7297ded613e727c0ed1dd69bbaf02f) -declare_datasource (FILE write_out_20_4_ref.fasta - URL ${CMAKE_SOURCE_DIR}/test/data/split/write_out_20_4/ref.fasta - URL_HASH SHA256=7c7a8fcdd52a932cda76219f24024c1624292377103d9fd5a55abd288c6072be) declare_datasource (FILE 8bins19window.ibf URL ${CMAKE_SOURCE_DIR}/test/data/build/8bins19window.ibf URL_HASH SHA256=3a13c890650bf857770816244ed9420295ad8bbe681dac335f687863fc79a603) diff --git a/test/data/split/api_test_input.sh b/test/data/split/api_test_input.sh index e9624d0c..6866c501 100755 --- a/test/data/split/api_test_input.sh +++ b/test/data/split/api_test_input.sh @@ -6,26 +6,27 @@ set -Eeuo pipefail SEED=${1} +i=1 +for length in 210 490 420 280 4 +do + echo "Simulating chromosome with length $length" + chr_out="chr"$i".fasta" + mason_genome -l $length -o $chr_out -s $SEED &> /dev/null + + sed -i "s/^>.*$/>chr$i/g" $chr_out + let i=i+1 +done + +cat chr*.fasta > database.fasta +rm chr*.fasta + for overlap in 0 20 do for bins in 4 16 do - i=1 - for length in 210 490 420 280 4 - do - echo "Simulating chromosome with length $length" - chr_out="chr"$i".fasta" - mason_genome -l $length -o $chr_out -s $SEED &> /dev/null - - sed -i "s/^>.*$/>chr$i/g" $chr_out - let i=i+1 - done - out_dir=write_out_${overlap}_${bins} mkdir -p ${out_dir} - cat chr*.fasta > ${out_dir}/ref.fasta - rm chr*.fasta - valik split ${out_dir}/ref.fasta --overlap ${overlap} --bins ${bins} --ref-meta ${out_dir}/reference_metadata.txt --seg-meta ${out_dir}/reference_segments.txt + valik split database.fasta --overlap ${overlap} --bins ${bins} --ref-meta ${out_dir}/reference_metadata.txt --seg-meta ${out_dir}/reference_segments.txt done done diff --git a/test/data/split/cli_test_output.sh b/test/data/split/cli_test_output.sh index 98c59ea2..c897097e 100755 --- a/test/data/split/cli_test_output.sh +++ b/test/data/split/cli_test_output.sh @@ -19,7 +19,7 @@ do echo "Splitting the genome into $b segments that overlap by $o" ref_meta="multi/chromosome_"$o"overlap"$b"bins.txt" seg_meta="multi/"$o"overlap"$b"bins.txt" - valik split "$seg_input" --overlap "$o" --bins "$b" --ref-meta "$ref_meta" --seg-meta "$seg_meta" + valik split "$seg_input" --overlap "$o" --bins "$b" --db-meta "$ref_meta" --seg-meta "$seg_meta" done done @@ -53,7 +53,7 @@ do echo "Splitting the genome into $b segments that overlap by $seg_overlap" ref_meta="single/ref_"$seg_overlap"overlap"$b"bins.txt" seg_meta="single/"$seg_overlap"overlap"$b"bins.txt" - valik split "$ref_input" --overlap "$seg_overlap" --bins "$b" --ref-meta "$ref_meta" --seg-meta "$seg_meta" + valik split "$ref_input" --overlap "$seg_overlap" --bins "$b" --db-meta "$ref_meta" --seg-meta "$seg_meta" for w in 13 15 do diff --git a/test/data/split/write_out_0_16/ref.fasta b/test/data/split/database.fasta similarity index 100% rename from test/data/split/write_out_0_16/ref.fasta rename to test/data/split/database.fasta diff --git a/test/data/split/write_out_0_4/ref.fasta b/test/data/split/write_out_0_4/ref.fasta deleted file mode 100644 index 9c3f1269..00000000 --- a/test/data/split/write_out_0_4/ref.fasta +++ /dev/null @@ -1,26 +0,0 @@ ->chr1 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC ->chr2 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA -TAGCGCATTTCACGCTCTCTACGAATGACCGCAACGATCAAATGGGCGAGAACAACTAATTCCGATTCAT -GGGGTTTGTGGATTGTGACACAGCGCGCCCGCTACTGCGGGACGTGAGGACGCCCAATTCTGCCAAGGAT -TATTTAGGGTGTTTCACTAGAGTTATGCGCCGACCCCGGTTGGACCAGCTTGCATTCGAAACTGCGTTAC ->chr3 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA -TAGCGCATTTCACGCTCTCTACGAATGACCGCAACGATCAAATGGGCGAGAACAACTAATTCCGATTCAT -GGGGTTTGTGGATTGTGACACAGCGCGCCCGCTACTGCGGGACGTGAGGACGCCCAATTCTGCCAAGGAT ->chr4 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA ->chr5 -TATG diff --git a/test/data/split/write_out_20_16/ref.fasta b/test/data/split/write_out_20_16/ref.fasta deleted file mode 100644 index 9c3f1269..00000000 --- a/test/data/split/write_out_20_16/ref.fasta +++ /dev/null @@ -1,26 +0,0 @@ ->chr1 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC ->chr2 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA -TAGCGCATTTCACGCTCTCTACGAATGACCGCAACGATCAAATGGGCGAGAACAACTAATTCCGATTCAT -GGGGTTTGTGGATTGTGACACAGCGCGCCCGCTACTGCGGGACGTGAGGACGCCCAATTCTGCCAAGGAT -TATTTAGGGTGTTTCACTAGAGTTATGCGCCGACCCCGGTTGGACCAGCTTGCATTCGAAACTGCGTTAC ->chr3 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA -TAGCGCATTTCACGCTCTCTACGAATGACCGCAACGATCAAATGGGCGAGAACAACTAATTCCGATTCAT -GGGGTTTGTGGATTGTGACACAGCGCGCCCGCTACTGCGGGACGTGAGGACGCCCAATTCTGCCAAGGAT ->chr4 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA ->chr5 -TATG diff --git a/test/data/split/write_out_20_4/ref.fasta b/test/data/split/write_out_20_4/ref.fasta deleted file mode 100644 index 9c3f1269..00000000 --- a/test/data/split/write_out_20_4/ref.fasta +++ /dev/null @@ -1,26 +0,0 @@ ->chr1 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC ->chr2 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA -TAGCGCATTTCACGCTCTCTACGAATGACCGCAACGATCAAATGGGCGAGAACAACTAATTCCGATTCAT -GGGGTTTGTGGATTGTGACACAGCGCGCCCGCTACTGCGGGACGTGAGGACGCCCAATTCTGCCAAGGAT -TATTTAGGGTGTTTCACTAGAGTTATGCGCCGACCCCGGTTGGACCAGCTTGCATTCGAAACTGCGTTAC ->chr3 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA -TAGCGCATTTCACGCTCTCTACGAATGACCGCAACGATCAAATGGGCGAGAACAACTAATTCCGATTCAT -GGGGTTTGTGGATTGTGACACAGCGCGCCCGCTACTGCGGGACGTGAGGACGCCCAATTCTGCCAAGGAT ->chr4 -TATGCACCAGAGTATGGAAGCATAAGCTCTGCATGCAAAGGTACATCAGATCCTGCGGTTGGGTGCCAAC -CCAAGTGTGTTCACGGGCGCTTGACAGACATCGGAGGATGGTGCACACTCACTCGACCAGCGCAAAGCAC -AGGATCTCACGGGCGGACATCTCTTAGGTCAGTCATCGTGGAGGAATGCTTGTACGTTCTTTTGGCTTCC -CCTAACACGGCGGGCGTCTCCGGTACGTATCCTGTCGGTACACCCCTTAAGCCCCTAGGCCCGAAGAACA ->chr5 -TATG From 90f05e495851ca9ccf65acac2d28d25869184df5 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Wed, 9 Aug 2023 10:27:38 +0200 Subject: [PATCH 08/11] Rename bins -> seg-count --- src/argument_parsing/split.cpp | 2 +- src/valik_split.cpp | 2 +- test/cli/valik_options_test.cpp | 6 +++--- test/cli/valik_test.cpp | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index bdad2cce..c38ec50e 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -31,7 +31,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) .validator = positive_integer_validator{true}}); parser.add_option(arguments.seg_count, sharg::config{.short_id = '\0', - .long_id = "bins", + .long_id = "seg-count", .description = "Dividing the database into this many segments. Corresponds to IBF bin count for the reference sequence so that multiples of 64 lead to better performance.", .validator = sharg::arithmetic_range_validator{1, 29952}}); parser.add_flag(arguments.write_ref, diff --git a/src/valik_split.cpp b/src/valik_split.cpp index 7ec4da88..a6e9ecf7 100644 --- a/src/valik_split.cpp +++ b/src/valik_split.cpp @@ -9,7 +9,7 @@ namespace valik::app //----------------------------- // -// Divide reference database into partially overlapping segments. +// Divide reference or query database into partially overlapping segments. // //----------------------------- void valik_split(split_arguments const & arguments) diff --git a/test/cli/valik_options_test.cpp b/test/cli/valik_options_test.cpp index ec24fb16..8c686d86 100644 --- a/test/cli/valik_options_test.cpp +++ b/test/cli/valik_options_test.cpp @@ -118,16 +118,16 @@ TEST_F(argparse_split, input_invalid) EXPECT_EQ(result.err, std::string{"[Error] Validation failed for positional option 1: The file \"nonexistent\" does not exist!\n"}); } -TEST_F(argparse_split, no_bins) +TEST_F(argparse_split, no_seg_count) { cli_test_result const result = execute_app("valik", "split", "dummy.fasta", "--seg-meta seg", "--db-meta ref", - "--bins 0"); + "--seg-count 0"); EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option --bins: Value 0 is not in range [1,29952].\n"}); + EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option --seg-count: Value 0 is not in range [1,29952].\n"}); } TEST_F(argparse_build, input_missing) diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index a27d4500..08793220 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -12,12 +12,12 @@ TEST_P(valik_split, split) { - auto const [overlap, bins] = GetParam(); + auto const [overlap, seg_count] = GetParam(); cli_test_result const result = execute_app("valik", "split", data("various_chromosome_lengths.fasta"), "--overlap ", std::to_string(overlap), - "--bins ", std::to_string(bins), + "--seg-count ", std::to_string(seg_count), "--db-meta reference_metadata.txt", "--seg-meta reference_segments.txt"); EXPECT_EQ(result.exit_code, 0); @@ -29,7 +29,7 @@ TEST_P(valik_split, split) EXPECT_TRUE(expected_metadata == actual_metadata); - std::string const expected_segments = string_from_file(segment_metadata_path(overlap, bins), std::ios::binary); + std::string const expected_segments = string_from_file(segment_metadata_path(overlap, seg_count), std::ios::binary); std::string const actual_segments = string_from_file("reference_segments.txt", std::ios::binary); EXPECT_TRUE(expected_segments == actual_segments); From d5df235049805cff552b1180ac46892f496084a5 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Tue, 15 Aug 2023 17:29:51 +0200 Subject: [PATCH 09/11] Partial specialized template class approach --- .../utilities/consolidate/merge_processes.hpp | 28 + include/utilities/external_process.hpp | 17 + include/valik/search/cart_query_io.hpp | 82 +++ include/valik/search/env_var_pack.hpp | 5 + include/valik/search/execution_metadata.hpp | 45 ++ include/valik/search/iterate_queries.hpp | 305 ++++++++++ include/valik/search/local_prefilter.hpp | 7 +- .../search/prefilter_queries_parallel.hpp | 18 +- include/valik/search/query_record.hpp | 12 + include/valik/search/search.hpp | 5 + include/valik/search/search_distributed.hpp | 131 +++++ include/valik/search/search_local.hpp | 368 ++++++++++++ include/valik/shared.hpp | 6 +- lib/stellar3 | 2 +- src/CMakeLists.txt | 4 + src/argument_parsing/search.cpp | 14 +- src/argument_parsing/split.cpp | 2 +- src/consolidate/merge_processes.cpp | 35 ++ src/valik_search.cpp | 529 +----------------- test/cli/valik_options_test.cpp | 2 +- test/data/dream/16bins13window.ibf | Bin 32891 -> 0 bytes test/data/dream/16bins15window.ibf | Bin 32891 -> 0 bytes test/data/dream/4bins13window.ibf | Bin 32891 -> 0 bytes test/data/dream/4bins15window.ibf | Bin 32891 -> 0 bytes test/data/dream/cli_test_output.sh | 2 +- test/data/split/api_test_input.sh | 2 +- test/data/split/cli_test_output.sh | 4 +- 27 files changed, 1095 insertions(+), 530 deletions(-) create mode 100644 include/utilities/consolidate/merge_processes.hpp create mode 100644 include/valik/search/cart_query_io.hpp create mode 100644 include/valik/search/execution_metadata.hpp create mode 100644 include/valik/search/iterate_queries.hpp create mode 100644 include/valik/search/search_distributed.hpp create mode 100644 include/valik/search/search_local.hpp create mode 100644 src/consolidate/merge_processes.cpp diff --git a/include/utilities/consolidate/merge_processes.hpp b/include/utilities/consolidate/merge_processes.hpp new file mode 100644 index 00000000..4f5d79ad --- /dev/null +++ b/include/utilities/consolidate/merge_processes.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "iostream" + +#include +#include +#include +#include + +namespace valik +{ + +/** + * @brief Function that merges search results and metadata across threads. + * + * @param arguments Command line arguments. + * @param time_statistics Run time statistics. IN-OUT parameter. + * @param exec_meta Metadata table across all search threads. + * @param var_pack Environmental variables, this function calls the merge executable. + * @return false if merge failed. + */ + +bool merge_processes(search_arguments const & arguments, + app::search_time_statistics & time_statistics, + app::execution_metadata & exec_meta, + env_var_pack const & var_pack); + +} // namespace valik diff --git a/include/utilities/external_process.hpp b/include/utilities/external_process.hpp index 0319b2a2..a14a1cd7 100644 --- a/include/utilities/external_process.hpp +++ b/include/utilities/external_process.hpp @@ -138,3 +138,20 @@ class external_process final { close(stderrpipe[WRITE_END]); } }; + +template +bool check_external_process_success(std::vector const & proc_args, external_process const & proc) +{ + if (proc.status() != 0) { + std::cerr << "External process failed\n"; + std::cerr << "call:"; + for (auto args : proc_args) { + std::cerr << " " << args; + } + std::cerr << '\n'; + std::cerr << proc.cerr() << '\n'; + return false; + } + else + return true; +} diff --git a/include/valik/search/cart_query_io.hpp b/include/valik/search/cart_query_io.hpp new file mode 100644 index 00000000..42494414 --- /dev/null +++ b/include/valik/search/cart_query_io.hpp @@ -0,0 +1,82 @@ +#pragma once + +#include + +#include + +#include +#include + +#include +#include +#include + +namespace valik +{ + +/** + * \brief Function that creates a seqan2::Segment from a query_record (split or short query) + * + * \param records vector containing valik split query segments + * \param seqs set of query segments (in-out) + * \param ids set of query segment ids (in-out) + * \param strOut stream for standard output + * \param strErr stream for error output + * \param hostQueries underlying sequences for query segments + * \param hostQueryIDs set of underlying sequence ids + */ +template +inline bool get_cart_queries(rec_vec_t const & records, + seqan2::StringSet const, seqan2::InfixSegment>, seqan2::Dependent<>> & seqs, + seqan2::StringSet & ids, + TStream & strOut, + TStream & strErr) +{ + + std::set uniqueIds; // set of short IDs (cut at first whitespace) + bool idsUnique = true; + + std::cout << "\nget_cart_queries\n"; + + size_t seqCount{0}; + for (auto & record : records) + { + seqan2::String query_id = record.sequence_id; + seqan2::appendValue(seqs, record.querySegment, seqan2::Generous()); + seqan2::appendValue(ids, query_id, seqan2::Generous()); + seqCount++; + idsUnique &= stellar::_checkUniqueId(uniqueIds, (seqan2::String) record.sequence_id); + //!ERROR: infix sequence is copied to seqs invalidating the pointers + std::cout << std::addressof(seqs[seqan2::length(seqs) - 1]) << '\t' << seqs[seqan2::length(seqs) - 1] << '\n'; + } + + strOut << "Loaded " << seqCount << " query sequence" << ((seqCount > 1) ? "s " : " ") << "from cart." << std::endl; + if (!idsUnique) + strErr << "WARNING: Non-unique query ids. Output can be ambiguous.\n"; + return true; + +} + +/** + * \brief Function that writes out cart queries. + * + * \param records vector containing query segments + * \param cart_queries_path output path + */ +template +void write_cart_queries(rec_vec_t & records, std::filesystem::path const & cart_queries_path) +{ + using fields = seqan3::fields; + using types = seqan3::type_list>; + using sequence_record_type = seqan3::sequence_record; + + seqan3::sequence_file_output fout{cart_queries_path, fields{}}; + + for (auto & record : records) + { + sequence_record_type sequence_record{std::move(record.sequence_id), std::move(record.sequence)}; + fout.push_back(sequence_record); + } +} + +} // namespace valik diff --git a/include/valik/search/env_var_pack.hpp b/include/valik/search/env_var_pack.hpp index 1b3f5752..9592ea5d 100644 --- a/include/valik/search/env_var_pack.hpp +++ b/include/valik/search/env_var_pack.hpp @@ -6,6 +6,11 @@ #include #include +#include +#include +#include +#include + namespace valik { diff --git a/include/valik/search/execution_metadata.hpp b/include/valik/search/execution_metadata.hpp new file mode 100644 index 00000000..153a7909 --- /dev/null +++ b/include/valik/search/execution_metadata.hpp @@ -0,0 +1,45 @@ +#pragma once + +#include "sstream" + +#include + +#include +#include + +namespace valik::app +{ + +struct execution_metadata +{ + struct thread_metadata + { + std::vector output_files; + std::stringstream text_out; + std::vector time_statistics; + }; + + execution_metadata(size_t const & threads) + { + table = std::vector(threads); + } + + void merge(search_arguments const & arguments, + search_time_statistics & time_statistics) // IN-OUT parameter + { + // Merge all local data together + std::ofstream text_out(arguments.out_file.string() + ".out"); + for (auto const& td : table) + { + output_files.insert(output_files.end(), td.output_files.begin(), td.output_files.end()); + time_statistics.cart_processing_times.insert(time_statistics.cart_processing_times.end(), td.time_statistics.begin(), td.time_statistics.end()); + text_out << td.text_out.str(); + } + } + + std::vector table; + std::vector output_files; + std::unordered_map bin_count; // +}; + +} // namespace valik::app diff --git a/include/valik/search/iterate_queries.hpp b/include/valik/search/iterate_queries.hpp new file mode 100644 index 00000000..155d863d --- /dev/null +++ b/include/valik/search/iterate_queries.hpp @@ -0,0 +1,305 @@ +#pragma once + +#include +#include +#include + +namespace valik::app +{ + +/** + * @brief Function that sends chunks of queries to the prefilter which then writes shopping carts onto disk. + * + * @param arguments Command line arguments. + * @param time_statistics Run-time statistics. + * @param ibf Interleaved Bloom Filter. + * @param queue Shopping cart queue for load balancing between prefiltering and Stellar search. + */ +template +void iterate_distributed_queries(search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue_t & queue) +{ + using fields = seqan3::fields; + std::vector query_records{}; + raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; + seqan3::sequence_file_input fin{arguments.query_file}; + for (auto &&chunked_records : fin | seqan3::views::chunk((1ULL << 20) * 10)) + { + query_records.clear(); + auto start = std::chrono::high_resolution_clock::now(); + for (auto && fasta_record: chunked_records) + query_records.emplace_back(std::move(fasta_record.id()), std::move(fasta_record.sequence())); + + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); + + start = std::chrono::high_resolution_clock::now(); + prefilter_queries_parallel(ibf, arguments, query_records, thresholder, queue); + end = std::chrono::high_resolution_clock::now(); + time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); + } +} + +//!WORKAROUND: partial specialization of template functions is not allowed +// primary template +template +struct iteration_wrapper +{ + static void go(seqan2::StringSet> const & hostQueries, + seqan2::StringSet const & hostQueryIDs, + search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue_t & queue, + database_segments & segments) + { + return; + } +}; + + +/** + * @brief Class that sends query segments to the prefilter which then inserts them to the shopping cart queue. + * Partial template specialization (split queries). + * @param arguments Command line arguments. + * @param time_statistics Run-time statistics. + * @param ibf Interleaved Bloom Filter. + * @param queue Shopping cart queue for load balancing between prefiltering and Stellar search. + */ +/* +template +struct iteration_wrapper>>, ibf_t> +{ + static void go(seqan2::StringSet const, seqan2::InfixSegment>> & hostQueries, + seqan2::StringSet const & hostQueryIDs, + search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue>> & queue, + database_segments & segments) + { + using namespace seqan3::literals; + using TSequence = seqan2::String; + std::vector> query_records{}; + raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; + + size_t id{0}; + for (auto const & query : hostQueries) + { + query_records.clear(); + auto start = std::chrono::high_resolution_clock::now(); + auto segments_from_id = [&](database_segments::segment & seg) {return id == seg.seq_ind;}; + for (auto const & seg : segments.members | std::views::filter(segments_from_id)) + { + // convert the seqan2::String to a std::vector + //!TODO: can this copy be avoided? + seqan2::Segment inf = seqan2::infixWithLength(query, seg.start, seg.len); + + std::stringstream seq_str_stream; + for (int i = 0; i < seg.len; i++) + seq_str_stream << inf[i]; + + //!TODO: populate seg_vec + std::vector seg_vec{}; + + // convert the seqan2::String ID to a std::string ID + std::stringstream id_str_stream; + for (auto it = begin(hostQueryIDs[id]); it != end(hostQueryIDs[id]); ++it) + id_str_stream << *it; + + query_records.emplace_back(id_str_stream.str(), seg.start, std::move(seg_vec), &query); + } + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); + + start = std::chrono::high_resolution_clock::now(); + prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); + end = std::chrono::high_resolution_clock::now(); + time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); + + id++; + } + } +}; +*/ + + +/** + * @brief Class that sends chunks of queries to the prefilter which then inserts them to the shopping cart queue. + * Partial template specialization (short queries). + * + * @param arguments Command line arguments. + * @param time_statistics Run-time statistics. + * @param ibf Interleaved Bloom Filter. + * @param queue Shopping cart queue for load balancing between prefiltering and Stellar search. + */ +template +void read_short_queries(seqan2::StringSet> const & underlyingQueries, + seqan2::StringSet const & underlyingQueryIDs, + std::vector const, seqan2::InfixSegment>> & hostQueries, + search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue>> & queue) +{ + using TSequence = seqan2::String; + using fields = seqan3::fields; + std::vector> query_records{}; + raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; + + /* + seqan2::SeqFileIn inSeqs; + if (!open(inSeqs, arguments.query_file.c_str())) + { + std::cerr << "Failed to open query file." << std::endl; + } + + std::set uniqueIds; // set of short IDs (cut at first whitespace) + bool idsUnique = true; + + seqan2::String queryStr; + std::string fastaID; + uint32_t seqCount{0}; + std::cout << "\niterate_queries\n"; + for (; !seqan2::atEnd(inSeqs); ++seqCount) + { + auto start = std::chrono::high_resolution_clock::now(); + seqan2::readRecord(fastaID, queryStr, inSeqs); + + idsUnique &= stellar::_checkUniqueId(uniqueIds, fastaID); + seqan2::appendValue(underlyingQueries, queryStr, seqan2::Generous()); + std::cout << std::addressof(underlyingQueries[seqan2::length(underlyingQueries) - 1]) << + '\t' << underlyingQueries[seqan2::length(underlyingQueries) - 1] << '\n'; + + seqan2::appendValue(underlyingQueryIDs, fastaID, seqan2::Generous()); + seqCount++; + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); + } + */ + + + for (size_t i{0}; i < seqan2::length(underlyingQueries); i++) + { + std::vector seg_vec{}; + for (auto & c : underlyingQueries[i]) + { + seqan3::dna4 nuc; + nuc.assign_char(c); + seg_vec.push_back(nuc); + } + + query_records.emplace_back(seqan2::toCString(underlyingQueryIDs[i]), std::move(seg_vec), + seqan2::infix(underlyingQueries[i], 0, seqan2::length(underlyingQueries[i]))); + } + + auto start = std::chrono::high_resolution_clock::now(); + prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); +} + +template +void read_split_queries(seqan2::StringSet> const & underlyingQueries, + seqan2::StringSet const & underlyingQueryIDs, + std::vector const, seqan2::InfixSegment>> & hostQueries, + search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue>> & queue, + database_segments & segments) +{ + using TSequence = seqan2::String; + using fields = seqan3::fields; + std::vector> query_records{}; + raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; + + /* + seqan2::SeqFileIn inSeqs; + if (!open(inSeqs, arguments.query_file.c_str())) + { + std::cerr << "Failed to open query file." << std::endl; + } + + std::set uniqueIds; // set of short IDs (cut at first whitespace) + bool idsUnique = true; + + seqan2::String queryStr; + std::string fastaID; + uint32_t seqCount{0}; + std::cout << "\niterate_queries\n"; + for (; !seqan2::atEnd(inSeqs); ++seqCount) + { + auto start = std::chrono::high_resolution_clock::now(); + seqan2::readRecord(fastaID, queryStr, inSeqs); + + idsUnique &= stellar::_checkUniqueId(uniqueIds, fastaID); + seqan2::appendValue(underlyingQueries, queryStr, seqan2::Generous()); + std::cout << std::addressof(underlyingQueries[seqan2::length(underlyingQueries) - 1]) << + '\t' << underlyingQueries[seqan2::length(underlyingQueries) - 1] << '\n'; + + seqan2::appendValue(underlyingQueryIDs, fastaID, seqan2::Generous()); + seqCount++; + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); + } + */ + + //std::cout << "iterate_queries" << '\n'; + size_t id{0}; + for (auto const & query : underlyingQueries) + { + auto segments_from_id = [&](database_segments::segment & seg) {return id == seg.seq_ind;}; + for (auto const & seg : segments.members | std::views::filter(segments_from_id)) + { + seqan2::Segment inf = seqan2::infixWithLength(query, seg.start, seg.len); + + std::vector seg_vec{}; + for (auto & c : inf) + { + seqan3::dna4 nuc; + nuc.assign_char(c); + seg_vec.push_back(nuc); + } + + query_records.emplace_back(seqan2::toCString(underlyingQueryIDs[id]), std::move(seg_vec), inf); + //std::cout << inf << '\n'; + } + id++; + } + + auto start = std::chrono::high_resolution_clock::now(); + prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); +} + + +/* + +template +struct iteration_wrapper>>, ibf_t> +{ + +}; + +// function calling member function in wrapper class +template +void iterate_queries(querySet_t & hostQueries, + seqan2::StringSet & hostQueryIDs, + search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue_t & queue, + database_segments & segments) +{ + //auto wrapper = iteration_wrapper(); + iteration_wrapper::go(hostQueries, hostQueryIDs, arguments, time_statistics, ibf, queue, segments); +} + +*/ + + +} // namespace valik::app diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index 28622cc8..55e45325 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -128,9 +128,9 @@ void find_pattern_bins(pattern_bounds const & pattern, // - overlap shows how much consequtive patterns overlap // //----------------------------- -template +template void local_prefilter( - std::span const & records, + std::span const & records, seqan3::interleaved_bloom_filter const & ibf, search_arguments const & arguments, raptor::threshold::threshold const & thresholder, @@ -149,9 +149,10 @@ void local_prefilter( seqan3::window_size{arguments.window_size}, seqan3::seed{adjust_seed(arguments.shape_weight)}); - for (query_record const & record : records) + for (query_t const & record : records) { std::vector const & seq = record.sequence; +// seqan3::debug_stream << "Prefiltering sequence: \n"; // sequence can't contain local match if it's shorter than pattern length if (seq.size() < arguments.pattern_size) diff --git a/include/valik/search/prefilter_queries_parallel.hpp b/include/valik/search/prefilter_queries_parallel.hpp index 6dbb540a..e3c2383e 100644 --- a/include/valik/search/prefilter_queries_parallel.hpp +++ b/include/valik/search/prefilter_queries_parallel.hpp @@ -18,12 +18,12 @@ namespace valik::app { -template +template inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter const & ibf, search_arguments const & arguments, - std::vector const & records, + std::vector & records, raptor::threshold::threshold const & thresholder, - cart_queue & queue) + cart_queue & queue) { std::vector tasks; size_t const num_records = records.size(); @@ -34,12 +34,20 @@ inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter records_slice{&records[start], &records[end]}; + std::span records_slice{&records[start], &records[end]}; - auto result_cb = [&queue](query_record const& record, std::unordered_set const& bin_hits) + auto result_cb = [&queue](query_t const& record, std::unordered_set const& bin_hits) { for (size_t const bin : bin_hits) { + /* + + + seqan3::debug_stream << "Cart insertion of sequence: " << '\n'; + for (auto & n : record.sequence) + seqan3::debug_stream << n; + seqan3::debug_stream << '\n'; + */ queue.insert(bin, record); } }; diff --git a/include/valik/search/query_record.hpp b/include/valik/search/query_record.hpp index c2938c53..33976fee 100644 --- a/include/valik/search/query_record.hpp +++ b/include/valik/search/query_record.hpp @@ -1,8 +1,11 @@ #pragma once #include +#include +#include #include +#include namespace valik { @@ -13,4 +16,13 @@ struct query_record std::vector sequence; }; +template +struct short_query_record +{ + std::string sequence_id; + std::vector sequence; + seqan2::Segment querySegment; + std::shared_ptr underlyingData; +}; + } // namespace valik diff --git a/include/valik/search/search.hpp b/include/valik/search/search.hpp index 9b929325..0162847d 100644 --- a/include/valik/search/search.hpp +++ b/include/valik/search/search.hpp @@ -1,5 +1,10 @@ #pragma once +#include +#include +#include +#include +#include #include namespace valik::app diff --git a/include/valik/search/search_distributed.hpp b/include/valik/search/search_distributed.hpp new file mode 100644 index 00000000..929c8c35 --- /dev/null +++ b/include/valik/search/search_distributed.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace valik::app +{ + +/** + * @brief Function that calls Valik prefiltering and launches parallel processes of Stellar search. + * + * @tparam index_t Type of IBF. + * @param arguments Command line arguments. + * @param time_statistics Run-time statistics. + * @param index Interleaved Bloom Filter. + * @return false if search failed. + */ +template +bool search_distributed(search_arguments const & arguments, search_time_statistics & time_statistics, index_t const & index) +{ + std::optional ref_meta; + if (!arguments.ref_meta_path.empty()) + ref_meta = database_metadata(arguments.ref_meta_path, false); + + std::optional ref_segments; + if (!arguments.ref_seg_path.empty()) + ref_segments = database_segments(arguments.ref_seg_path); + + env_var_pack var_pack{}; + auto queue = cart_queue{index.ibf().bin_count(), arguments.cart_max_capacity, arguments.max_queued_carts}; + + std::mutex mutex; + execution_metadata exec_meta(arguments.threads); + + // negative (reverse complemented) database strand + //!TODO: add switch for reverse strand + bool const reverse = true /*threadOptions.reverse && threadOptions.alphabet != "protein" && threadOptions.alphabet != "char" */; + + bool error_in_search = false; // indicates if an error happen inside this lambda + auto consumerThreads = std::vector{}; + for (size_t threadNbr = 0; threadNbr < arguments.threads; ++threadNbr) + { + consumerThreads.emplace_back( + [&, threadNbr]() { + auto& thread_meta = exec_meta.table[threadNbr]; + // this will block until producer threads have added carts to queue + for (auto next = queue.dequeue(); next; next = queue.dequeue()) + { + auto & [bin_id, records] = *next; + + std::unique_lock g(mutex); + std::filesystem::path cart_queries_path = var_pack.tmp_path / std::string("query_" + std::to_string(bin_id) + + "_" + std::to_string(exec_meta.bin_count[bin_id]++) + ".fasta"); + g.unlock(); + + thread_meta.output_files.push_back(cart_queries_path.string() + ".gff"); + + write_cart_queries(records, cart_queries_path); + + std::vector process_args{}; + process_args.insert(process_args.end(), {var_pack.stellar_exec, "--version-check", "0", "-a", "dna"}); + + if (ref_segments && ref_meta) + { + // search segments of a single reference file + auto ref_len = ref_meta->total_len; + auto seg = ref_segments->segment_from_bin(bin_id); + process_args.insert(process_args.end(), {index.bin_path()[0][0], std::string(cart_queries_path), + "--referenceLength", std::to_string(ref_len), + "--sequenceOfInterest", std::to_string(seg.seq_ind), + "--segmentBegin", std::to_string(seg.start), + "--segmentEnd", std::to_string(seg.start + seg.len)}); + } + else + { + // search a reference database of bin sequence files + if (index.bin_path().size() < (size_t) bin_id) { + throw std::runtime_error("Could not find reference file with index " + std::to_string(bin_id) + + ". Did you forget to provide metadata to search segments in a single reference file instead?"); + } + process_args.insert(process_args.end(), {index.bin_path()[bin_id][0], std::string(cart_queries_path)}); + } + + if (arguments.write_time) + process_args.insert(process_args.end(), "--time"); + + process_args.insert(process_args.end(), {"-e", std::to_string(arguments.stellar_er_rate), + "-l", std::to_string(arguments.pattern_size), + "-o", std::string(cart_queries_path) + ".gff"}); + + auto start = std::chrono::high_resolution_clock::now(); + external_process process(process_args); + auto end = std::chrono::high_resolution_clock::now(); + + thread_meta.time_statistics.emplace_back(0.0 + std::chrono::duration_cast>(end - start).count()); + + thread_meta.text_out << process.cout(); + thread_meta.text_out << process.cerr(); + + if (process.status() != 0) { + std::unique_lock g(mutex); // make sure that our output is synchronized + std::cerr << "error running VALIK_STELLAR\n"; + std::cerr << "call:"; + for (auto args : process_args) { + std::cerr << " " << args; + } + std::cerr << '\n'; + std::cerr << process.cerr() << '\n'; + error_in_search = true; + } + } + }); + } + + iterate_distributed_queries(arguments, time_statistics, index.ibf(), queue); + + queue.finish(); // Flush carts that are not empty yet + consumerThreads.clear(); + + // merge output files and metadata from threads + bool error_in_merge = merge_processes(arguments, time_statistics, exec_meta, var_pack); + + return error_in_search && error_in_merge; +} + +} // namespace valik::app diff --git a/include/valik/search/search_local.hpp b/include/valik/search/search_local.hpp new file mode 100644 index 00000000..e538fabf --- /dev/null +++ b/include/valik/search/search_local.hpp @@ -0,0 +1,368 @@ +#pragma once + +#include "future" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace valik::app +{ + +/** + * @brief Function that calls Valik prefiltering and launches parallel threads of Stellar search. + * + * @tparam index_t Type of IBF. + * @param arguments Command line arguments. + * @param time_statistics Run-time statistics. + * @param index Interleaved Bloom Filter. + * @return false if search failed. + */ +template +bool search_local(search_arguments const & arguments, search_time_statistics & time_statistics, index_t const & index) +{ + std::optional ref_meta; + if (!arguments.ref_meta_path.empty()) + ref_meta = database_metadata(arguments.ref_meta_path, false); + + std::optional ref_segments; + if (!arguments.ref_seg_path.empty()) + ref_segments = database_segments(arguments.ref_seg_path); + + env_var_pack var_pack{}; + + std::optional query_segments; + if (!arguments.query_seg_path.empty()) + query_segments = database_segments(arguments.query_seg_path); + + using TAlphabet = seqan2::Dna; + using TSequence = seqan2::String; + auto queue = cart_queue>{index.ibf().bin_count(), arguments.cart_max_capacity, arguments.max_queued_carts}; + + std::mutex mutex; + execution_metadata exec_meta(arguments.threads); + + // negative (reverse complemented) database strand + bool const reverse = true /*threadOptions.reverse && threadOptions.alphabet != "protein" && threadOptions.alphabet != "char" */; + seqan2::StringSet databases; + using TSize = decltype(length(databases[0])); + seqan2::StringSet reverseDatabases; + seqan2::StringSet databaseIDs; + std::ofstream disabledQueriesFile; + TSize refLen; + + if (arguments.disableThresh != std::numeric_limits::max()) + { + std::filesystem::path disabledPath = arguments.out_file; + disabledPath.replace_extension(".disabled.fa"); + disabledQueriesFile.open(((std::string) disabledPath).c_str(), ::std::ios_base::out | ::std::ios_base::app); + if (!disabledQueriesFile.is_open()) + { + std::cerr << "Could not open file for disabled queries." << std::endl; + return false; + } + } + + stellar::stellar_runtime input_databases_time{}; + for (auto bin_paths : index.bin_path()) + { + for (auto path : bin_paths) + { + bool const databasesSuccess = input_databases_time.measure_time([&]() + { + std::cout << "Launching stellar search on a shared memory machine...\n"; + return stellar::_importAllSequences(path.c_str(), "database", databases, databaseIDs, refLen, std::cout, std::cerr); + }); + if (!databasesSuccess) + return false; + } + } + + if (reverse) + { + for (auto database : databases) + { + reverseComplement(database); + seqan2::appendValue(reverseDatabases, database, seqan2::Generous()); + } + } + + time_statistics.ref_io_time += input_databases_time.milliseconds() / 1000; + stellar::DatabaseIDMap databaseIDMap{databases, databaseIDs}; + stellar::DatabaseIDMap reverseDatabaseIDMap{reverseDatabases, databaseIDs}; + + seqan2::StringSet underlyingQueries; + seqan2::StringSet underlyingQueryIDs; + + TSize queryLen; + // import underlying query sequences (SPLIT) + stellar::stellar_runtime input_queries_time{}; + bool const queriesSuccess = input_queries_time.measure_time([&]() + { + std::cout << "Finding all local alignments between two genomes...\n"; + return stellar::_importAllSequences(arguments.query_file.c_str(), "query", underlyingQueries, underlyingQueryIDs, queryLen, std::cout, std::cerr); + }); + if (!queriesSuccess) + return false; + time_statistics.reads_io_time += input_queries_time.milliseconds() / 1000; + + bool error_in_search = false; // indicates if an error happened inside this lambda + auto consumerThreads = std::vector{}; + for (size_t threadNbr = 0; threadNbr < arguments.threads; ++threadNbr) + { + consumerThreads.emplace_back( + [&, threadNbr]() { + auto& thread_meta = exec_meta.table[threadNbr]; + // this will block until producer threads have added carts to queue + for (auto next = queue.dequeue(); next; next = queue.dequeue()) + { + auto & [bin_id, records] = *next; + + std::unique_lock g(mutex); + std::filesystem::path cart_queries_path = var_pack.tmp_path / std::string("query_" + std::to_string(bin_id) + + "_" + std::to_string(exec_meta.bin_count[bin_id]++) + ".fasta"); + g.unlock(); + + thread_meta.output_files.push_back(cart_queries_path.string() + ".gff"); + + stellar::StellarOptions threadOptions{}; + stellar::stellar_app_runtime stellarThreadTime{}; + threadOptions.alphabet = "dna"; // Possible values: dna, rna, protein, char + threadOptions.queryFile = cart_queries_path.string(); + threadOptions.prefilteredSearch = true; + threadOptions.referenceLength = refLen; + if (ref_segments && ref_meta) + { + threadOptions.searchSegment = true; + auto seg = ref_segments->segment_from_bin(bin_id); + threadOptions.binSequences.emplace_back(seg.seq_ind); + threadOptions.segmentBegin = seg.start; + threadOptions.segmentEnd = seg.start + seg.len; + } + else + { + if (index.bin_path().size() < (size_t) bin_id) { + throw std::runtime_error("Could not find reference file with index " + std::to_string(bin_id) + + ". Did you forget to provide metadata to search segments in a single reference file instead?"); + } + threadOptions.binSequences.push_back(bin_id); + } + threadOptions.numEpsilon = arguments.stellar_er_rate; + threadOptions.epsilon = stellar::utils::fraction::from_double(threadOptions.numEpsilon).limit_denominator(); + threadOptions.minLength = arguments.pattern_size; + threadOptions.disableThresh = arguments.disableThresh; + threadOptions.outputFile = cart_queries_path.string() + ".gff"; + + using TDatabaseSegment = stellar::StellarDatabaseSegment; + using TQuerySegment = seqan2::Segment const, seqan2::InfixSegment>; + + stellar::_writeFileNames(threadOptions, thread_meta.text_out); + stellar::_writeSpecifiedParams(threadOptions, thread_meta.text_out); + stellar::_writeCalculatedParams(threadOptions, thread_meta.text_out); // calculate qGram + thread_meta.text_out << std::endl; + + // import query sequences + seqan2::StringSet> queries; + seqan2::StringSet queryIDs; + get_cart_queries(records, queries, queryIDs, thread_meta.text_out, thread_meta.text_out); + + stellar::_writeMoreCalculatedParams(threadOptions, threadOptions.referenceLength, queries, thread_meta.text_out); + + auto current_time = stellarThreadTime.swift_index_construction_time.now(); + stellar::StellarIndex stellarIndex{queries, threadOptions}; + stellar::StellarSwiftPattern swiftPattern = stellarIndex.createSwiftPattern(); + + // Construct index of the queries + thread_meta.text_out << "Constructing index..." << '\n'; + stellarIndex.construct(); + thread_meta.text_out << std::endl; + stellarThreadTime.swift_index_construction_time.manual_timing(current_time); + + std::vector disabledQueryIDs{}; + + stellar::StellarOutputStatistics outputStatistics{}; + if (threadOptions.forward) + { + auto databaseSegment = stellar::_getDREAMDatabaseSegment + (databases[threadOptions.binSequences[0]], threadOptions); + stellarThreadTime.forward_strand_stellar_time.measure_time([&]() + { + size_t const databaseRecordID = databaseIDMap.recordID(databaseSegment); + seqan2::CharString const & databaseID = databaseIDMap.databaseID(databaseRecordID); + // container for eps-matches + seqan2::StringSet const, + seqan2::CharString> > > forwardMatches; + seqan2::resize(forwardMatches, length(queries)); + + constexpr bool databaseStrand = true; + auto queryIDMap = stellar::QueryIDMap(queries); + + stellar::StellarComputeStatistics statistics = stellar::StellarLauncher::search_and_verify + ( + databaseSegment, + databaseID, + queryIDMap, + databaseStrand, + threadOptions, + swiftPattern, + stellarThreadTime.forward_strand_stellar_time.prefiltered_stellar_time, + forwardMatches + ); + + thread_meta.text_out << std::endl; // swift filter output is on same line + stellar::_printDatabaseIdAndStellarKernelStatistics(threadOptions.verbose, databaseStrand, databaseID, statistics, thread_meta.text_out); + + seqan3::debug_stream << "before consolidation\n"; + + for (auto & query_matches : forwardMatches) + seqan3::debug_stream << seqan2::length(query_matches.matches) << '\t'; + seqan3::debug_stream << '\n'; + + stellarThreadTime.forward_strand_stellar_time.post_process_eps_matches_time.measure_time([&]() + { + // forwardMatches is an in-out parameter + // this is the match consolidation + stellar::_postproccessQueryMatches(databaseStrand, threadOptions.referenceLength, threadOptions, + forwardMatches, disabledQueryIDs); + }); // measure_time + + seqan3::debug_stream << "after consolidation\n"; + + for (auto & query_matches : forwardMatches) + seqan3::debug_stream << seqan2::length(query_matches.matches) << '\t'; + seqan3::debug_stream << '\n'; + + // open output files + std::ofstream outputFile(threadOptions.outputFile.c_str(), ::std::ios_base::out); + if (!outputFile.is_open()) + { + std::cerr << "Could not open output file." << std::endl; + error_in_search = true; + } + stellarThreadTime.forward_strand_stellar_time.output_eps_matches_time.measure_time([&]() + { + // output forwardMatches on positive database strand + stellar::_writeAllQueryMatchesToFile(forwardMatches, queryIDs, databaseStrand, "gff", outputFile); + }); // measure_time + + outputStatistics = stellar::_computeOutputStatistics(forwardMatches); + }); // measure_time + } + + + if (reverse) + { + TDatabaseSegment databaseSegment{}; + stellarThreadTime.reverse_complement_database_time.measure_time([&]() + { + databaseSegment = _getDREAMDatabaseSegment + (reverseDatabases[threadOptions.binSequences[0]], threadOptions, reverse); + }); // measure_time + + stellarThreadTime.reverse_strand_stellar_time.measure_time([&]() + { + size_t const databaseRecordID = reverseDatabaseIDMap.recordID(databaseSegment); + seqan2::CharString const & databaseID = reverseDatabaseIDMap.databaseID(databaseRecordID); + // container for eps-matches + seqan2::StringSet const, + seqan2::CharString> > > reverseMatches; + seqan2::resize(reverseMatches, length(queries)); + + constexpr bool databaseStrand = false; + + stellar::QueryIDMap queryIDMap{queries}; + + stellar::StellarComputeStatistics statistics = stellar::StellarLauncher::search_and_verify + ( + databaseSegment, + databaseID, + queryIDMap, + databaseStrand, + threadOptions, + swiftPattern, + stellarThreadTime.reverse_strand_stellar_time.prefiltered_stellar_time, + reverseMatches + ); + + thread_meta.text_out << std::endl; // swift filter output is on same line + stellar::_printDatabaseIdAndStellarKernelStatistics(threadOptions.verbose, databaseStrand, databaseID, + statistics, thread_meta.text_out); + + stellarThreadTime.reverse_strand_stellar_time.post_process_eps_matches_time.measure_time([&]() + { + // reverseMatches is an in-out parameter + // this is the match consolidation + stellar::_postproccessQueryMatches(databaseStrand, threadOptions.referenceLength, threadOptions, + reverseMatches, disabledQueryIDs); + }); // measure_time + + // open output files + std::ofstream outputFile(threadOptions.outputFile.c_str(), ::std::ios_base::app); + if (!outputFile.is_open()) + { + std::cerr << "Could not open output file." << std::endl; + error_in_search = true; + } + stellarThreadTime.reverse_strand_stellar_time.output_eps_matches_time.measure_time([&]() + { + // output reverseMatches on negative database strand + stellar::_writeAllQueryMatchesToFile(reverseMatches, queryIDs, databaseStrand, "gff", outputFile); + }); // measure_time + + outputStatistics.mergeIn(stellar::_computeOutputStatistics(reverseMatches)); + }); // measure_time + } + + // Writes disabled query sequences to disabledFile. + if (disabledQueriesFile.is_open()) + { + stellarThreadTime.output_disabled_queries_time.measure_time([&]() + { + // write disabled query file + stellar::_writeDisabledQueriesToFastaFile(disabledQueryIDs, queryIDs, queries, disabledQueriesFile); + }); // measure_time + } + + stellar::_writeOutputStatistics(outputStatistics, threadOptions.verbose, disabledQueriesFile.is_open(), thread_meta.text_out); + + thread_meta.time_statistics.emplace_back(stellarThreadTime.milliseconds()); + if (arguments.write_time) + { + std::filesystem::path time_path = cart_queries_path.string() + std::string(".gff.time"); + + stellar::_print_stellar_app_time(stellarThreadTime, thread_meta.text_out); + } + } + }); + } + + std::vector const, seqan2::InfixSegment>> hostQueries; + std::vector hostQueryIDs; + + if constexpr (is_split) + read_split_queries(underlyingQueries, underlyingQueryIDs, hostQueries, arguments, time_statistics, index.ibf(), queue, *query_segments); + else + read_short_queries(underlyingQueries, underlyingQueryIDs, hostQueries, arguments, time_statistics, index.ibf(), queue); + + queue.finish(); // Flush carts that are not empty yet + consumerThreads.clear(); + + // merge output files and metadata from threads + bool error_in_merge = merge_processes(arguments, time_statistics, exec_meta, var_pack); + + return error_in_search && error_in_merge; +} + +} // namespace valik::app diff --git a/include/valik/shared.hpp b/include/valik/shared.hpp index 22b55071..6cba21f4 100644 --- a/include/valik/shared.hpp +++ b/include/valik/shared.hpp @@ -4,8 +4,10 @@ #include #include +#include #include +#include #include #include @@ -137,8 +139,10 @@ struct search_arguments final : public minimiser_threshold_arguments, public ste }; } - std::filesystem::path seg_path{}; + double stellar_er_rate{}; std::filesystem::path ref_meta_path{}; + std::filesystem::path ref_seg_path{}; + std::filesystem::path query_seg_path{}; bool shared_memory{false}; }; diff --git a/lib/stellar3 b/lib/stellar3 index dada3f89..49b06866 160000 --- a/lib/stellar3 +++ b/lib/stellar3 @@ -1 +1 @@ -Subproject commit dada3f894d0ab84242c7ad57b62c88886b34c3b2 +Subproject commit 49b06866aafdda79fc7e2087f7415c0bc5707979 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f079bca3..29121abb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -46,6 +46,9 @@ target_link_libraries ("${PROJECT_NAME}_consolidation_lib" PUBLIC "${PROJECT_NAM add_library ("${PROJECT_NAME}_consolidation_io_lib" STATIC consolidate/io.cpp) target_link_libraries ("${PROJECT_NAME}_consolidation_io_lib" PUBLIC "${PROJECT_NAME}_interface") +add_library ("${PROJECT_NAME}_merge_processes_lib" STATIC consolidate/merge_processes.cpp) +target_link_libraries ("${PROJECT_NAME}_merge_processes_lib" PUBLIC "${PROJECT_NAME}_interface") + # Sliding window argument parsing add_library ("${PROJECT_NAME}_argument_parsing_shared_lib" STATIC argument_parsing/shared.cpp) target_link_libraries ("${PROJECT_NAME}_argument_parsing_shared_lib" PUBLIC "${PROJECT_NAME}_interface") @@ -79,6 +82,7 @@ target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_build_li target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_search_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_consolidation_lib") target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_consolidation_io_lib") +target_link_libraries ("${PROJECT_NAME}_lib" INTERFACE "${PROJECT_NAME}_merge_processes_lib") # Sliding window executable add_executable ("${PROJECT_NAME}" valik_main.cpp) diff --git a/src/argument_parsing/search.cpp b/src/argument_parsing/search.cpp index 5a393b70..11513a9c 100644 --- a/src/argument_parsing/search.cpp +++ b/src/argument_parsing/search.cpp @@ -87,10 +87,15 @@ void init_search_parser(sharg::parser & parser, search_arguments & arguments) .long_id = "ref-meta", .description = "Path to reference metadata file created by split.", .validator = sharg::input_file_validator{}}); - parser.add_option(arguments.seg_path, + parser.add_option(arguments.ref_seg_path, sharg::config{.short_id = '\0', .long_id = "seg-meta", - .description = "Path to segment metadata file created by split.", + .description = "Path to reference segment metadata file created by split.", + .validator = sharg::input_file_validator{}}); + parser.add_option(arguments.query_seg_path, + sharg::config{.short_id = '\0', + .long_id = "query-meta", + .description = "Path to query genome metadata for finding all local alignment between two long sequences.", .validator = sharg::input_file_validator{}}); parser.add_flag(arguments.shared_memory, sharg::config{.short_id = '\0', @@ -221,6 +226,11 @@ void run_search(sharg::parser & parser) else arguments.overlap = arguments.pattern_size - 1; + // ========================================== + //!WORKAROUND: Stellar does not allow smaller error rates + // ========================================== + arguments.stellar_er_rate = std::max((double) arguments.errors / (double) arguments.pattern_size, 0.00001); + // ========================================== // Dispatch // ========================================== diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index c38ec50e..0abe48e6 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -30,7 +30,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments) .description = "Choose how much consecutive segments overlap.", .validator = positive_integer_validator{true}}); parser.add_option(arguments.seg_count, - sharg::config{.short_id = '\0', + sharg::config{.short_id = 'n', .long_id = "seg-count", .description = "Dividing the database into this many segments. Corresponds to IBF bin count for the reference sequence so that multiples of 64 lead to better performance.", .validator = sharg::arithmetic_range_validator{1, 29952}}); diff --git a/src/consolidate/merge_processes.cpp b/src/consolidate/merge_processes.cpp new file mode 100644 index 00000000..0568aeaa --- /dev/null +++ b/src/consolidate/merge_processes.cpp @@ -0,0 +1,35 @@ +#include + +namespace valik +{ + +bool merge_processes(search_arguments const & arguments, + app::search_time_statistics & time_statistics, + app::execution_metadata & exec_meta, + env_var_pack const & var_pack) +{ + // merge metadata from all threads + exec_meta.merge(arguments, time_statistics); + std::vector merge_process_args; + if (exec_meta.output_files.size() > 0) + { + merge_process_args.push_back(var_pack.merge_exec); + for (auto & path : exec_meta.output_files) + merge_process_args.push_back(path); + } + else + { + //!WORKAROUND: merge hangs if no valik matches found + merge_process_args.push_back("echo"); + merge_process_args.push_back("-n"); + } + + external_process merge(merge_process_args); + + std::ofstream matches_out(arguments.out_file); + matches_out << merge.cout(); + + return check_external_process_success(merge_process_args, merge); +} + +} // namespace valik diff --git a/src/valik_search.cpp b/src/valik_search.cpp index dbcde837..a0ccfabf 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -1,87 +1,22 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include +#include namespace valik::app { -using fields = seqan3::fields; -using types = seqan3::type_list>; -using sequence_record_type = seqan3::sequence_record; - -template -inline bool get_cart_queries(rec_vec_t & records, - seqan2::StringSet & seqs, - seqan2::StringSet & ids, - TStream & strOut, - TStream & strErr) -{ - std::set uniqueIds; // set of short IDs (cut at first whitespace) - bool idsUnique = true; - - size_t seqCount{0}; - for (auto & record : records) - { - TSequence seq; - for (auto c : record.sequence) - seq += c.to_char(); - - seqan2::appendValue(seqs, seq, seqan2::Generous()); - seqan2::appendValue(ids, (seqan2::String) record.sequence_id, seqan2::Generous()); - seqCount++; - idsUnique &= stellar::_checkUniqueId(uniqueIds, (seqan2::String) record.sequence_id); - } - - strOut << "Loaded " << seqCount << " query sequence" << ((seqCount > 1) ? "s." : ".") << std::endl; - if (!idsUnique) - strErr << "WARNING: Non-unique query ids. Output can be ambiguous.\n"; - return true; -} - -template -void write_cart_queries(rec_vec_t & records, std::filesystem::path const & cart_queries_path) +/** + * @brief Function that loads the index and launches local or distributed search. + * + * @param arguments Command line arguments. + */ +void valik_search(search_arguments const & arguments) { - seqan3::sequence_file_output fout{cart_queries_path, fields{}}; - - for (auto & record : records) - { - sequence_record_type sequence_record{std::move(record.sequence_id), std::move(record.sequence)}; - fout.push_back(sequence_record); - } -} + search_time_statistics time_statistics{}; -//----------------------------- -// -// Setup IBF and launch multithreaded search. -// -//----------------------------- -template -bool run_program(search_arguments const &arguments, search_time_statistics & time_statistics) -{ - using index_structure_t = std::conditional_t; + using index_structure_t = index_structure::ibf; + if (arguments.compressed) + using index_structure_t = index_structure::ibf_compressed; auto index = valik_index{}; - bool error_triggered = false; // indicates if an error happen inside this function - { auto start = std::chrono::high_resolution_clock::now(); load_index(index, arguments.index_file); @@ -89,446 +24,16 @@ bool run_program(search_arguments const &arguments, search_time_statistics & tim time_statistics.index_io_time += std::chrono::duration_cast>(end - start).count(); } - seqan3::sequence_file_input fin{arguments.query_file}; - std::vector query_records{}; - - raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; - - env_var_pack var_pack{}; - sync_out synced_out{arguments.out_file}; - auto queue = cart_queue{index.ibf().bin_count(), arguments.cart_max_capacity, arguments.max_queued_carts}; - - std::optional segments; - if (!arguments.seg_path.empty()) - segments = database_segments(arguments.seg_path); - - std::optional ref_meta; - if (!arguments.ref_meta_path.empty()) - ref_meta = database_metadata(arguments.ref_meta_path, false); - - //!WORKAROUND: Stellar does not allow smaller error rates - double er_rate = std::max((double) arguments.errors / (double) arguments.pattern_size, 0.00001); - - std::mutex mutex; - std::unordered_map bin_count; - - struct LocalData { - std::vector output_files; - std::stringstream text_out; - std::vector timeStatistics; - std::vector stellarTimes; - }; - - std::vector localData(arguments.threads); - - using TAlphabet = seqan2::Dna; - using TSequence = seqan2::String; - - // negative (reverse complemented) database strand - bool const reverse = true /*threadOptions.reverse && threadOptions.alphabet != "protein" && threadOptions.alphabet != "char" */; - seqan2::StringSet databases; - using TSize = decltype(length(databases[0])); - seqan2::StringSet reverseDatabases; - seqan2::StringSet databaseIDs; - std::ofstream disabledQueriesFile; - TSize refLen; - + bool failed; if (arguments.shared_memory) { - if (arguments.disableThresh != std::numeric_limits::max()) - { - std::filesystem::path disabledPath = arguments.out_file; - disabledPath.replace_extension(".disabled.fa"); - disabledQueriesFile.open(((std::string) disabledPath).c_str(), ::std::ios_base::out | ::std::ios_base::app); - if (!disabledQueriesFile.is_open()) - { - std::cerr << "Could not open file for disabled queries." << std::endl; - return false; - } - } - - stellar::stellar_runtime input_databases_time{}; - for (auto bin_paths : index.bin_path()) - { - for (auto path : bin_paths) - { - bool const databasesSuccess = input_databases_time.measure_time([&]() - { - std::cout << "Launching stellar search on a shared memory machine...\n"; - return stellar::_importAllSequences(path.c_str(), "database", databases, databaseIDs, refLen, std::cout, std::cerr); - }); - if (!databasesSuccess) - return false; - } - } - - if (reverse) - { - for (auto database : databases) - { - reverseComplement(database); - seqan2::appendValue(reverseDatabases, database, seqan2::Generous()); - } - } - - time_statistics.ref_io_time += input_databases_time.milliseconds() / 1000; - } - stellar::DatabaseIDMap databaseIDMap{databases, databaseIDs}; - stellar::DatabaseIDMap reverseDatabaseIDMap{reverseDatabases, databaseIDs}; - - - auto consumerThreads = std::vector{}; - for (size_t threadNbr = 0; threadNbr < arguments.threads; ++threadNbr) - { - consumerThreads.emplace_back( - [&, threadNbr]() { - auto& ld = localData[threadNbr]; - // this will block until producer threads have added carts to queue - for (auto next = queue.dequeue(); next; next = queue.dequeue()) - { - auto & [bin_id, records] = *next; - - std::unique_lock g(mutex); - std::filesystem::path cart_queries_path = var_pack.tmp_path / std::string("query_" + std::to_string(bin_id) + - "_" + std::to_string(bin_count[bin_id]++) + ".fasta"); - g.unlock(); - - ld.output_files.push_back(cart_queries_path.string() + ".gff"); - - if (arguments.shared_memory) - { - stellar::StellarOptions threadOptions{}; - stellar::stellar_app_runtime stellarThreadTime{}; - using TDatabaseSegment = stellar::StellarDatabaseSegment; - - // import query sequences - seqan2::StringSet queries; - seqan2::StringSet queryIDs; - //!TODO: split query sequence - get_cart_queries(records, queries, queryIDs, ld.text_out, ld.text_out); - using TSize = decltype(length(queries[0])); - - threadOptions.alphabet = "dna"; // Possible values: dna, rna, protein, char - threadOptions.queryFile = cart_queries_path.string(); - threadOptions.prefilteredSearch = true; - threadOptions.referenceLength = refLen; - if (segments && ref_meta) - { - threadOptions.searchSegment = true; - auto seg = segments->segment_from_bin(bin_id); - threadOptions.binSequences.emplace_back(seg.seq_ind); - threadOptions.segmentBegin = seg.start; - threadOptions.segmentEnd = seg.start + seg.len; - } - else - { - if (index.bin_path().size() < (size_t) bin_id) { - throw std::runtime_error("Could not find reference file with index " + std::to_string(bin_id) + - ". Did you forget to provide metadata to search segments in a single reference file instead?"); - } - threadOptions.binSequences.push_back(bin_id); - } - threadOptions.numEpsilon = er_rate; - threadOptions.epsilon = stellar::utils::fraction::from_double(threadOptions.numEpsilon).limit_denominator(); - threadOptions.minLength = arguments.pattern_size; - threadOptions.disableThresh = arguments.disableThresh; - threadOptions.outputFile = cart_queries_path.string() + ".gff"; - stellar::_writeFileNames(threadOptions, ld.text_out); - stellar::_writeSpecifiedParams(threadOptions, ld.text_out); - stellar::_writeCalculatedParams(threadOptions, ld.text_out); // calculate qGram - ld.text_out << std::endl; - stellar::_writeMoreCalculatedParams(threadOptions, threadOptions.referenceLength, queries, ld.text_out); - - - auto current_time = stellarThreadTime.swift_index_construction_time.now(); - stellar::StellarIndex stellarIndex{queries, threadOptions}; - stellar::StellarSwiftPattern swiftPattern = stellarIndex.createSwiftPattern(); - - // Construct index of the queries - ld.text_out << "Constructing index..." << '\n'; - stellarIndex.construct(); - ld.text_out << std::endl; - stellarThreadTime.swift_index_construction_time.manual_timing(current_time); - - std::vector disabledQueryIDs{}; - - stellar::StellarOutputStatistics outputStatistics{}; - if (threadOptions.forward) - { - auto databaseSegment = stellar::_getDREAMDatabaseSegment - (databases[threadOptions.binSequences[0]], threadOptions); - stellarThreadTime.forward_strand_stellar_time.measure_time([&]() - { - size_t const databaseRecordID = databaseIDMap.recordID(databaseSegment); - seqan2::CharString const & databaseID = databaseIDMap.databaseID(databaseRecordID); - // container for eps-matches - seqan2::StringSet const, - seqan2::CharString> > > forwardMatches; - seqan2::resize(forwardMatches, length(queries)); - - constexpr bool databaseStrand = true; - stellar::QueryIDMap queryIDMap{queries}; - - stellar::StellarComputeStatistics statistics = stellar::StellarLauncher::search_and_verify - ( - databaseSegment, - databaseID, - queryIDMap, - databaseStrand, - threadOptions, - swiftPattern, - stellarThreadTime.forward_strand_stellar_time.prefiltered_stellar_time, - forwardMatches - ); - - ld.text_out << std::endl; // swift filter output is on same line - stellar::_printDatabaseIdAndStellarKernelStatistics(threadOptions.verbose, databaseStrand, databaseID, statistics, ld.text_out); - - stellarThreadTime.forward_strand_stellar_time.post_process_eps_matches_time.measure_time([&]() - { - // forwardMatches is an in-out parameter - // this is the match consolidation - stellar::_postproccessQueryMatches(databaseStrand, threadOptions.referenceLength, threadOptions, - forwardMatches, disabledQueryIDs); - }); // measure_time - - // open output files - std::ofstream outputFile(threadOptions.outputFile.c_str(), ::std::ios_base::out); - if (!outputFile.is_open()) - { - std::cerr << "Could not open output file." << std::endl; - error_triggered = true; - } - stellarThreadTime.forward_strand_stellar_time.output_eps_matches_time.measure_time([&]() - { - // output forwardMatches on positive database strand - stellar::_writeAllQueryMatchesToFile(forwardMatches, queryIDs, databaseStrand, "gff", outputFile); - }); // measure_time - - outputStatistics = stellar::_computeOutputStatistics(forwardMatches); - }); // measure_time - } - - - if (reverse) - { - TDatabaseSegment databaseSegment{}; - stellarThreadTime.reverse_complement_database_time.measure_time([&]() - { - databaseSegment = _getDREAMDatabaseSegment - (reverseDatabases[threadOptions.binSequences[0]], threadOptions, reverse); - }); // measure_time - - stellarThreadTime.reverse_strand_stellar_time.measure_time([&]() - { - size_t const databaseRecordID = reverseDatabaseIDMap.recordID(databaseSegment); - seqan2::CharString const & databaseID = reverseDatabaseIDMap.databaseID(databaseRecordID); - // container for eps-matches - seqan2::StringSet const, - seqan2::CharString> > > reverseMatches; - seqan2::resize(reverseMatches, length(queries)); - - constexpr bool databaseStrand = false; - stellar::QueryIDMap queryIDMap{queries}; - - stellar::StellarComputeStatistics statistics = stellar::StellarLauncher::search_and_verify - ( - databaseSegment, - databaseID, - queryIDMap, - databaseStrand, - threadOptions, - swiftPattern, - stellarThreadTime.reverse_strand_stellar_time.prefiltered_stellar_time, - reverseMatches - ); - - ld.text_out << std::endl; // swift filter output is on same line - stellar::_printDatabaseIdAndStellarKernelStatistics(threadOptions.verbose, databaseStrand, databaseID, statistics, ld.text_out); - - stellarThreadTime.reverse_strand_stellar_time.post_process_eps_matches_time.measure_time([&]() - { - // reverseMatches is an in-out parameter - // this is the match consolidation - stellar::_postproccessQueryMatches(databaseStrand, threadOptions.referenceLength, threadOptions, - reverseMatches, disabledQueryIDs); - }); // measure_time - - // open output files - std::ofstream outputFile(threadOptions.outputFile.c_str(), ::std::ios_base::app); - if (!outputFile.is_open()) - { - std::cerr << "Could not open output file." << std::endl; - error_triggered = true; - } - stellarThreadTime.reverse_strand_stellar_time.output_eps_matches_time.measure_time([&]() - { - // output reverseMatches on negative database strand - stellar::_writeAllQueryMatchesToFile(reverseMatches, queryIDs, databaseStrand, "gff", outputFile); - }); // measure_time - - outputStatistics.mergeIn(stellar::_computeOutputStatistics(reverseMatches)); - }); // measure_time - } - - // Writes disabled query sequences to disabledFile. - if (disabledQueriesFile.is_open()) - { - stellarThreadTime.output_disabled_queries_time.measure_time([&]() - { - // write disabled query file - stellar::_writeDisabledQueriesToFastaFile(disabledQueryIDs, queryIDs, queries, disabledQueriesFile); - }); // measure_time - } - - stellar::_writeOutputStatistics(outputStatistics, threadOptions.verbose, disabledQueriesFile.is_open(), ld.text_out); - - ld.timeStatistics.emplace_back(stellarThreadTime.milliseconds()); - if (arguments.write_time) - { - std::filesystem::path time_path = cart_queries_path.string() + std::string(".gff.time"); - - stellar::_print_stellar_app_time(stellarThreadTime, ld.text_out); - } - } - else - { - write_cart_queries(records, cart_queries_path); - - std::vector process_args{}; - process_args.insert(process_args.end(), {var_pack.stellar_exec, "--version-check", "0", "-a", "dna"}); - - if (segments && ref_meta) - { - // search segments of a single reference file - auto ref_len = ref_meta->total_len; - auto seg = segments->segment_from_bin(bin_id); - process_args.insert(process_args.end(), {index.bin_path()[0][0], std::string(cart_queries_path), - "--referenceLength", std::to_string(ref_len), - "--sequenceOfInterest", std::to_string(seg.seq_ind), - "--segmentBegin", std::to_string(seg.start), - "--segmentEnd", std::to_string(seg.start + seg.len)}); - } - else - { - // search a reference database of bin sequence files - if (index.bin_path().size() < (size_t) bin_id) { - throw std::runtime_error("Could not find reference file with index " + std::to_string(bin_id) + - ". Did you forget to provide metadata to search segments in a single reference file instead?"); - } - process_args.insert(process_args.end(), {index.bin_path()[bin_id][0], std::string(cart_queries_path)}); - } - - if (arguments.write_time) - process_args.insert(process_args.end(), "--time"); - - process_args.insert(process_args.end(), {"-e", std::to_string(er_rate), - "-l", std::to_string(arguments.pattern_size), - "-o", std::string(cart_queries_path) + ".gff"}); - - auto start = std::chrono::high_resolution_clock::now(); - external_process process(process_args); - auto end = std::chrono::high_resolution_clock::now(); - - ld.timeStatistics.emplace_back(0.0 + std::chrono::duration_cast>(end - start).count()); - - ld.text_out << process.cout(); - ld.text_out << process.cerr(); - - if (process.status() != 0) { - std::unique_lock g(mutex); // make sure that our output is synchronized - std::cerr << "error running VALIK_STELLAR\n"; - std::cerr << "call:"; - for (auto args : process_args) { - std::cerr << " " << args; - } - std::cerr << '\n'; - std::cerr << process.cerr() << '\n'; - error_triggered = true; - } - } - } - }); - } - - // prefilter data which inserts them into the shopping carts - for (auto &&chunked_records : fin | seqan3::views::chunk((1ULL << 20) * 10)) - { - query_records.clear(); - auto start = std::chrono::high_resolution_clock::now(); - for (auto && query_record: chunked_records) - query_records.emplace_back(std::move(query_record.id()), std::move(query_record.sequence())); - - auto end = std::chrono::high_resolution_clock::now(); - time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); - - start = std::chrono::high_resolution_clock::now(); - prefilter_queries_parallel(index.ibf(), arguments, query_records, thresholder, queue); - end = std::chrono::high_resolution_clock::now(); - time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); - } - queue.finish(); // Flush carts that are not empty yet - consumerThreads.clear(); - - // Merge all local data together - std::ofstream text_out(arguments.out_file.string() + ".out"); - std::vector output_files; - - for (auto const& ld : localData) { - output_files.insert(output_files.end(), ld.output_files.begin(), ld.output_files.end()); - time_statistics.cart_processing_times.insert(time_statistics.cart_processing_times.end(), - ld.timeStatistics.begin(), ld.timeStatistics.end()); - text_out << ld.text_out.str(); - } - - std::vector merge_process_args; - if (output_files.size() > 0) - { - merge_process_args.push_back(var_pack.merge_exec); - for (auto & path : output_files) - merge_process_args.push_back(path); - } - else - { - //!WORKAROUND: merge hangs if no valik matches found - merge_process_args.push_back("echo"); - merge_process_args.push_back("-n"); - } - - external_process merge(merge_process_args); - - auto check_external_process_success = [&](std::vector const & proc_args, external_process const & proc) - { - if (proc.status() != 0) { - std::cerr << "External process failed\n"; - std::cerr << "call:"; - for (auto args : proc_args) { - std::cerr << " " << args; - } - std::cerr << '\n'; - std::cerr << proc.cerr() << '\n'; - return true; - } + if (arguments.query_seg_path.empty()) + failed = search_local(arguments, time_statistics, index); else - return error_triggered; - }; - - error_triggered = check_external_process_success(merge_process_args, merge); - - std::ofstream matches_out(arguments.out_file); - matches_out << merge.cout(); - return error_triggered; -} - -void valik_search(search_arguments const & arguments) -{ - search_time_statistics time_statistics{}; - - bool failed; - if (arguments.compressed) - failed = run_program(arguments, time_statistics); + failed = search_local(arguments, time_statistics, index); + } else - failed = run_program(arguments, time_statistics); + failed = search_distributed(arguments, time_statistics, index); if (arguments.write_time) write_time_statistics(time_statistics, arguments.out_file.string() + ".time"); diff --git a/test/cli/valik_options_test.cpp b/test/cli/valik_options_test.cpp index 8c686d86..518e935a 100644 --- a/test/cli/valik_options_test.cpp +++ b/test/cli/valik_options_test.cpp @@ -127,7 +127,7 @@ TEST_F(argparse_split, no_seg_count) "--seg-count 0"); EXPECT_NE(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); - EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option --seg-count: Value 0 is not in range [1,29952].\n"}); + EXPECT_EQ(result.err, std::string{"[Error] Validation failed for option -n/--seg-count: Value 0 is not in range [1,29952].\n"}); } TEST_F(argparse_build, input_missing) diff --git a/test/data/dream/16bins13window.ibf b/test/data/dream/16bins13window.ibf index 057c65be7d0e85e5afa4d4a99f1a140a9056216c..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 GIT binary patch literal 0 HcmV?d00001 literal 32891 zcmaLg52&Blc^~j!jS*3mg>WgCV1@cBQMb9R6IZ>x^j$o^#&!z51!yZ1zvC{r{8yz&CHN`s}AVxHI+RAOFOA zK5^tTj~;n@5JTWBh+j_K0Jq$zPX}hRTd!9S|7>Ab-coRLU%{ns6+C-q!S*{<>DA8_ zynRQ(mESMeeupZpy)~FAP&G_WA1TE2>BV-n$?AgGoAHkQbRpWjd#b+t@eqy$$Berx zt2fU)>s^0m(e2JUuH2S>8SlbU)%)Mg*9wCl?Mn|-Y3q)H^A8kUDsdy5&4v^M&HwUp z;dlOdVEEqo?y6sUsNnQe!Qo2e~e$nlV+&=fVv|SFI3Lg7|g5zC*S^w~kMLath zzgBp^J0DjH91EVkx8U;8-~)DMU9emKtxEg8TX6GSLH3F6Li95`QW$a_!!Nj-+&aDn z_B2-iH{NUHg7^YktMRjLcDeO08pT3^-pRkoG4tOZJArSVhbpZ`U*oe?--@5y_GH!B z2RVQZUt}}#p|?|il>VLkn;*w7mf{ci+xx-vyRYD(_|2WMC-S-{b^OofMZbK1L43}p zt{#2P#gU=fcmK1{jRp69A@u31-8ek^>z_jEt{a zNB`+HTTZ+hKNZZLg4PY5x~fOP=6{_W;1{W{g~7y^*;p;h=L;T>{m+J~``!7ml)}$0 z@-q&?n?QarOMmO^=V4bj8K$AvzFWoJzZKM2aA)`u*NsnZ(=QdB{ZTv_|2F+^DM)U^ zhpSG1Ytc_(xBu$*^xkUYkN6q?uJGh>Ao}9>R};5~#GN+7SS9PAKl_ik3r=5ZJ+BuY zOx&>UjqI;`A6cZ^$bQEz&1YQo)@!QL5dUxX9X-nzP97>ce)LNA+e`nd@ZzPo#V(&- zt@cAdQ_!#dmwsFS5UkQu_|Q*Wp>KN7zAN@)o%A_mzhN)y&s2}zPmLq~s2lH_e^!_# zCj{UTg@T78r`_MG`h$^|{b^(0B0W2$M|oGH8e8dl!FtG%|M4U80rg({>ncrm!xSW6 z>us%LmFU0Wt#>GleOvoH@kPH!yM2gV@&o>LGx@&%Lp8GfnttVzZSwQ1zXERggbaZ8np??S2ww5$=9qvER}7@szl^2Z|dKVcPeq)xa>&$7MIwKy755c(ueUjezPdX$jLcLKl#IfFNSWV;O(CX z#Z=JvcSMi$3R*Y0rT5JF$o^p*`yhaQeW3Y(VLzpp)%%T5P#(DdcNX!+mB-2tZ<7ayT)wIIxRTNsxF;}REAgwBtNs1y zpDRN9e%rTVAM|8B2l)H2IL|n*y10L=JFi(E`)Kj*uIjfpaZa9xp8wLP{@TZ<7Wq&k zNB!u#`3=HBVG-)L^km@=K30$&;9orL_XYBzpF@cw4{5ej)TT-W0tr_PVwo z-O^(g>(*u8LrTkR!{AND>l-XzO zx78JfDwzMpLshp=%A@I1x1;eFb@naqc)K?#z|+8w00;i`#L*2_(t`>&o=+ffX;d1k$LPF^25EqUoEF|K6EbJ zo&8cj^Sh6L2b)~R!k`yNkN%+W_7VN?)9QCuX*1^za&kTc(f5!)41ZiL{Fr{1&KDkE zY?lk96aQ?8<2FujxnLGseEr zJsSIBZ~Wpyo?<$WyO&{4^Y{(9AOBE|C@%jn=RA5epLMIR-Cw17;u*W8M|9+{FZ005 zC)~r~2cPidr=NAQC(n&uyEhb_eLNpM9giR3j~$qo{Fa&ptGeeDQs&w|_T^}2XQe|oWF`_6qul`V} z;Dwy4V8z*^d0)fx2JQIr-a>C*?mfisi~pv2dj9LVfOzoUFIIbh?!FXzaxNF|8XqY% z`D+-eFukK7`ERApey_Z*+MUzTwf&@;+r6M_lkZrVwdj)^+|w<=Kda{=B=@t?U$d)_ zo%q=*(6`?B?D6zWPLJjsJSGo-CwF*uY5Zf~sgnD#ecgTI{%#*X6h04RKd}B|$p?(P zJLlo)uENaeV?fctX|Zm-^XwP+UKe*lPL3_#4xT-J*?m>;;<2ZxlT@vj}+V&Ixp>d zyc*p1w)}l8yg0FPECU3Pzk57!*|`WcdD+jcs31JOw0g*zTy8OP!4_A8wl9lc zueq<$!d}R0`kI!&_z8d4 zI$o}l^MQQj^?M6HqO0t z(6P_czl!1???--(kC2c5X|dmxZ&bbUFP=_16nsAGZvLO<$0Dy$zrf~ChS{S~5MRzo z_6PBazS66`uVlXGKU*!mp3sjgRU%LKN*7|6@?!Mt-nkoI98tHQ(}RAm{%I(tf?Mum zgT{w`Ee?$#2)ONmDy)C9Ah{gaP^)c-OF3h%`#8_K9=_W`HbgZ+MR>gl{j_gcSA80Z0il0-}9S3zJ%H1;xje+(Rz5-l*{tezHgL1FAy^Vq}Ye<-~7tnHuX;jiBJ5O1c~5&e_H&OD!> zkM{m@80zf-pX^(|L%*)&{}0ENY8OZGD=vx0^PJ~eUepbH@v|LbsQO30yohi8_Y{8P zKNoE48mex8Y4#2uxsek-#kr$zt^R(+!#>Z`elsY3is$G~N8jq~PkmSJGt}vWpTM(s z_SIkr+Wa~M-jn;2)h7yJUqeQY_J8LW?eawD@P1x?#NYAJ))~DSuZQO657yt)$vfm_ zo_G2V#Ja>q{zTu6pA5YF$tp}a53pl-Oq+jbrRxRf#6|42+Ef_NEjIqJqqv;oU&mI|}dnPW$LL*9-4gKCjMh?4R4@JvPcJ3?pMc;bCj`kFmkbS+L*wfUnez7OUe58n5z^W=f>_G@zEALB|UDA&0iV+)^{>Zyx&e{*?85enKuo@*&S3`uza^o>MNLFU;wyS^~#{?zQbB zO+WGb3#W>1j9oUn84BruN`#$zTZ`Ml=&pRKCnJ@O#_{hZgcHfF0 zaO$z2)9YvYy`r>#xX(AQ zILJ@rRrBn_P0nMDv+KTs7xNsVeeaNV_fthRoAviu^lyH2{7D`3-n)BXc>YdL#zjw$ z?4Z}5ajbjuuL^@szV2QLz4)NsL-7ut9@)XpF7Kre?c#<0p7X2sa1^f+Z@0DoU199o z+S!G=b+ZR`>vu17=;`XQ9Dnuv!E;J})W&^0rSM1Z?t?y``!eg0KWr{nKlz3C>(I&n z=&zMYsAyLG8W%(mG=wQ{=xlarD(^ih@7dXY|{4?n%Si`4lFU z1>K9lzL@>uVD53@r!Q5>zK?%-z4cq)JnzN6{?6+6mE4=nKTvh|clcjJ8k+V30pR0M!FJ8(|hTJ<|$UO{teuWSI1V5XjEGQ0WXNS&1*7wG>zZmnA z|9X8C_S)}QC4NUv@T++)A)dCsV-Yzz&$RKnf^*LDt<|vm@q*&9{0aY_Q_82;-d6O^ zp^d-HbFMpo;9H#O_gU*O9zMM9y7uw#nf3M0$un*H?zYdr#Ot%44CNFUyVM`#H{vP& z#V>T)+0Q+Zi~MXoazNMgX#jAj!qO8E1^YO1?&k_`JbX1jJhIqd``W)=>?dv-XGr|$ z*LU)G^<&TCMIX1D-|s2)>3TtaC?6s3M%B~5 zl$PjmHPmIXC8SWq75{uN*L3H~R)C#Q_kz_?OH^i6M5NCNF^ z*2B)&|K@)Uy>6XdPj^>?_-LIgIe*EgjgN27)h>1Khl(fkC7v&GaiiADpS;f9TQsM0 zPCWVUs@o5(+jG*j^VQzwzcIx^sqKSPp{*C67oD?Wck;tSUHt9&Hpwg52R3q!BM0~V z?eC-L5ZL$_3&D=)bK_%$N5^l(om1gE!?kl5|9~H|t|o^;OTm^eC2w+%i;6t1{zYM& z580{pqF1NK>C@H1uj$)5TfY?8q4Qhc{?WvtHh&1%h56e&D@AbNfr9MVI>|S~%$9dl ziM)*Sz7G|Cp7=`+^uRvkIS>Du5ewP}tdAbegL+Jya_+Q0JeNG!`kG(Z57LkSHoq8* zT##JdTeQ}(N<-p>e2@Rn=GE>#>)^$z`yJEId7a#>mwwv_E(pP_ep##f^r zs^qz)Pqlc`Vb-@J+t%ecoIvivGQZi>rDTq$hu; zmOsti3rQ_jK)^#sGoz1FfGr zzSyJohR^z+%lBZtJ&9kwJ8F7~9_X><-HH43a%cXop6>|6nQd_o!-1_mb>kt!zdHMB zFb(5M753#^z#quLdhP3=ziYDpTh-rlO>r;7&sHB%DrjGZmA~;Tc(C~cL*wHdd+UgV=Bzt{(t~XyZe#0 zj|Ts8m*4Ns`LxX!I`Pb~?8Lh9&0pDve7E5xx#*sf+jZSViTOG^ibs`I1iQ&pE=d!OAobMFVLy~%g1y5G+yzvc(x0=c*cChy-! z9J4NT#sk-#*N4Ke_vQPmu0Q(s@2Ju__r%E`4xTD}JC~<}`*i)~Q`T?)(+;v9_TKPa zzS8h3Rl;wx%P|z`=RJq@+ykRGkNCu1nm#g&_aC2sG>8E9xQM;>`d#)pc=t%=%f!U09}>6iNBs-;HM*R&&eA;^RJ#S41d)wzZbvZ z!T$cD={w_kUlF}~X7zqPd7ke;ozs@WAG_3l+dB7Di#!NjI}Zgh@8TgpWDlM{JpI?z z;nmzf(I5T%Lhhsedj#$y?ZflzKkonhonLXZJ?|W99(GJF*2g}>egn?yYl&n0iamIb zt?k?Ci7x{=-?VyXzx{VS==cM9oY#+o{&7JyQ+utwf^3R zmnWdt&c255cW!UBwC@+ks*|_z$#sFpQ$?gBsY-U^oDoeq3-=}`w+R|w;#uNy&R3_ zIR`p+rrtw;Uk2WM#-n#}2c0@Qg{Mz-^EbMwTI6Z)<^x~Mb7XpSo+h7Ha*u_N+w+{_ zu@g1E{e@r3w~W7fKMo6ep1p|ue`<+;Fzw&TxC~QJp8ja=bF7bjp|&o1cdoF{P4QRs zo*%3G?$P-0>_Wafu2jp(uNUn99jDf>E7&LGH{^)SJk!{Kk8seLU{-9Q~&3gU6ERwgQHWRMHoiUhJg4CgFIK6?YnQKxQ}FoH?_Wkf^KXY@C}=(8u=+&d z$phbO@hAGsDDCfbuz!BZ4jz0c^L$E=g6xIeA5A=dU-Ec*nq|KyU!!<_hrh2RUiiCD z&fVm&c0N6)f@ku6mOQ}vIGQ^(k?^korzmfZu`LXKnS6@lk5*kjP~OP@*(d(^7d@%Bd}49B8-EXfNJI2=;=!tWugf@X z+}M-v*7(6j_BnB%Jow@9__sXVJr=gkMW5;*I&qpD*}L;PKJ^pVrn?F=o+^@7wM7J^sSJ!i@3TWpSw^wEpp>#RmQ9dn55d-t1hw*lRX> zp29%0Y&lb+tV;%kco-T9K-n_R~l$@*IRPpv*G_~rYP zHhv23A*@sTsr0TbbpW8Ox zSeUlIcLd%~_4j+p3mHG}{ZF2Z&N$*BI|b1V(Gz{Fc|Mf9#CgoVre6>3|MQPji~SuP zdpBRNKl1Z_Bsp7u%h#qFv8{K=48e;l{NBE;?)#Y&(Vu&@$D#CfK47_1_&Tmer(^N5BG=SOdDsY0bbAdO8n4$e48cv{~7B^ zgE-ds4V^kZjPv6+t8_Nc3Cu@N_F?+Or}yRDOS4aUao=gZ+RY2vpVh??e7qR{#W%gU z2lGC|5PvXF?>~K9kuRBdZ_bmO|2vo=a8JRt9jTLN@a7d?`FW^k_BDB{`(^E|->roa z2b;X_D!jPS`)51njcvs1g%O`Yk55&>d*SS|AJ5;H;m4Q0S^a$PN)OHp^ow8dfAdoH z8>5%7G20tEqbK<#`$XsOeyTS(w9@s0_`d$Z!qcBBt=e_o4ys88Mby`N9K^xdWYrX{Dte_0rQy^_39y^lNa z{M`CMa+e304_<$If#-kf?VJ+-w+~pqasB@+@_5^}9-eLqYaoUv2Up y7u)GyzQ&J=>e_GTm+;T!dB9foVf)tFlSScs9&ypQ=d?s)RIkjF)SyNoO+*A!CY!c__v7;fyBHp;X#5 z4r-Ka$`CqDAHsbIB_lGyi1<(t9O7etgdsE{Qt}|B5xGO-gKsLKNPWQ3;bk_g1_g;JLwfArTe)sev!!Z2J;s4(M@23I`|9g*+j~Lt#{mfHOf9mOrUw!uC z-C`8ML5yD&je|QD^mt|%?mQwl;7a)7142f@4yYGHaL-)7fSw+C$>c1qB!n+?jHk0(BcM!=j8zR3G9liatq2~Qe;~Ln@y8-Dx^4|E7?`nS8Ki~sdKg??^$3{367&N<-STrj#il;{_{f8-!>yZP$Ks6mgw{uIU7-hLUx#++<4;q1>F_kIv>_&mpX9Lc*gHm|3Cr)>v6 zlk-3Qd$rYv%6^N1ANE7^R>Be*^tY!^JwSfH#xxa5>@|Vnzd=tCQOulV&hh+a^KK8XAx;lE~Pj{Y5SYG`^z_e3;JubB4^8+9KUYyYD zL7u;;cfESjvv+>^^_D~a;NSNJ;^p)6IgfLZ-_E>|4{htiKz!hnFVH~r-qAh|fBcoe zUetVXJ{*cPYQU=c!+0efQV(_Of9{8p(+~WmA4AYHc9O)~_dW8Mem-pV>%<%3;<}y} zdhIY@&zn8%W=DinyT9uB#g6{Xe?K@~eTygdK-O2r`67~lS>IdsK)v#FH~qq|nGOH* zso-p!4Tyf~t>`a$Cu9#zdp@5}+P+JEvL13iUd2Rct6K;L2Et?(kqXL zVB>d9VKf@j_*+jQ2uQm#?m1fV+7#pa(a+0dyEXmxXIgqCRYHTj@Ov)oiT{1XO>xS3 zsL})$(H+1%vTpO7CHqTX48wbAPkzXk=YA80;F{pY&D*mpo%0vX(I03a_nFK)<0QWZ zn>Wr0{+yZjM-|V@A@k>T8UFbngxwk41F=83w^MJ5Gqw`ui~QZua~l0+U-R=I?YG_* zRGlsG4`Q4n*taKfR{mjwoIHi!+ZT~LEZ-7m42a*&bEau>;(2XPdL`f<@fLgjeIe>U z{>9Dulk}L_5z>7gtD!f0G$45w)G#nl(D^<=4DSc~$CF=>!WeMfc#l54+8O3rp_BcwS>Z-LLy`IeycR6zG5ITmD|Ao~Od` zNXEBHuLy1n3hdn}q_lkM#J=vgOgdmkzo~<}H(hf%J=r&xs}0X;+_Xc#i1T^sb?rD1 zoB5{SU*i4I3Om~C%{Q&mKiYwxuq5`O5Z)UxZg+A+^E9{f19{;5gBADUd43z!5>O;{ z-0QcuBlThOKCFbq>oNLs?H|xU_8E`!k9Dr$JdpbshAW`gzxj7%Py(`UantoLee`O? z#d9god2YYZ>|izc)PW1ycQ2@)S(jy=&!Oh-pJi8PCRmdk$!qB!Kx^I(9t?r+H=JLx znK7{=#N$gIun%zF+x&&r&Z)O9w4BTdfvHW%d|#~kGEs0wX&GLxcJG(Gwt7&L zbCc<1USQy`1S9L&7vvHGTy?GuKozpy@83+3l zjyQina}NW(3U;5bWk_5n{gK?~i~2#kJfA>2WZihadR*Mj*Q?8&rrtA-9eb`opYgKq zQ#W4uH8eU<^NoMB{JHqj$&h6Ki!^~X=o_C5==CSKi|_x)3y*vH$3XDl`^M;yjDz_> z&F2YzodxFoeM`@?lka0o=sTbIhqV9iyLNwYVD&sLYhXSO2qjQBA=&YR@-WPRhH zfEYCXJ$JQo-Ybw_wC{O;UFW^l`tWsB=N!Vk^S-M$Zl5Qohg;Di&qt%a|D;_%Kd{fT zZ({HLhGsoN1AYAP6V|{ke=}gVhl1pVJQs~P?>mzb2>t2J^Q0ZwS3O5@ZuWZY^G>rs zf36_D$cOE~$jjslDLluCo;;TnvrwHf?%?|h@?QQRr~W|WhjruUAAj$Ko$nLG``@1n zjY)pWFx$bAU_CH~NcY1&oZ8C!q(N#RPpE#w#vc$qkbHh!_gU~AGH)Jl@&W&KCd(4g z^MZWL{Y`qZ8t|WaqkOc~NMvxZr9-!qeo}X?! z6cW!D;_G&vx4b8$zM@aOGp{w?)Em(ZILXUma-Ac2erlzM)M%U%G;#FtUHj`+pL}F} zG4JQK&Zsw7y}sgk%<~VsF03Khe59+<2*`KV2m2*T>`(cgv6FATe)|09ywQLlaWeMe zyO1)G95GmLNPT|jm)Ae^dk{O9zhd7{dB0Ei`~4aovL3j@QHg^{Vc)AIEa>77Z4Ai=TjP*Kh6!<&Gh`>>#1iCpLIvOocoY}Sm@6g>BrUs z{YJj4a}?u%?~u3yd+P){^q_&n4?3UYt)KLZdr98dz-BxsNB~QbW<@t=B_JdYr%bUWfX`^LEbP#DN&4CkX!4k9YL8 zkGFFv<92?Xiz9#NWxO#M17ZIBoJ9TDycv3oJL6h{U-W#6Jh;dFJ4yyL)y=1jAu{)o zDn=)7=Jvcne|NQx@m}@4Ff@>RA!yXuY*o!@z#^_Me5s*BSK&JN64`Anil(p--rO5GVNgcjhCUfLnLAG}K)8@zj6zJKCFk--KZeWHL_iSx=}F zH~P!DV_)|)___a|(&7j~A8wvAv`)((ha~HDLDOIjn)xB$(Bk<9ntHKo_ek*3<2zUS z55Fjn-}*Z^0ew8wfn0C%R-e2O!6#1I{G&}qeC+^$!G_Ux{ziWWuu(k~LiR6xn9gAc? zrR8-7$&#UC8sV~TJfxeIN9G%=682g0tlQzWPM`S#t2TeSQ z3vM3%gTOBTOF;4ndBRaxNzSndesY~lJNsk%9P87aa_>W(OnD8=$AjGS7ykY)f(7(C zozg1oVsefU#znpLdFQ*&)@t!R1LJo&^yW>lfZP*$z0Nw=drz)@&s7)FYYoi(vF~yZ zoccmGe=iWmU+w(m>vJ++@3Kwd-Ot+mQI^WXRrB)R7uuHmy>=@d zhJ8Dau6ho}o^v`$>^qj9=5Amg^1QkFLuo3WtV_;;?x)&46Lkx{Nqs6vRzS7y^*`5%=J`PPsEmVo z1$rw*buIn4f@t;)>S0GM+30g__H|0#V;>}M{C$G=n||_KmJK?hczHj#??D}ZzP^yJ z3;CTT_Q{@I=EE2o*?!MpO+NF;dF{gCFJ-RtA9huFKpc4Q(|aCw-}r%lRqv&Jog&Zn zVDzB!I>jZi7U}Sb2kVw~k;ZN(n)g4Mr>b`o`*gG(g5O^K%=q6C{_4#9R?&hUU-D~Y zKBfNY-wI@|^AqiJj-jrw4!QS{`Qp2XWGqLj)*bnS)>ZxvvcNGYyH)F&?{<21h&=T1 z5A$e`aryp0KGy4@$2`*xafKgRi*qhC_Ycs({5=ni6aJ(wO2GGr%v0(M$=`p|enY>2 z=w}>8ILAZAInk=Y!5{HLk37ou3zFq9p9d9>{CihZr#;q%i8C*0mwxo>D)TwqWcel{J`DeW7(a!VyojL;MMRc+H{quXJyTpAA3+i6lUrg)>;hwwY_NjIA1rNkC_lq$k zMi`q_wEQ*!77EZRrKEpZ>3Xe z@?Ej=1Wo20I^h^t=BWfepX7P!$&UI9-{bb!SHq#_JM;9;o*EQz`;?Z%8if59CjEe( zK;LIkxCPQeerE)~xlW~)ag|@*OH#*Juj#cUc7))! ze&o6-tuA%b$JJ^bQW_ug=5pddKi_`HHAEM|F<|yfZ2x^}L&r9zQX7smy z4^F_Gw;D9x#f~qvdQLs5_Y=hBG4hmgy8(0wg!#=nh}>ZfhUc-XWnQ>oO`13YduI8Z zUP6QMPxi|y52IiY)DV2;uAahjKJ$sR7oUtXzhj4iK_7ST{@cuw^t#}5Dz1r?yF-z6 z!+w1AndJ_A|C(sk3FnBh|8(%Vr*SRnpV!4+dz|y|<8g)0zMXly9TI59oBD>#bv~tT zW_?d(DNBABe z`rC^)a^jxxsCaB4=Qj2)+Vk^h`jNTQllyp$zs}qCh!guo#<^8X>IZ)x=xd)$))7)? zLG%8OxVBo0-w|>?b%pV|9NNDd=bj3=$ANjtd8naTw9b)Z--Dc&T%P~G7e$iofIeT$ zKkX9dJwH!b1Ml{pGo?5GdLYC*_tg^o$@)|tl4^{)z2HG<()cO#92(u0xVltA9G1soU+wJ!D z@Bjb)gTFsM^4neT^~Jw`d-(Kl|KsQV^Go=8;qAlYKR#bv!L$GT*O%MvG4}EjUtIWp z;3eN)WiJ1Y;X*!hoQ=O|4$X0W(EQD1aKC-@`ukpUc{$vENSCYbE?A%Ypt)YSbiI(z z9k6pg@3Y@NI`o|uw?`gtZ|CmdcPAgLkM16G9Mbu_r}vr5LAqY(UdRXOu;0;LbUAm~ z-+gk>9MbjavB#XgeCEx`L%xOH!|zTy4t+0kxpH^&yHk%`eK;QuLd-Z!C-^!aePlxtEdvLym{O+>PUEV3L*Bpm@`e5hw z+Q%nvj=O`7L-&|>AD?}Y&m8J0caL26J~bz=AD8bQ_nP}&xH%oy>rNy(nKl9qNH}Ip{m#_R*We`S9ku7l(4b107b^ zZ{FX<+}$`oUNpC_@1jF@%R#+3pPYO7>@mmXAwM5%KV8mus1LVKe|2-ceC_YM`0VE| z>WB5y^}9oldm(?3AL?^wQJ#+5qX&m{IqxxteCD|K*@v6+!|M8>JM?(Besf$O|LSQD zZ?1X1b%(q=p!dq@t-tS(Z%&@>E@&>V*FHW-cPFl2zP<9$J{;=7+hZ>s+6&!f-oET;5(B%H#6-%=JNc)Q`*Q>3+O^z4G0MH%Hei zkMkG3n;(bncUSXt`{fqr*JE$@(2H`=+&l5^ahEv`?bi#ddl!_4bh&c9&^>(g_LcK@ z&&u;dKHmxIgYL5*@bT(2HJ@22ya)4NAM zzjxHvd2{y2+lRYP4z{;BdHwwI?u7h&P+qQ_4)s7i-U013m)BpPxn6S|`VM;QJ8?OE zP|h8=eR%Jem-B8qZccXx)Nfz?=KOm2pgDhibl9Bt>2VLNANNlC%K7v|d*pf-UEbV1 z^3~<&(BAg()Ad68VEg3zZh5^>9_Q0zk3N3)^O-kqUVruGdQZ7Kn$t^%_CtEnT#t7_ zcbK~aUvqiiQ?C1Qe!cF}-yZX#KKsn+`s}0g*}rso^M!m+ue%o4XO4TnK6>vqw+Hg! z-hbL7*Iqe2dg<$4cSAXQrhY!;E=z#BybG@D8_7%=xc|LR8 zJ?^NE>*0s?K)&w5va$0gYx|L@YDIttJ}+`*E{6&R+rOjt{3Oi z1LYQ%FY>q7-uCkAG50>4Pmlg`y6+{Y2lAQYo$H~y3(D7r!}inlLO%Vt?^A9+G?#~+ zm-D#;=d-_e<9tQ;=*40Eeoy=Px`z(y?>>3Ha_@7GUOn!@&7nN52X?Lp^1;=^Zw~2@ z?!7p)$6a#JoFDRcE?1OuM{|A$zLoFymuoMdxjx(;Ip{9P*Z1<kydfP+iFUorlZV!}eAAj}bx5r%mbhq!z=Pr3%FMs#Z^_=eJ(_6GxuY1a?%jq*; z=uW+I-bZ&YfBWR<>kfG+XOEnHknS$Iay{?&P=ksodSo={1LX?Z+WqpZ9lO zU+41OtIv0DkGu61`C3rSSJM5*qUmvaqhkW+p_T!LG9-6xcn#1bu>OOnueDe13 z;l6_$-F`aWK036wd-UpS55FG%a`&3+lgIh&q070m{Wz46Mf=<(&xiBT-LrC? z^TEDj`Mtyb=H=Qa$L}3{a^~Iz`HSZ6>0G~FbNisYJN3%T@!@>><<03h)Wh!%ecelk za?l=}uRVP3D4Ih(?p>Up59f#Wc`vNLoDb@QdhN%dJ}3wIpzm0}IUUy{=T68E>HN_B zP(Q2>x1SI9``b&euX=mkqerhE?=gpZA)g$deK@~8c(H!H?#1QoU-SCqpgg2QdhhZ+ zKJ$hA?tyfBpk6+@ez`TLpL}}P?_v)1`QGj^x7WPfyYxUl*j_nv9MbKj_g#GY`Fe*Q z{=Tn%KDvI~9eU~Z%iBZeH$S1A{kWWd`*C~Z%pqNmUUT|Fc{x6Lci`sy@{kXAKfk;= z4&}?`pWlrQJEM_F&VI^)%P_bS}q-_x`od{@$fWFCWyew|P20&gUKU`s_dLk>khp z$`|Wfem?!~#(f`Lj~whAcNd>NJ@(P%_|0+XJIm>lx0es{>ya~ubjWXR53J7~x*Q#v zw+H95pC9)7(D{9brOQM6?59J$tB(%(^|`0IeskZeJ#yxH>Ghk_?ZNGX-fzDAeD-z^ zZmzFfPvps2oa{W**t{2w}^~t#pniuuaalJ)zdvHFwzM}WL zOCHz5XHM5g#~~kVFP$IiXW1EldI0({_f?|XP-WE+`PT(4!I(~ zdseR;KXjKn+QVnA$DaDl`OWR=oX;M;e7L+me);8>=hMTl2j|1>lY{2^ak%~ts@vBd zyy#xKJl%e1U+!X{Psh4$nmYc_3L*pY+f(FeSCDtZ|+{)9ejBAn?pWa z??QJjzdm>Aq2mj^*B-vUqnzI!b9p-Me)~F?*9-aNAYDH+$L)pm`smR9-epc--`yPQ zgYIA49(VH7as7QCZr+@`&H3AF4&5W~ey9i6hr0vU1L=C~mD6WlefprgI`=MlK6<%4bca23IlX-5{GIce>uC<>$Mwm32cJ2e&mP>IE|1%{bUwci z&hHLAeCF=J<@r13GcTG$d3#}V-Ro|7eR6vG{mkX`7WKjA^wZ_}adUUz=KT6_K3G3r zd#mG+Umxy!;PyiPa(DYZ?BRoSJvh{Z%kjGpn)5@toL=1f^iO$k&`XUB1W%`SJSA>D}+Un)CZUbZCyt>xJgaZ|-h6eyG2GTuu+YyzenD+6z0! zJIC#DH@)cZ%KiNM^>`2D*T)a(P_BLUnU^=u=WhGlV-AFcelKEnd9gC71r`-3V&H9S1NH00eHZznyj zoDZ7YOV@*&_xs30_d>cnuD{&->@jctq?^0nJ@)E@d~$Rg^0|`^^yL_1ed;57HqW+6(pJeC}uuF4z0yi|)5a56(}wU%xpH z-NDb-Ub=qs=JdH2hkE4cdg$&ew_gwBGndC9zd7z+oS(1#_4R&vIaptNdIv7gk3;*Q zyZL=*b9b22o72ya7xlN#{dAmPuen}xcXh7EoWE#(>eUaMqwBYyzEF?vD(4<^y6-{f iE817Rx%SDy&fV#qxIN``dz+&}`}DX2>MP3A<^KTv3y6jQ diff --git a/test/data/dream/4bins15window.ibf b/test/data/dream/4bins15window.ibf index 2be7edb54feb5d59dfa3fa93e8d06df7d4d8510b..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 GIT binary patch literal 0 HcmV?d00001 literal 32891 zcmb`{YpSHx5`^Ju5pUpS2F}1g4nU{UKkNah19WCy*!8sVLa&#r*{u7&wcukPM=;Ks@JCv?L)5)>!*Ws_Q9b#?ECk< z_==sUUq4;z^yaUA?=6=D^^4}H(|4}-*N6Q2%_*KEJohXvS0Bph-s+IvJL3BARJUj6)kkMBzK@)R^7xldo!)$?5AD%=&4KdxZjqnfJRCN^ zKKnyHyt(}P^sx7;)AOtQez{^x#p_F z&ZAT3hkWMIsoM+JZ(YB+t1bu9ozOk@UHa}}4&;*u)uB0jMLPAxPj!Dka{9i#=fuq` zx`)4W=;^$Rz4W2J96tHWr!NQ6L-kwqj`pzEt-m^X>h@as-lO;N$yqx6&Q)(td3C*C zt~$Scp#Amb>f>~H^>}skkPa_;@9N~LLw@Z-4iEt=DIsz4>0n?qy#&ovXgk z-u7!ApFX5>kDMZZ(H(N+^1DNw-#mSK{S&_GGjHXtzWeO&&Z1m8sO~3+HP;A zbIa-Y*B<7{Zw`O4dHQmoI+Wjc!0l5#4&~tdxV}7nxc1YRQBsEhkWwPTY5fttLq$lnX9iZzera+`Sf9P@$RDzPky?E{vOS#&U^X}`Z!dVhZnt* zy&;`>bn2a>-g)xsU;5VT>pt!CFRb3aYtQQC(&Otr^ozc)9Da5B&ZpOh-bD_byxz$N z`S8vyuTH+YefXMBr;jgm$LhB(7k0k9qC3=AZu83P!+jT=&z=kEyO(|Si}bz+UftsM zhx||-sy9y`dKdTDL%*o+ z9=_`IZ}(U^%{9OEayqDQ4pg@zZMx$5gKb5^b#dq8>R=0W*5Y+a7N zxpeyG(v_>Pdwd7Jr87@nxg7Q8*}r~1s9%)Zx;d@m-pl;X(U*hstK<6iX`ilhTDK3> zmx~vx@Aooy0Wn>h8tmGMV@UOmo>0tG^`|UyB96I~a)2Y`_rw&(Lzi40c+Ap_{IsA0$bWk7C;jrKTRlj|p zT-f~XfkVC%E`8s%y7ukeJI*w>>D0RqKcs{9v=1Flw~!t#U+eyE`0SzH-%b7Y zq0^VA|6fsV{jJ-Z&ilJVy~sz`+@){5_u|d77i`Wg_s-Ru+d8h#k9&vi%jbP?=nnnn z(G}@n`?y@3-oEPgXq_I?7u^lz(Uq&iQ=bm@u6~d1i<`6dln?3n)uFjKG=~q$v;Xqb z)A2!l^NMu#Xn%cYy0w>m=!$&iw{LH#kMlu&Se-hwpFPz34sz(}ybFK(bdV031NqlI z&B2TE%+WvP(m{9QP!6PnbmpsjFMZs;^3?fpeg1`fusZWvw7YAd_2s=+&cgcF-szA)jUw`+)>D|Q#<(gwY$89Td~#s@^~v*2^vyfn*<3mHXkWe9Ipy@-+rDu5_}4z(fv$Oc z@?X7^PmVcI9v#$&>h#^y`_M1sTYYu&aA-bW?48Zg@4o8I!>eyDZaFzt5hkO% zPrf}Mee>1%aLC`jx%%|>#o=w8&b;Q!gZ5Zl9lomPvxoPypLeITKThAeJ>=3?httzR zef$4xm`*iQ>aoD}&Li4Lrr-O2FSbh2O@fW*~e(TNc`;^Op_G~Vne0_C( zNQcwor{6VQQSQpq?;SX7u6U8`#w8z?4UA`P# zjymo;yW>=^-(HZv*gf<+cjc(N)82BRJ`OvF4${G;*XN_d;kvi)S1!Nx=IN{BMS1qZ z>H3cDf%LHT&Nm0@Lpck5=l1QZE`QbOg>>+w>pRk)?v`&)K6QRvzqsbpwclKQ zbL`XaEzkSox481u&2OH0(0=w_etLQI_zBlO%SUe?=)Ko`eYy6Qr_QghZcg?33p=+u zdFGi@ADuc~^YG^Jw~xc>)S+BDd*F24lTW|M58E%N^S#=)mpV?*57q6>2kF{x-JGS9 z2hFWdoer9(?_F`ah5T^o?aprXs@L=clqrDSI+XcUcddS zTfBPj(|Y}#>-Pus<<*bV!B^+fo74AKH?P>fJUaI*t`2(#E~oqPolwq6*S`DsA>Hlo zkd67ei7cT$}`eNmsUIqH1wqO&hfCl{};b#wS2o$m$Jp*_rL9v!T{`_rj+AALxN7rQro zbM4JJKcfPdoP?1hjg8f%coyR2j$YMyF%1&W{(}-96N+$L&wY-}!WW zP@T_xINjR6bw2w*I`ib};|uBB1|{=y4I^-@8ezU-5mTjkG^P*db#>7^1I7^>khh=@15J1xBTYT zf6|-7k3;_6i7y{Noxk6co{rx=a@3*yaL5PM>Ak1AeD#yAdo~B}y>eTp*N5ix4thBY z>Ez*Zad+eNP~AJ(2kIB)LVdYVu0C`Z4%MN0v3FL#`u1|4{p#;Ly!W;)*PQ06+mF6F zxp2+Zm**aQ<=aDF&WYO-^4U+WKBQ}39qPl@aeG2_cjF7alfCio%fIUU-Gg2(UGr9* zuDR;4_nz|XBiEkIYh53j)A{Y=3p+ Date: Thu, 31 Aug 2023 17:52:57 +0200 Subject: [PATCH 10/11] Use shared pointers over underlying sequences and create copies of query records --- include/valik/search/cart_query_io.hpp | 4 - include/valik/search/iterate_queries.hpp | 266 +++++------------- include/valik/search/local_prefilter.hpp | 1 - .../search/prefilter_queries_parallel.hpp | 13 +- include/valik/search/search_local.hpp | 34 +-- lib/stellar3 | 2 +- src/valik_search.cpp | 1 + test/cli/dream_test.cpp | 4 +- 8 files changed, 87 insertions(+), 238 deletions(-) diff --git a/include/valik/search/cart_query_io.hpp b/include/valik/search/cart_query_io.hpp index 42494414..f9fdc5ec 100644 --- a/include/valik/search/cart_query_io.hpp +++ b/include/valik/search/cart_query_io.hpp @@ -36,8 +36,6 @@ inline bool get_cart_queries(rec_vec_t const & records, std::set uniqueIds; // set of short IDs (cut at first whitespace) bool idsUnique = true; - std::cout << "\nget_cart_queries\n"; - size_t seqCount{0}; for (auto & record : records) { @@ -46,8 +44,6 @@ inline bool get_cart_queries(rec_vec_t const & records, seqan2::appendValue(ids, query_id, seqan2::Generous()); seqCount++; idsUnique &= stellar::_checkUniqueId(uniqueIds, (seqan2::String) record.sequence_id); - //!ERROR: infix sequence is copied to seqs invalidating the pointers - std::cout << std::addressof(seqs[seqan2::length(seqs) - 1]) << '\t' << seqs[seqan2::length(seqs) - 1] << '\n'; } strOut << "Loaded " << seqCount << " query sequence" << ((seqCount > 1) ? "s " : " ") << "from cart." << std::endl; diff --git a/include/valik/search/iterate_queries.hpp b/include/valik/search/iterate_queries.hpp index 155d863d..61a750e6 100644 --- a/include/valik/search/iterate_queries.hpp +++ b/include/valik/search/iterate_queries.hpp @@ -4,6 +4,8 @@ #include #include +#include + namespace valik::app { @@ -42,219 +44,120 @@ void iterate_distributed_queries(search_arguments const & arguments, } } -//!WORKAROUND: partial specialization of template functions is not allowed -// primary template -template -struct iteration_wrapper -{ - static void go(seqan2::StringSet> const & hostQueries, - seqan2::StringSet const & hostQueryIDs, - search_arguments const & arguments, - search_time_statistics & time_statistics, // IN-OUT parameter - ibf_t const & ibf, - cart_queue_t & queue, - database_segments & segments) - { - return; - } -}; - - -/** - * @brief Class that sends query segments to the prefilter which then inserts them to the shopping cart queue. - * Partial template specialization (split queries). - * @param arguments Command line arguments. - * @param time_statistics Run-time statistics. - * @param ibf Interleaved Bloom Filter. - * @param queue Shopping cart queue for load balancing between prefiltering and Stellar search. - */ -/* -template -struct iteration_wrapper>>, ibf_t> -{ - static void go(seqan2::StringSet const, seqan2::InfixSegment>> & hostQueries, - seqan2::StringSet const & hostQueryIDs, - search_arguments const & arguments, - search_time_statistics & time_statistics, // IN-OUT parameter - ibf_t const & ibf, - cart_queue>> & queue, - database_segments & segments) - { - using namespace seqan3::literals; - using TSequence = seqan2::String; - std::vector> query_records{}; - raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; - - size_t id{0}; - for (auto const & query : hostQueries) - { - query_records.clear(); - auto start = std::chrono::high_resolution_clock::now(); - auto segments_from_id = [&](database_segments::segment & seg) {return id == seg.seq_ind;}; - for (auto const & seg : segments.members | std::views::filter(segments_from_id)) - { - // convert the seqan2::String to a std::vector - //!TODO: can this copy be avoided? - seqan2::Segment inf = seqan2::infixWithLength(query, seg.start, seg.len); - - std::stringstream seq_str_stream; - for (int i = 0; i < seg.len; i++) - seq_str_stream << inf[i]; - - //!TODO: populate seg_vec - std::vector seg_vec{}; - - // convert the seqan2::String ID to a std::string ID - std::stringstream id_str_stream; - for (auto it = begin(hostQueryIDs[id]); it != end(hostQueryIDs[id]); ++it) - id_str_stream << *it; - - query_records.emplace_back(id_str_stream.str(), seg.start, std::move(seg_vec), &query); - } - auto end = std::chrono::high_resolution_clock::now(); - time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); - - start = std::chrono::high_resolution_clock::now(); - prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); - end = std::chrono::high_resolution_clock::now(); - time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); - - id++; - } - } -}; -*/ - - /** - * @brief Class that sends chunks of queries to the prefilter which then inserts them to the shopping cart queue. - * Partial template specialization (short queries). + * @brief Function that creates short query records from fasta file input and sends them for prefiltering. * + * @tparam ibf_t Interleaved Bloom Filter type. * @param arguments Command line arguments. * @param time_statistics Run-time statistics. - * @param ibf Interleaved Bloom Filter. - * @param queue Shopping cart queue for load balancing between prefiltering and Stellar search. + * @param ibf Interleaved Bloom Filter of the reference database. + * @param queue Shopping cart queue for load balancing between Valik prefiltering and Stellar search. */ template -void read_short_queries(seqan2::StringSet> const & underlyingQueries, - seqan2::StringSet const & underlyingQueryIDs, - std::vector const, seqan2::InfixSegment>> & hostQueries, - search_arguments const & arguments, - search_time_statistics & time_statistics, // IN-OUT parameter - ibf_t const & ibf, - cart_queue>> & queue) +void iterate_short_queries(search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue>> & queue) { using TSequence = seqan2::String; - using fields = seqan3::fields; + using TId = seqan2::CharString; std::vector> query_records{}; raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; + constexpr uint64_t chunk_size = (1ULL << 20) * 10; - /* seqan2::SeqFileIn inSeqs; if (!open(inSeqs, arguments.query_file.c_str())) { - std::cerr << "Failed to open query file." << std::endl; + throw std::runtime_error("Failed to open " + arguments.query_file.string() + " file."); } - std::set uniqueIds; // set of short IDs (cut at first whitespace) + std::set uniqueIds; // set of short IDs (cut at first whitespace) bool idsUnique = true; - seqan2::String queryStr; - std::string fastaID; - uint32_t seqCount{0}; - std::cout << "\niterate_queries\n"; - for (; !seqan2::atEnd(inSeqs); ++seqCount) + TSequence seq; + TId id; + size_t seqCount{0}; + for (; !atEnd(inSeqs); ++seqCount) { - auto start = std::chrono::high_resolution_clock::now(); - seqan2::readRecord(fastaID, queryStr, inSeqs); - - idsUnique &= stellar::_checkUniqueId(uniqueIds, fastaID); - seqan2::appendValue(underlyingQueries, queryStr, seqan2::Generous()); - std::cout << std::addressof(underlyingQueries[seqan2::length(underlyingQueries) - 1]) << - '\t' << underlyingQueries[seqan2::length(underlyingQueries) - 1] << '\n'; - - seqan2::appendValue(underlyingQueryIDs, fastaID, seqan2::Generous()); - seqCount++; - auto end = std::chrono::high_resolution_clock::now(); - time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); - } - */ - + readRecord(id, seq, inSeqs); + idsUnique &= stellar::_checkUniqueId(uniqueIds, id); - for (size_t i{0}; i < seqan2::length(underlyingQueries); i++) - { + auto query_ptr = std::make_shared(seq); std::vector seg_vec{}; - for (auto & c : underlyingQueries[i]) + for (auto & c : *query_ptr) { seqan3::dna4 nuc; nuc.assign_char(c); seg_vec.push_back(nuc); } - query_records.emplace_back(seqan2::toCString(underlyingQueryIDs[i]), std::move(seg_vec), - seqan2::infix(underlyingQueries[i], 0, seqan2::length(underlyingQueries[i]))); + query_records.emplace_back(seqan2::toCString(id), std::move(seg_vec), seqan2::infix(*query_ptr, 0, seqan2::length(*query_ptr)), query_ptr); + + if (query_records.size() > chunk_size) + { + auto start = std::chrono::high_resolution_clock::now(); + prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); + query_records.clear(); + } } + if (!idsUnique) + std::cerr << "WARNING: Non-unique query ids. Output can be ambiguous.\n"; + auto start = std::chrono::high_resolution_clock::now(); prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); auto end = std::chrono::high_resolution_clock::now(); time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); } +/** + * @brief Function that creates split query records from fasta file input and sends them for prefiltering. + * + * @tparam ibf_t Interleaved Bloom Filter type. + * @param arguments Command line arguments. + * @param time_statistics Run-time statistics. + * @param ibf Interleaved Bloom Filter of the reference database. + * @param queue Shopping cart queue for load balancing between Valik prefiltering and Stellar search. + * @param segments Metadata table for split query segments. + */ template -void read_split_queries(seqan2::StringSet> const & underlyingQueries, - seqan2::StringSet const & underlyingQueryIDs, - std::vector const, seqan2::InfixSegment>> & hostQueries, - search_arguments const & arguments, - search_time_statistics & time_statistics, // IN-OUT parameter - ibf_t const & ibf, - cart_queue>> & queue, - database_segments & segments) +void iterate_split_queries(search_arguments const & arguments, + search_time_statistics & time_statistics, // IN-OUT parameter + ibf_t const & ibf, + cart_queue>> & queue, + database_segments & segments) { using TSequence = seqan2::String; - using fields = seqan3::fields; + using TId = seqan2::CharString; std::vector> query_records{}; raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; + constexpr uint64_t chunk_size = (1ULL << 20) * 10; - /* seqan2::SeqFileIn inSeqs; if (!open(inSeqs, arguments.query_file.c_str())) { - std::cerr << "Failed to open query file." << std::endl; + throw std::runtime_error("Failed to open " + arguments.query_file.string() + " file."); } - std::set uniqueIds; // set of short IDs (cut at first whitespace) + std::set uniqueIds; // set of short IDs (cut at first whitespace) bool idsUnique = true; - seqan2::String queryStr; - std::string fastaID; - uint32_t seqCount{0}; - std::cout << "\niterate_queries\n"; - for (; !seqan2::atEnd(inSeqs); ++seqCount) + TSequence seq; + TId id; + size_t seqCount{0}; + for (; !atEnd(inSeqs); ++seqCount) { - auto start = std::chrono::high_resolution_clock::now(); - seqan2::readRecord(fastaID, queryStr, inSeqs); - - idsUnique &= stellar::_checkUniqueId(uniqueIds, fastaID); - seqan2::appendValue(underlyingQueries, queryStr, seqan2::Generous()); - std::cout << std::addressof(underlyingQueries[seqan2::length(underlyingQueries) - 1]) << - '\t' << underlyingQueries[seqan2::length(underlyingQueries) - 1] << '\n'; + readRecord(id, seq, inSeqs); + idsUnique &= stellar::_checkUniqueId(uniqueIds, id); - seqan2::appendValue(underlyingQueryIDs, fastaID, seqan2::Generous()); - seqCount++; - auto end = std::chrono::high_resolution_clock::now(); - time_statistics.reads_io_time += std::chrono::duration_cast>(end - start).count(); - } - */ + auto query_ptr = std::make_shared(seq); - //std::cout << "iterate_queries" << '\n'; - size_t id{0}; - for (auto const & query : underlyingQueries) - { - auto segments_from_id = [&](database_segments::segment & seg) {return id == seg.seq_ind;}; + auto segments_from_id = [&](database_segments::segment & seg) {return seqCount == seg.seq_ind;}; for (auto const & seg : segments.members | std::views::filter(segments_from_id)) { - seqan2::Segment inf = seqan2::infixWithLength(query, seg.start, seg.len); + seqan2::Segment inf = seqan2::infixWithLength(*query_ptr, seg.start, seg.len); std::vector seg_vec{}; for (auto & c : inf) @@ -264,42 +167,27 @@ void read_split_queries(seqan2::StringSet> const & u seg_vec.push_back(nuc); } - query_records.emplace_back(seqan2::toCString(underlyingQueryIDs[id]), std::move(seg_vec), inf); - //std::cout << inf << '\n'; + query_records.emplace_back(seqan2::toCString(id), std::move(seg_vec), inf, query_ptr); + + if (query_records.size() > chunk_size) + { + auto start = std::chrono::high_resolution_clock::now(); + prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); + auto end = std::chrono::high_resolution_clock::now(); + time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); + query_records.clear(); + } } - id++; } + if (!idsUnique) + std::cerr << "WARNING: Non-unique query ids. Output can be ambiguous.\n"; + + auto start = std::chrono::high_resolution_clock::now(); prefilter_queries_parallel>>(ibf, arguments, query_records, thresholder, queue); auto end = std::chrono::high_resolution_clock::now(); time_statistics.prefilter_time += std::chrono::duration_cast>(end - start).count(); } - -/* - -template -struct iteration_wrapper>>, ibf_t> -{ - -}; - -// function calling member function in wrapper class -template -void iterate_queries(querySet_t & hostQueries, - seqan2::StringSet & hostQueryIDs, - search_arguments const & arguments, - search_time_statistics & time_statistics, // IN-OUT parameter - ibf_t const & ibf, - cart_queue_t & queue, - database_segments & segments) -{ - //auto wrapper = iteration_wrapper(); - iteration_wrapper::go(hostQueries, hostQueryIDs, arguments, time_statistics, ibf, queue, segments); -} - -*/ - - } // namespace valik::app diff --git a/include/valik/search/local_prefilter.hpp b/include/valik/search/local_prefilter.hpp index 55e45325..3382a2d3 100644 --- a/include/valik/search/local_prefilter.hpp +++ b/include/valik/search/local_prefilter.hpp @@ -152,7 +152,6 @@ void local_prefilter( for (query_t const & record : records) { std::vector const & seq = record.sequence; -// seqan3::debug_stream << "Prefiltering sequence: \n"; // sequence can't contain local match if it's shorter than pattern length if (seq.size() < arguments.pattern_size) diff --git a/include/valik/search/prefilter_queries_parallel.hpp b/include/valik/search/prefilter_queries_parallel.hpp index e3c2383e..3160276d 100644 --- a/include/valik/search/prefilter_queries_parallel.hpp +++ b/include/valik/search/prefilter_queries_parallel.hpp @@ -21,10 +21,13 @@ namespace valik::app template inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter const & ibf, search_arguments const & arguments, - std::vector & records, + std::vector const & records, raptor::threshold::threshold const & thresholder, cart_queue & queue) { + if (records.empty()) + return; + std::vector tasks; size_t const num_records = records.size(); size_t const records_per_thread = num_records / arguments.threads; @@ -40,14 +43,6 @@ inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter databaseIDMap{databases, databaseIDs}; stellar::DatabaseIDMap reverseDatabaseIDMap{reverseDatabases, databaseIDs}; - seqan2::StringSet underlyingQueries; - seqan2::StringSet underlyingQueryIDs; - - TSize queryLen; - // import underlying query sequences (SPLIT) - stellar::stellar_runtime input_queries_time{}; - bool const queriesSuccess = input_queries_time.measure_time([&]() - { - std::cout << "Finding all local alignments between two genomes...\n"; - return stellar::_importAllSequences(arguments.query_file.c_str(), "query", underlyingQueries, underlyingQueryIDs, queryLen, std::cout, std::cerr); - }); - if (!queriesSuccess) - return false; - time_statistics.reads_io_time += input_queries_time.milliseconds() / 1000; - bool error_in_search = false; // indicates if an error happened inside this lambda auto consumerThreads = std::vector{}; for (size_t threadNbr = 0; threadNbr < arguments.threads; ++threadNbr) @@ -224,12 +209,6 @@ bool search_local(search_arguments const & arguments, search_time_statistics & t thread_meta.text_out << std::endl; // swift filter output is on same line stellar::_printDatabaseIdAndStellarKernelStatistics(threadOptions.verbose, databaseStrand, databaseID, statistics, thread_meta.text_out); - seqan3::debug_stream << "before consolidation\n"; - - for (auto & query_matches : forwardMatches) - seqan3::debug_stream << seqan2::length(query_matches.matches) << '\t'; - seqan3::debug_stream << '\n'; - stellarThreadTime.forward_strand_stellar_time.post_process_eps_matches_time.measure_time([&]() { // forwardMatches is an in-out parameter @@ -238,12 +217,6 @@ bool search_local(search_arguments const & arguments, search_time_statistics & t forwardMatches, disabledQueryIDs); }); // measure_time - seqan3::debug_stream << "after consolidation\n"; - - for (auto & query_matches : forwardMatches) - seqan3::debug_stream << seqan2::length(query_matches.matches) << '\t'; - seqan3::debug_stream << '\n'; - // open output files std::ofstream outputFile(threadOptions.outputFile.c_str(), ::std::ios_base::out); if (!outputFile.is_open()) @@ -348,13 +321,10 @@ bool search_local(search_arguments const & arguments, search_time_statistics & t }); } - std::vector const, seqan2::InfixSegment>> hostQueries; - std::vector hostQueryIDs; - if constexpr (is_split) - read_split_queries(underlyingQueries, underlyingQueryIDs, hostQueries, arguments, time_statistics, index.ibf(), queue, *query_segments); + iterate_split_queries(arguments, time_statistics, index.ibf(), queue, *query_segments); else - read_short_queries(underlyingQueries, underlyingQueryIDs, hostQueries, arguments, time_statistics, index.ibf(), queue); + iterate_short_queries(arguments, time_statistics, index.ibf(), queue); queue.finish(); // Flush carts that are not empty yet consumerThreads.clear(); diff --git a/lib/stellar3 b/lib/stellar3 index 49b06866..d49f821c 160000 --- a/lib/stellar3 +++ b/lib/stellar3 @@ -1 +1 @@ -Subproject commit 49b06866aafdda79fc7e2087f7415c0bc5707979 +Subproject commit d49f821c84120380f3dc1080abc864840584b3c5 diff --git a/src/valik_search.cpp b/src/valik_search.cpp index a0ccfabf..c267c427 100644 --- a/src/valik_search.cpp +++ b/src/valik_search.cpp @@ -12,6 +12,7 @@ void valik_search(search_arguments const & arguments) { search_time_statistics time_statistics{}; + //!TODO: this switch doesn't work, make is_compressed template parameter using index_structure_t = index_structure::ibf; if (arguments.compressed) using index_structure_t = index_structure::ibf_compressed; diff --git a/test/cli/dream_test.cpp b/test/cli/dream_test.cpp index 71c9112e..53237a13 100644 --- a/test/cli/dream_test.cpp +++ b/test/cli/dream_test.cpp @@ -44,7 +44,7 @@ TEST_P(dream_search, shared_mem) "--seg-meta", seg_meta_path); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{"Launching stellar search on a shared memory machine...\nLoaded 3 database sequences.\n"}); - EXPECT_EQ(result.err, std::string{}); + EXPECT_EQ(result.err, std::string{"WARNING: Non-unique query ids. Output can be ambiguous.\n"}); auto distributed = valik::read_stellar_output(search_result_path(number_of_bins, window_size, number_of_errors), reference, std::ios::binary); auto local = valik::read_stellar_output("search.gff", reference); @@ -92,7 +92,7 @@ TEST_F(dream_search, no_matches) "--seg-meta", data("seg_meta150overlap" + std::to_string(number_of_bins) + "bins.txt")); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{"Launching stellar search on a shared memory machine...\nLoaded 3 database sequences.\n"}); - EXPECT_EQ(result.err, std::string{}); + EXPECT_EQ(result.err, std::string{}); // Stellar shortens query IDs until the first whitespace auto actual = string_list_from_file("search.gff"); From 8bc3db29b180bdbcbc19076db5f1a8cbd9a2bcc5 Mon Sep 17 00:00:00 2001 From: Evelin Aasna Date: Mon, 4 Sep 2023 10:25:09 +0200 Subject: [PATCH 11/11] Revert IBF index files --- test/data/dream/16bins13window.ibf | Bin 0 -> 32891 bytes test/data/dream/16bins15window.ibf | Bin 0 -> 32891 bytes test/data/dream/4bins13window.ibf | Bin 0 -> 32891 bytes test/data/dream/4bins15window.ibf | Bin 0 -> 32891 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/test/data/dream/16bins13window.ibf b/test/data/dream/16bins13window.ibf index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..057c65be7d0e85e5afa4d4a99f1a140a9056216c 100644 GIT binary patch literal 32891 zcmaLg52&Blc^~j!jS*3mg>WgCV1@cBQMb9R6IZ>x^j$o^#&!z51!yZ1zvC{r{8yz&CHN`s}AVxHI+RAOFOA zK5^tTj~;n@5JTWBh+j_K0Jq$zPX}hRTd!9S|7>Ab-coRLU%{ns6+C-q!S*{<>DA8_ zynRQ(mESMeeupZpy)~FAP&G_WA1TE2>BV-n$?AgGoAHkQbRpWjd#b+t@eqy$$Berx zt2fU)>s^0m(e2JUuH2S>8SlbU)%)Mg*9wCl?Mn|-Y3q)H^A8kUDsdy5&4v^M&HwUp z;dlOdVEEqo?y6sUsNnQe!Qo2e~e$nlV+&=fVv|SFI3Lg7|g5zC*S^w~kMLath zzgBp^J0DjH91EVkx8U;8-~)DMU9emKtxEg8TX6GSLH3F6Li95`QW$a_!!Nj-+&aDn z_B2-iH{NUHg7^YktMRjLcDeO08pT3^-pRkoG4tOZJArSVhbpZ`U*oe?--@5y_GH!B z2RVQZUt}}#p|?|il>VLkn;*w7mf{ci+xx-vyRYD(_|2WMC-S-{b^OofMZbK1L43}p zt{#2P#gU=fcmK1{jRp69A@u31-8ek^>z_jEt{a zNB`+HTTZ+hKNZZLg4PY5x~fOP=6{_W;1{W{g~7y^*;p;h=L;T>{m+J~``!7ml)}$0 z@-q&?n?QarOMmO^=V4bj8K$AvzFWoJzZKM2aA)`u*NsnZ(=QdB{ZTv_|2F+^DM)U^ zhpSG1Ytc_(xBu$*^xkUYkN6q?uJGh>Ao}9>R};5~#GN+7SS9PAKl_ik3r=5ZJ+BuY zOx&>UjqI;`A6cZ^$bQEz&1YQo)@!QL5dUxX9X-nzP97>ce)LNA+e`nd@ZzPo#V(&- zt@cAdQ_!#dmwsFS5UkQu_|Q*Wp>KN7zAN@)o%A_mzhN)y&s2}zPmLq~s2lH_e^!_# zCj{UTg@T78r`_MG`h$^|{b^(0B0W2$M|oGH8e8dl!FtG%|M4U80rg({>ncrm!xSW6 z>us%LmFU0Wt#>GleOvoH@kPH!yM2gV@&o>LGx@&%Lp8GfnttVzZSwQ1zXERggbaZ8np??S2ww5$=9qvER}7@szl^2Z|dKVcPeq)xa>&$7MIwKy755c(ueUjezPdX$jLcLKl#IfFNSWV;O(CX z#Z=JvcSMi$3R*Y0rT5JF$o^p*`yhaQeW3Y(VLzpp)%%T5P#(DdcNX!+mB-2tZ<7ayT)wIIxRTNsxF;}REAgwBtNs1y zpDRN9e%rTVAM|8B2l)H2IL|n*y10L=JFi(E`)Kj*uIjfpaZa9xp8wLP{@TZ<7Wq&k zNB!u#`3=HBVG-)L^km@=K30$&;9orL_XYBzpF@cw4{5ej)TT-W0tr_PVwo z-O^(g>(*u8LrTkR!{AND>l-XzO zx78JfDwzMpLshp=%A@I1x1;eFb@naqc)K?#z|+8w00;i`#L*2_(t`>&o=+ffX;d1k$LPF^25EqUoEF|K6EbJ zo&8cj^Sh6L2b)~R!k`yNkN%+W_7VN?)9QCuX*1^za&kTc(f5!)41ZiL{Fr{1&KDkE zY?lk96aQ?8<2FujxnLGseEr zJsSIBZ~Wpyo?<$WyO&{4^Y{(9AOBE|C@%jn=RA5epLMIR-Cw17;u*W8M|9+{FZ005 zC)~r~2cPidr=NAQC(n&uyEhb_eLNpM9giR3j~$qo{Fa&ptGeeDQs&w|_T^}2XQe|oWF`_6qul`V} z;Dwy4V8z*^d0)fx2JQIr-a>C*?mfisi~pv2dj9LVfOzoUFIIbh?!FXzaxNF|8XqY% z`D+-eFukK7`ERApey_Z*+MUzTwf&@;+r6M_lkZrVwdj)^+|w<=Kda{=B=@t?U$d)_ zo%q=*(6`?B?D6zWPLJjsJSGo-CwF*uY5Zf~sgnD#ecgTI{%#*X6h04RKd}B|$p?(P zJLlo)uENaeV?fctX|Zm-^XwP+UKe*lPL3_#4xT-J*?m>;;<2ZxlT@vj}+V&Ixp>d zyc*p1w)}l8yg0FPECU3Pzk57!*|`WcdD+jcs31JOw0g*zTy8OP!4_A8wl9lc zueq<$!d}R0`kI!&_z8d4 zI$o}l^MQQj^?M6HqO0t z(6P_czl!1???--(kC2c5X|dmxZ&bbUFP=_16nsAGZvLO<$0Dy$zrf~ChS{S~5MRzo z_6PBazS66`uVlXGKU*!mp3sjgRU%LKN*7|6@?!Mt-nkoI98tHQ(}RAm{%I(tf?Mum zgT{w`Ee?$#2)ONmDy)C9Ah{gaP^)c-OF3h%`#8_K9=_W`HbgZ+MR>gl{j_gcSA80Z0il0-}9S3zJ%H1;xje+(Rz5-l*{tezHgL1FAy^Vq}Ye<-~7tnHuX;jiBJ5O1c~5&e_H&OD!> zkM{m@80zf-pX^(|L%*)&{}0ENY8OZGD=vx0^PJ~eUepbH@v|LbsQO30yohi8_Y{8P zKNoE48mex8Y4#2uxsek-#kr$zt^R(+!#>Z`elsY3is$G~N8jq~PkmSJGt}vWpTM(s z_SIkr+Wa~M-jn;2)h7yJUqeQY_J8LW?eawD@P1x?#NYAJ))~DSuZQO657yt)$vfm_ zo_G2V#Ja>q{zTu6pA5YF$tp}a53pl-Oq+jbrRxRf#6|42+Ef_NEjIqJqqv;oU&mI|}dnPW$LL*9-4gKCjMh?4R4@JvPcJ3?pMc;bCj`kFmkbS+L*wfUnez7OUe58n5z^W=f>_G@zEALB|UDA&0iV+)^{>Zyx&e{*?85enKuo@*&S3`uza^o>MNLFU;wyS^~#{?zQbB zO+WGb3#W>1j9oUn84BruN`#$zTZ`Ml=&pRKCnJ@O#_{hZgcHfF0 zaO$z2)9YvYy`r>#xX(AQ zILJ@rRrBn_P0nMDv+KTs7xNsVeeaNV_fthRoAviu^lyH2{7D`3-n)BXc>YdL#zjw$ z?4Z}5ajbjuuL^@szV2QLz4)NsL-7ut9@)XpF7Kre?c#<0p7X2sa1^f+Z@0DoU199o z+S!G=b+ZR`>vu17=;`XQ9Dnuv!E;J})W&^0rSM1Z?t?y``!eg0KWr{nKlz3C>(I&n z=&zMYsAyLG8W%(mG=wQ{=xlarD(^ih@7dXY|{4?n%Si`4lFU z1>K9lzL@>uVD53@r!Q5>zK?%-z4cq)JnzN6{?6+6mE4=nKTvh|clcjJ8k+V30pR0M!FJ8(|hTJ<|$UO{teuWSI1V5XjEGQ0WXNS&1*7wG>zZmnA z|9X8C_S)}QC4NUv@T++)A)dCsV-Yzz&$RKnf^*LDt<|vm@q*&9{0aY_Q_82;-d6O^ zp^d-HbFMpo;9H#O_gU*O9zMM9y7uw#nf3M0$un*H?zYdr#Ot%44CNFUyVM`#H{vP& z#V>T)+0Q+Zi~MXoazNMgX#jAj!qO8E1^YO1?&k_`JbX1jJhIqd``W)=>?dv-XGr|$ z*LU)G^<&TCMIX1D-|s2)>3TtaC?6s3M%B~5 zl$PjmHPmIXC8SWq75{uN*L3H~R)C#Q_kz_?OH^i6M5NCNF^ z*2B)&|K@)Uy>6XdPj^>?_-LIgIe*EgjgN27)h>1Khl(fkC7v&GaiiADpS;f9TQsM0 zPCWVUs@o5(+jG*j^VQzwzcIx^sqKSPp{*C67oD?Wck;tSUHt9&Hpwg52R3q!BM0~V z?eC-L5ZL$_3&D=)bK_%$N5^l(om1gE!?kl5|9~H|t|o^;OTm^eC2w+%i;6t1{zYM& z580{pqF1NK>C@H1uj$)5TfY?8q4Qhc{?WvtHh&1%h56e&D@AbNfr9MVI>|S~%$9dl ziM)*Sz7G|Cp7=`+^uRvkIS>Du5ewP}tdAbegL+Jya_+Q0JeNG!`kG(Z57LkSHoq8* zT##JdTeQ}(N<-p>e2@Rn=GE>#>)^$z`yJEId7a#>mwwv_E(pP_ep##f^r zs^qz)Pqlc`Vb-@J+t%ecoIvivGQZi>rDTq$hu; zmOsti3rQ_jK)^#sGoz1FfGr zzSyJohR^z+%lBZtJ&9kwJ8F7~9_X><-HH43a%cXop6>|6nQd_o!-1_mb>kt!zdHMB zFb(5M753#^z#quLdhP3=ziYDpTh-rlO>r;7&sHB%DrjGZmA~;Tc(C~cL*wHdd+UgV=Bzt{(t~XyZe#0 zj|Ts8m*4Ns`LxX!I`Pb~?8Lh9&0pDve7E5xx#*sf+jZSViTOG^ibs`I1iQ&pE=d!OAobMFVLy~%g1y5G+yzvc(x0=c*cChy-! z9J4NT#sk-#*N4Ke_vQPmu0Q(s@2Ju__r%E`4xTD}JC~<}`*i)~Q`T?)(+;v9_TKPa zzS8h3Rl;wx%P|z`=RJq@+ykRGkNCu1nm#g&_aC2sG>8E9xQM;>`d#)pc=t%=%f!U09}>6iNBs-;HM*R&&eA;^RJ#S41d)wzZbvZ z!T$cD={w_kUlF}~X7zqPd7ke;ozs@WAG_3l+dB7Di#!NjI}Zgh@8TgpWDlM{JpI?z z;nmzf(I5T%Lhhsedj#$y?ZflzKkonhonLXZJ?|W99(GJF*2g}>egn?yYl&n0iamIb zt?k?Ci7x{=-?VyXzx{VS==cM9oY#+o{&7JyQ+utwf^3R zmnWdt&c255cW!UBwC@+ks*|_z$#sFpQ$?gBsY-U^oDoeq3-=}`w+R|w;#uNy&R3_ zIR`p+rrtw;Uk2WM#-n#}2c0@Qg{Mz-^EbMwTI6Z)<^x~Mb7XpSo+h7Ha*u_N+w+{_ zu@g1E{e@r3w~W7fKMo6ep1p|ue`<+;Fzw&TxC~QJp8ja=bF7bjp|&o1cdoF{P4QRs zo*%3G?$P-0>_Wafu2jp(uNUn99jDf>E7&LGH{^)SJk!{Kk8seLU{-9Q~&3gU6ERwgQHWRMHoiUhJg4CgFIK6?YnQKxQ}FoH?_Wkf^KXY@C}=(8u=+&d z$phbO@hAGsDDCfbuz!BZ4jz0c^L$E=g6xIeA5A=dU-Ec*nq|KyU!!<_hrh2RUiiCD z&fVm&c0N6)f@ku6mOQ}vIGQ^(k?^korzmfZu`LXKnS6@lk5*kjP~OP@*(d(^7d@%Bd}49B8-EXfNJI2=;=!tWugf@X z+}M-v*7(6j_BnB%Jow@9__sXVJr=gkMW5;*I&qpD*}L;PKJ^pVrn?F=o+^@7wM7J^sSJ!i@3TWpSw^wEpp>#RmQ9dn55d-t1hw*lRX> zp29%0Y&lb+tV;%kco-T9K-n_R~l$@*IRPpv*G_~rYP zHhv23A*@sTsr0TbbpW8Ox zSeUlIcLd%~_4j+p3mHG}{ZF2Z&N$*BI|b1V(Gz{Fc|Mf9#CgoVre6>3|MQPji~SuP zdpBRNKl1Z_Bsp7u%h#qFv8{K=48e;l{NBE;?)#Y&(Vu&@$D#CfK47_1_&Tmer(^N5BG=SOdDsY0bbAdO8n4$e48cv{~7B^ zgE-ds4V^kZjPv6+t8_Nc3Cu@N_F?+Or}yRDOS4aUao=gZ+RY2vpVh??e7qR{#W%gU z2lGC|5PvXF?>~K9kuRBdZ_bmO|2vo=a8JRt9jTLN@a7d?`FW^k_BDB{`(^E|->roa z2b;X_D!jPS`)51njcvs1g%O`Yk55&>d*SS|AJ5;H;m4Q0S^a$PN)OHp^ow8dfAdoH z8>5%7G20tEqbK<#`$XsOeyTS(w9@s0_`d$Z!qcBBt=e_o4ys88Mby`N9K^xdWYrX{Dte_0rQy^_39y^lNa z{M`CMa+e304_<$If#-kf?VJ+-w+~pqasB@+@_5^}9-eLqYaoUv2Up y7u)GyzQ&J=>e_GTm+;T!dB9foVf)tFlSScs9&ypQ=d?s)RIkjF)SyNoO+*A!CY!c__v7;fyBHp;X#5 z4r-Ka$`CqDAHsbIB_lGyi1<(t9O7etgdsE{Qt}|B5xGO-gKsLKNPWQ3;bk_g1_g;JLwfArTe)sev!!Z2J;s4(M@23I`|9g*+j~Lt#{mfHOf9mOrUw!uC z-C`8ML5yD&je|QD^mt|%?mQwl;7a)7142f@4yYGHaL-)7fSw+C$>c1qB!n+?jHk0(BcM!=j8zR3G9liatq2~Qe;~Ln@y8-Dx^4|E7?`nS8Ki~sdKg??^$3{367&N<-STrj#il;{_{f8-!>yZP$Ks6mgw{uIU7-hLUx#++<4;q1>F_kIv>_&mpX9Lc*gHm|3Cr)>v6 zlk-3Qd$rYv%6^N1ANE7^R>Be*^tY!^JwSfH#xxa5>@|Vnzd=tCQOulV&hh+a^KK8XAx;lE~Pj{Y5SYG`^z_e3;JubB4^8+9KUYyYD zL7u;;cfESjvv+>^^_D~a;NSNJ;^p)6IgfLZ-_E>|4{htiKz!hnFVH~r-qAh|fBcoe zUetVXJ{*cPYQU=c!+0efQV(_Of9{8p(+~WmA4AYHc9O)~_dW8Mem-pV>%<%3;<}y} zdhIY@&zn8%W=DinyT9uB#g6{Xe?K@~eTygdK-O2r`67~lS>IdsK)v#FH~qq|nGOH* zso-p!4Tyf~t>`a$Cu9#zdp@5}+P+JEvL13iUd2Rct6K;L2Et?(kqXL zVB>d9VKf@j_*+jQ2uQm#?m1fV+7#pa(a+0dyEXmxXIgqCRYHTj@Ov)oiT{1XO>xS3 zsL})$(H+1%vTpO7CHqTX48wbAPkzXk=YA80;F{pY&D*mpo%0vX(I03a_nFK)<0QWZ zn>Wr0{+yZjM-|V@A@k>T8UFbngxwk41F=83w^MJ5Gqw`ui~QZua~l0+U-R=I?YG_* zRGlsG4`Q4n*taKfR{mjwoIHi!+ZT~LEZ-7m42a*&bEau>;(2XPdL`f<@fLgjeIe>U z{>9Dulk}L_5z>7gtD!f0G$45w)G#nl(D^<=4DSc~$CF=>!WeMfc#l54+8O3rp_BcwS>Z-LLy`IeycR6zG5ITmD|Ao~Od` zNXEBHuLy1n3hdn}q_lkM#J=vgOgdmkzo~<}H(hf%J=r&xs}0X;+_Xc#i1T^sb?rD1 zoB5{SU*i4I3Om~C%{Q&mKiYwxuq5`O5Z)UxZg+A+^E9{f19{;5gBADUd43z!5>O;{ z-0QcuBlThOKCFbq>oNLs?H|xU_8E`!k9Dr$JdpbshAW`gzxj7%Py(`UantoLee`O? z#d9god2YYZ>|izc)PW1ycQ2@)S(jy=&!Oh-pJi8PCRmdk$!qB!Kx^I(9t?r+H=JLx znK7{=#N$gIun%zF+x&&r&Z)O9w4BTdfvHW%d|#~kGEs0wX&GLxcJG(Gwt7&L zbCc<1USQy`1S9L&7vvHGTy?GuKozpy@83+3l zjyQina}NW(3U;5bWk_5n{gK?~i~2#kJfA>2WZihadR*Mj*Q?8&rrtA-9eb`opYgKq zQ#W4uH8eU<^NoMB{JHqj$&h6Ki!^~X=o_C5==CSKi|_x)3y*vH$3XDl`^M;yjDz_> z&F2YzodxFoeM`@?lka0o=sTbIhqV9iyLNwYVD&sLYhXSO2qjQBA=&YR@-WPRhH zfEYCXJ$JQo-Ybw_wC{O;UFW^l`tWsB=N!Vk^S-M$Zl5Qohg;Di&qt%a|D;_%Kd{fT zZ({HLhGsoN1AYAP6V|{ke=}gVhl1pVJQs~P?>mzb2>t2J^Q0ZwS3O5@ZuWZY^G>rs zf36_D$cOE~$jjslDLluCo;;TnvrwHf?%?|h@?QQRr~W|WhjruUAAj$Ko$nLG``@1n zjY)pWFx$bAU_CH~NcY1&oZ8C!q(N#RPpE#w#vc$qkbHh!_gU~AGH)Jl@&W&KCd(4g z^MZWL{Y`qZ8t|WaqkOc~NMvxZr9-!qeo}X?! z6cW!D;_G&vx4b8$zM@aOGp{w?)Em(ZILXUma-Ac2erlzM)M%U%G;#FtUHj`+pL}F} zG4JQK&Zsw7y}sgk%<~VsF03Khe59+<2*`KV2m2*T>`(cgv6FATe)|09ywQLlaWeMe zyO1)G95GmLNPT|jm)Ae^dk{O9zhd7{dB0Ei`~4aovL3j@QHg^{Vc)AIEa>77Z4Ai=TjP*Kh6!<&Gh`>>#1iCpLIvOocoY}Sm@6g>BrUs z{YJj4a}?u%?~u3yd+P){^q_&n4?3UYt)KLZdr98dz-BxsNB~QbW<@t=B_JdYr%bUWfX`^LEbP#DN&4CkX!4k9YL8 zkGFFv<92?Xiz9#NWxO#M17ZIBoJ9TDycv3oJL6h{U-W#6Jh;dFJ4yyL)y=1jAu{)o zDn=)7=Jvcne|NQx@m}@4Ff@>RA!yXuY*o!@z#^_Me5s*BSK&JN64`Anil(p--rO5GVNgcjhCUfLnLAG}K)8@zj6zJKCFk--KZeWHL_iSx=}F zH~P!DV_)|)___a|(&7j~A8wvAv`)((ha~HDLDOIjn)xB$(Bk<9ntHKo_ek*3<2zUS z55Fjn-}*Z^0ew8wfn0C%R-e2O!6#1I{G&}qeC+^$!G_Ux{ziWWuu(k~LiR6xn9gAc? zrR8-7$&#UC8sV~TJfxeIN9G%=682g0tlQzWPM`S#t2TeSQ z3vM3%gTOBTOF;4ndBRaxNzSndesY~lJNsk%9P87aa_>W(OnD8=$AjGS7ykY)f(7(C zozg1oVsefU#znpLdFQ*&)@t!R1LJo&^yW>lfZP*$z0Nw=drz)@&s7)FYYoi(vF~yZ zoccmGe=iWmU+w(m>vJ++@3Kwd-Ot+mQI^WXRrB)R7uuHmy>=@d zhJ8Dau6ho}o^v`$>^qj9=5Amg^1QkFLuo3WtV_;;?x)&46Lkx{Nqs6vRzS7y^*`5%=J`PPsEmVo z1$rw*buIn4f@t;)>S0GM+30g__H|0#V;>}M{C$G=n||_KmJK?hczHj#??D}ZzP^yJ z3;CTT_Q{@I=EE2o*?!MpO+NF;dF{gCFJ-RtA9huFKpc4Q(|aCw-}r%lRqv&Jog&Zn zVDzB!I>jZi7U}Sb2kVw~k;ZN(n)g4Mr>b`o`*gG(g5O^K%=q6C{_4#9R?&hUU-D~Y zKBfNY-wI@|^AqiJj-jrw4!QS{`Qp2XWGqLj)*bnS)>ZxvvcNGYyH)F&?{<21h&=T1 z5A$e`aryp0KGy4@$2`*xafKgRi*qhC_Ycs({5=ni6aJ(wO2GGr%v0(M$=`p|enY>2 z=w}>8ILAZAInk=Y!5{HLk37ou3zFq9p9d9>{CihZr#;q%i8C*0mwxo>D)TwqWcel{J`DeW7(a!VyojL;MMRc+H{quXJyTpAA3+i6lUrg)>;hwwY_NjIA1rNkC_lq$k zMi`q_wEQ*!77EZRrKEpZ>3Xe z@?Ej=1Wo20I^h^t=BWfepX7P!$&UI9-{bb!SHq#_JM;9;o*EQz`;?Z%8if59CjEe( zK;LIkxCPQeerE)~xlW~)ag|@*OH#*Juj#cUc7))! ze&o6-tuA%b$JJ^bQW_ug=5pddKi_`HHAEM|F<|yfZ2x^}L&r9zQX7smy z4^F_Gw;D9x#f~qvdQLs5_Y=hBG4hmgy8(0wg!#=nh}>ZfhUc-XWnQ>oO`13YduI8Z zUP6QMPxi|y52IiY)DV2;uAahjKJ$sR7oUtXzhj4iK_7ST{@cuw^t#}5Dz1r?yF-z6 z!+w1AndJ_A|C(sk3FnBh|8(%Vr*SRnpV!4+dz|y|<8g)0zMXly9TI59oBD>#bv~tT zW_?d(DNBABe z`rC^)a^jxxsCaB4=Qj2)+Vk^h`jNTQllyp$zs}qCh!guo#<^8X>IZ)x=xd)$))7)? zLG%8OxVBo0-w|>?b%pV|9NNDd=bj3=$ANjtd8naTw9b)Z--Dc&T%P~G7e$iofIeT$ zKkX9dJwH!b1Ml{pGo?5GdLYC*_tg^o$@)|tl4^{)z2HG<()cO#92(u0xVltA9G1soU+wJ!D z@Bjb)gTFsM^4neT^~Jw`d-(Kl|KsQV^Go=8;qAlYKR#bv!L$GT*O%MvG4}EjUtIWp z;3eN)WiJ1Y;X*!hoQ=O|4$X0W(EQD1aKC-@`ukpUc{$vENSCYbE?A%Ypt)YSbiI(z z9k6pg@3Y@NI`o|uw?`gtZ|CmdcPAgLkM16G9Mbu_r}vr5LAqY(UdRXOu;0;LbUAm~ z-+gk>9MbjavB#XgeCEx`L%xOH!|zTy4t+0kxpH^&yHk%`eK;QuLd-Z!C-^!aePlxtEdvLym{O+>PUEV3L*Bpm@`e5hw z+Q%nvj=O`7L-&|>AD?}Y&m8J0caL26J~bz=AD8bQ_nP}&xH%oy>rNy(nKl9qNH}Ip{m#_R*We`S9ku7l(4b107b^ zZ{FX<+}$`oUNpC_@1jF@%R#+3pPYO7>@mmXAwM5%KV8mus1LVKe|2-ceC_YM`0VE| z>WB5y^}9oldm(?3AL?^wQJ#+5qX&m{IqxxteCD|K*@v6+!|M8>JM?(Besf$O|LSQD zZ?1X1b%(q=p!dq@t-tS(Z%&@>E@&>V*FHW-cPFl2zP<9$J{;=7+hZ>s+6&!f-oET;5(B%H#6-%=JNc)Q`*Q>3+O^z4G0MH%Hei zkMkG3n;(bncUSXt`{fqr*JE$@(2H`=+&l5^ahEv`?bi#ddl!_4bh&c9&^>(g_LcK@ z&&u;dKHmxIgYL5*@bT(2HJ@22ya)4NAM zzjxHvd2{y2+lRYP4z{;BdHwwI?u7h&P+qQ_4)s7i-U013m)BpPxn6S|`VM;QJ8?OE zP|h8=eR%Jem-B8qZccXx)Nfz?=KOm2pgDhibl9Bt>2VLNANNlC%K7v|d*pf-UEbV1 z^3~<&(BAg()Ad68VEg3zZh5^>9_Q0zk3N3)^O-kqUVruGdQZ7Kn$t^%_CtEnT#t7_ zcbK~aUvqiiQ?C1Qe!cF}-yZX#KKsn+`s}0g*}rso^M!m+ue%o4XO4TnK6>vqw+Hg! z-hbL7*Iqe2dg<$4cSAXQrhY!;E=z#BybG@D8_7%=xc|LR8 zJ?^NE>*0s?K)&w5va$0gYx|L@YDIttJ}+`*E{6&R+rOjt{3Oi z1LYQ%FY>q7-uCkAG50>4Pmlg`y6+{Y2lAQYo$H~y3(D7r!}inlLO%Vt?^A9+G?#~+ zm-D#;=d-_e<9tQ;=*40Eeoy=Px`z(y?>>3Ha_@7GUOn!@&7nN52X?Lp^1;=^Zw~2@ z?!7p)$6a#JoFDRcE?1OuM{|A$zLoFymuoMdxjx(;Ip{9P*Z1<kydfP+iFUorlZV!}eAAj}bx5r%mbhq!z=Pr3%FMs#Z^_=eJ(_6GxuY1a?%jq*; z=uW+I-bZ&YfBWR<>kfG+XOEnHknS$Iay{?&P=ksodSo={1LX?Z+WqpZ9lO zU+41OtIv0DkGu61`C3rSSJM5*qUmvaqhkW+p_T!LG9-6xcn#1bu>OOnueDe13 z;l6_$-F`aWK036wd-UpS55FG%a`&3+lgIh&q070m{Wz46Mf=<(&xiBT-LrC? z^TEDj`Mtyb=H=Qa$L}3{a^~Iz`HSZ6>0G~FbNisYJN3%T@!@>><<03h)Wh!%ecelk za?l=}uRVP3D4Ih(?p>Up59f#Wc`vNLoDb@QdhN%dJ}3wIpzm0}IUUy{=T68E>HN_B zP(Q2>x1SI9``b&euX=mkqerhE?=gpZA)g$deK@~8c(H!H?#1QoU-SCqpgg2QdhhZ+ zKJ$hA?tyfBpk6+@ez`TLpL}}P?_v)1`QGj^x7WPfyYxUl*j_nv9MbKj_g#GY`Fe*Q z{=Tn%KDvI~9eU~Z%iBZeH$S1A{kWWd`*C~Z%pqNmUUT|Fc{x6Lci`sy@{kXAKfk;= z4&}?`pWlrQJEM_F&VI^)%P_bS}q-_x`od{@$fWFCWyew|P20&gUKU`s_dLk>khp z$`|Wfem?!~#(f`Lj~whAcNd>NJ@(P%_|0+XJIm>lx0es{>ya~ubjWXR53J7~x*Q#v zw+H95pC9)7(D{9brOQM6?59J$tB(%(^|`0IeskZeJ#yxH>Ghk_?ZNGX-fzDAeD-z^ zZmzFfPvps2oa{W**t{2w}^~t#pniuuaalJ)zdvHFwzM}WL zOCHz5XHM5g#~~kVFP$IiXW1EldI0({_f?|XP-WE+`PT(4!I(~ zdseR;KXjKn+QVnA$DaDl`OWR=oX;M;e7L+me);8>=hMTl2j|1>lY{2^ak%~ts@vBd zyy#xKJl%e1U+!X{Psh4$nmYc_3L*pY+f(FeSCDtZ|+{)9ejBAn?pWa z??QJjzdm>Aq2mj^*B-vUqnzI!b9p-Me)~F?*9-aNAYDH+$L)pm`smR9-epc--`yPQ zgYIA49(VH7as7QCZr+@`&H3AF4&5W~ey9i6hr0vU1L=C~mD6WlefprgI`=MlK6<%4bca23IlX-5{GIce>uC<>$Mwm32cJ2e&mP>IE|1%{bUwci z&hHLAeCF=J<@r13GcTG$d3#}V-Ro|7eR6vG{mkX`7WKjA^wZ_}adUUz=KT6_K3G3r zd#mG+Umxy!;PyiPa(DYZ?BRoSJvh{Z%kjGpn)5@toL=1f^iO$k&`XUB1W%`SJSA>D}+Un)CZUbZCyt>xJgaZ|-h6eyG2GTuu+YyzenD+6z0! zJIC#DH@)cZ%KiNM^>`2D*T)a(P_BLUnU^=u=WhGlV-AFcelKEnd9gC71r`-3V&H9S1NH00eHZznyj zoDZ7YOV@*&_xs30_d>cnuD{&->@jctq?^0nJ@)E@d~$Rg^0|`^^yL_1ed;57HqW+6(pJeC}uuF4z0yi|)5a56(}wU%xpH z-NDb-Ub=qs=JdH2hkE4cdg$&ew_gwBGndC9zd7z+oS(1#_4R&vIaptNdIv7gk3;*Q zyZL=*b9b22o72ya7xlN#{dAmPuen}xcXh7EoWE#(>eUaMqwBYyzEF?vD(4<^y6-{f iE817Rx%SDy&fV#qxIN``dz+&}`}DX2>MP3A<^KTv3y6jQ literal 0 HcmV?d00001 diff --git a/test/data/dream/4bins15window.ibf b/test/data/dream/4bins15window.ibf index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2be7edb54feb5d59dfa3fa93e8d06df7d4d8510b 100644 GIT binary patch literal 32891 zcmb`{YpSHx5`^Ju5pUpS2F}1g4nU{UKkNah19WCy*!8sVLa&#r*{u7&wcukPM=;Ks@JCv?L)5)>!*Ws_Q9b#?ECk< z_==sUUq4;z^yaUA?=6=D^^4}H(|4}-*N6Q2%_*KEJohXvS0Bph-s+IvJL3BARJUj6)kkMBzK@)R^7xldo!)$?5AD%=&4KdxZjqnfJRCN^ zKKnyHyt(}P^sx7;)AOtQez{^x#p_F z&ZAT3hkWMIsoM+JZ(YB+t1bu9ozOk@UHa}}4&;*u)uB0jMLPAxPj!Dka{9i#=fuq` zx`)4W=;^$Rz4W2J96tHWr!NQ6L-kwqj`pzEt-m^X>h@as-lO;N$yqx6&Q)(td3C*C zt~$Scp#Amb>f>~H^>}skkPa_;@9N~LLw@Z-4iEt=DIsz4>0n?qy#&ovXgk z-u7!ApFX5>kDMZZ(H(N+^1DNw-#mSK{S&_GGjHXtzWeO&&Z1m8sO~3+HP;A zbIa-Y*B<7{Zw`O4dHQmoI+Wjc!0l5#4&~tdxV}7nxc1YRQBsEhkWwPTY5fttLq$lnX9iZzera+`Sf9P@$RDzPky?E{vOS#&U^X}`Z!dVhZnt* zy&;`>bn2a>-g)xsU;5VT>pt!CFRb3aYtQQC(&Otr^ozc)9Da5B&ZpOh-bD_byxz$N z`S8vyuTH+YefXMBr;jgm$LhB(7k0k9qC3=AZu83P!+jT=&z=kEyO(|Si}bz+UftsM zhx||-sy9y`dKdTDL%*o+ z9=_`IZ}(U^%{9OEayqDQ4pg@zZMx$5gKb5^b#dq8>R=0W*5Y+a7N zxpeyG(v_>Pdwd7Jr87@nxg7Q8*}r~1s9%)Zx;d@m-pl;X(U*hstK<6iX`ilhTDK3> zmx~vx@Aooy0Wn>h8tmGMV@UOmo>0tG^`|UyB96I~a)2Y`_rw&(Lzi40c+Ap_{IsA0$bWk7C;jrKTRlj|p zT-f~XfkVC%E`8s%y7ukeJI*w>>D0RqKcs{9v=1Flw~!t#U+eyE`0SzH-%b7Y zq0^VA|6fsV{jJ-Z&ilJVy~sz`+@){5_u|d77i`Wg_s-Ru+d8h#k9&vi%jbP?=nnnn z(G}@n`?y@3-oEPgXq_I?7u^lz(Uq&iQ=bm@u6~d1i<`6dln?3n)uFjKG=~q$v;Xqb z)A2!l^NMu#Xn%cYy0w>m=!$&iw{LH#kMlu&Se-hwpFPz34sz(}ybFK(bdV031NqlI z&B2TE%+WvP(m{9QP!6PnbmpsjFMZs;^3?fpeg1`fusZWvw7YAd_2s=+&cgcF-szA)jUw`+)>D|Q#<(gwY$89Td~#s@^~v*2^vyfn*<3mHXkWe9Ipy@-+rDu5_}4z(fv$Oc z@?X7^PmVcI9v#$&>h#^y`_M1sTYYu&aA-bW?48Zg@4o8I!>eyDZaFzt5hkO% zPrf}Mee>1%aLC`jx%%|>#o=w8&b;Q!gZ5Zl9lomPvxoPypLeITKThAeJ>=3?httzR zef$4xm`*iQ>aoD}&Li4Lrr-O2FSbh2O@fW*~e(TNc`;^Op_G~Vne0_C( zNQcwor{6VQQSQpq?;SX7u6U8`#w8z?4UA`P# zjymo;yW>=^-(HZv*gf<+cjc(N)82BRJ`OvF4${G;*XN_d;kvi)S1!Nx=IN{BMS1qZ z>H3cDf%LHT&Nm0@Lpck5=l1QZE`QbOg>>+w>pRk)?v`&)K6QRvzqsbpwclKQ zbL`XaEzkSox481u&2OH0(0=w_etLQI_zBlO%SUe?=)Ko`eYy6Qr_QghZcg?33p=+u zdFGi@ADuc~^YG^Jw~xc>)S+BDd*F24lTW|M58E%N^S#=)mpV?*57q6>2kF{x-JGS9 z2hFWdoer9(?_F`ah5T^o?aprXs@L=clqrDSI+XcUcddS zTfBPj(|Y}#>-Pus<<*bV!B^+fo74AKH?P>fJUaI*t`2(#E~oqPolwq6*S`DsA>Hlo zkd67ei7cT$}`eNmsUIqH1wqO&hfCl{};b#wS2o$m$Jp*_rL9v!T{`_rj+AALxN7rQro zbM4JJKcfPdoP?1hjg8f%coyR2j$YMyF%1&W{(}-96N+$L&wY-}!WW zP@T_xINjR6bw2w*I`ib};|uBB1|{=y4I^-@8ezU-5mTjkG^P*db#>7^1I7^>khh=@15J1xBTYT zf6|-7k3;_6i7y{Noxk6co{rx=a@3*yaL5PM>Ak1AeD#yAdo~B}y>eTp*N5ix4thBY z>Ez*Zad+eNP~AJ(2kIB)LVdYVu0C`Z4%MN0v3FL#`u1|4{p#;Ly!W;)*PQ06+mF6F zxp2+Zm**aQ<=aDF&WYO-^4U+WKBQ}39qPl@aeG2_cjF7alfCio%fIUU-Gg2(UGr9* zuDR;4_nz|XBiEkIYh53j)A{Y=3p+