diff --git a/include/valik/split/metadata.hpp b/include/valik/split/metadata.hpp index b2d410d9..00549873 100644 --- a/include/valik/split/metadata.hpp +++ b/include/valik/split/metadata.hpp @@ -91,15 +91,17 @@ struct metadata * * All indices and positions are 0-based. * - * \param id Index of the segment in the vector of segments. - * \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to sequence_stats::ind. - * \param start Segment start position in sequence. - * \param len Segment length. + * \param id Segment id. + * \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to sequence_stats::ind. + * \param bin_seq_inds List of sequences (sequence_stats::ind) for a metagenome database. + * \param start Segment start position in sequence. + * \param len Segment length. */ struct segment_stats { size_t id; size_t seq_ind; + std::vector bin_seq_inds; uint64_t start; uint64_t len; @@ -118,6 +120,14 @@ struct metadata len = l; } + segment_stats(size_t const i, std::vector const inds, uint64_t const s, uint64_t const l) + { + id = i; + bin_seq_inds = inds; + start = s; + len = l; + } + segment_stats(size_t const ind, uint64_t const s, uint64_t const l) { seq_ind = ind; @@ -165,12 +175,12 @@ struct metadata * * @param db_path Path to input file. */ - void scan_database_file(std::filesystem::path const & db_path) + void scan_database_file(std::vector> const & bin_path) { using traits_type = seqan3::sequence_file_input_default_traits_dna; - seqan3::sequence_file_input fin{db_path}; + seqan3::sequence_file_input fin{bin_path[0][0]}; // single input file total_len = 0; - size_t fasta_ind = 0; + size_t fasta_ind = sequences.size(); for (auto & record : fin) { trim_fasta_id(record.id()); @@ -179,6 +189,7 @@ struct metadata sequences.push_back(seq); fasta_ind++; } + std::stable_sort(sequences.begin(), sequences.end(), length_order()); } void add_segment(size_t const ind, uint64_t const s, uint64_t const l) @@ -193,6 +204,41 @@ struct metadata segments.push_back(seg); } + void add_segment(size_t const i, std::vector const & bin_seq_ids, uint64_t const s, uint64_t const l) + { + segment_stats seg(i, bin_seq_ids, s, l); + segments.push_back(seg); + } + + /** + * @brief Function that scans over a metagenome database to extract sequences and segments. + * + * @param bin_path Path to input file. + */ + void scan_metagenome_bin(std::vector const & bin_files) + { + using traits_type = seqan3::sequence_file_input_default_traits_dna; + size_t segment_id = segments.size(); + uint64_t bin_len{0}; + std::vector bin_seq_ids; + for (auto & bin_file : bin_files) + { + seqan3::sequence_file_input fin{bin_file}; + size_t fasta_ind = sequences.size(); + for (auto & record : fin) + { + trim_fasta_id(record.id()); + sequence_stats seq(record.id(), fasta_ind, record.sequence().size()); + total_len += seq.len; + bin_len += seq.len; + bin_seq_ids.push_back(fasta_ind); + sequences.push_back(seq); + fasta_ind++; + } + } + add_segment(segment_id, bin_seq_ids, 0, bin_len); + } + /** * @brief Function that splits the database into n partially overlapping segments. * @@ -343,28 +389,20 @@ struct metadata */ metadata(split_arguments const & arguments) { - for (auto & bin_files : arguments.bin_path) - { - //!TODO: handle case where multiple input files per bin - for (auto & bin_file : bin_files) - { - scan_database_file(bin_file); // for a clustered metagenome database segments == sequences - } - } - seq_count = sequences.size(); - if (arguments.metagenome) { - for (sequence_stats & seq : sequences) + for (auto & bin_files : arguments.bin_path) { - add_segment(seq.ind, seq.ind, 0u, seq.len); + scan_metagenome_bin(bin_files); } } else { - std::stable_sort(sequences.begin(), sequences.end(), length_order()); + scan_database_file(arguments.bin_path); scan_database_sequences(arguments); } + + seq_count = sequences.size(); seg_count = segments.size(); } diff --git a/src/argument_parsing/split.cpp b/src/argument_parsing/split.cpp index 4df9fd5a..dad28888 100644 --- a/src/argument_parsing/split.cpp +++ b/src/argument_parsing/split.cpp @@ -90,8 +90,16 @@ void run_split(sharg::parser & parser) { if (!line.empty()) { - sequence_file_validator(line); - arguments.bin_path.emplace_back(std::vector{line}); + std::stringstream linestrm(line); + std::string filepath; + std::vector bin_paths; + while (std::getline(linestrm, filepath, '\t')) + { + sequence_file_validator(filepath); + bin_paths.emplace_back(filepath); + } + + arguments.bin_path.emplace_back(bin_paths); } } } @@ -108,7 +116,7 @@ void run_split(sharg::parser & parser) arguments.meta_out.replace_extension("meta"); } - if (!arguments.split_index && arguments.metagenome) + if (arguments.metagenome) arguments.split_index = true; if (!arguments.split_index && !arguments.only_split && !parser.is_option_set("ref-meta")) diff --git a/test/cli/valik_test.cpp b/test/cli/valik_test.cpp index b63c1dd8..7db67eb2 100644 --- a/test/cli/valik_test.cpp +++ b/test/cli/valik_test.cpp @@ -6,6 +6,61 @@ #include "cli_test.hpp" +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////// valik split preprocess clusters /////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +struct valik_split_clusters : public valik_base {}; + +TEST_F(valik_split_clusters, split_metagenome_clusters) +{ + { + std::ofstream one_per_bin{"single_seq_bin_paths.txt"}; + for (size_t i{0}; i < 8; i++) + { + std::string file_path = cli_test::data("bin_" + std::to_string(i) + ".fasta"); + one_per_bin << file_path << '\n'; + } + + std::ofstream two_per_bin{"multi_seq_bin_paths.txt"}; + for (size_t i{0}; i < 8; i=i+2) + { + std::string file_path = cli_test::data("bin_" + std::to_string(i) + ".fasta"); + two_per_bin << file_path << '\t'; + file_path = cli_test::data("bin_" + std::to_string(i+1) + ".fasta"); + two_per_bin << file_path << '\n'; + } + } + + cli_test_result const result_one_per_bin = execute_app("valik", "split", + "--metagenome", + "--split-index", + "--out single_seq_meta.bin", + "single_seq_bin_paths.txt"); + EXPECT_EQ(result_one_per_bin.exit_code, 0); + EXPECT_EQ(result_one_per_bin.out, std::string{}); + EXPECT_EQ(result_one_per_bin.err, std::string{}); + + auto one_per_bin_meta = valik::metadata("single_seq_meta.bin"); + EXPECT_EQ(one_per_bin_meta.seq_count, 16); + EXPECT_EQ(one_per_bin_meta.seg_count, 8); + + cli_test_result const result_two_per_bin = execute_app("valik", "split", + "--metagenome", + "--split-index", + "--out multi_seq_meta.bin", + "multi_seq_bin_paths.txt"); + EXPECT_EQ(result_two_per_bin.exit_code, 0); + EXPECT_EQ(result_two_per_bin.out, std::string{}); + EXPECT_EQ(result_two_per_bin.err, std::string{}); + + auto two_per_bin_meta = valik::metadata("multi_seq_meta.bin"); + EXPECT_EQ(two_per_bin_meta.seq_count, 16); + EXPECT_EQ(two_per_bin_meta.seg_count, 4); + + EXPECT_EQ(one_per_bin_meta.total_len, two_per_bin_meta.total_len) +} + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////// valik split index bins ///////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////