Skip to content

Commit

Permalink
Preprocess metagenome database
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 14, 2024
1 parent 0127d42 commit 2c4d3bc
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 23 deletions.
78 changes: 58 additions & 20 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,17 @@ struct metadata
*
* All indices and positions are 0-based.
*
* \param id Index of the segment in the vector of segments.
* \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to sequence_stats::ind.
* \param start Segment start position in sequence.
* \param len Segment length.
* \param id Segment id.
* \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to sequence_stats::ind.
* \param bin_seq_inds List of sequences (sequence_stats::ind) for a metagenome database.
* \param start Segment start position in sequence.
* \param len Segment length.
*/
struct segment_stats
{
size_t id;
size_t seq_ind;
std::vector<size_t> bin_seq_inds;
uint64_t start;
uint64_t len;

Expand All @@ -118,6 +120,14 @@ struct metadata
len = l;
}

segment_stats(size_t const i, std::vector<size_t> const inds, uint64_t const s, uint64_t const l)
{
id = i;
bin_seq_inds = inds;
start = s;
len = l;
}

segment_stats(size_t const ind, uint64_t const s, uint64_t const l)
{
seq_ind = ind;
Expand Down Expand Up @@ -165,12 +175,12 @@ struct metadata
*
* @param db_path Path to input file.
*/
void scan_database_file(std::filesystem::path const & db_path)
void scan_database_file(std::vector<std::vector<std::string>> const & bin_path)
{
using traits_type = seqan3::sequence_file_input_default_traits_dna;
seqan3::sequence_file_input<traits_type> fin{db_path};
seqan3::sequence_file_input<traits_type> fin{bin_path[0][0]}; // single input file
total_len = 0;
size_t fasta_ind = 0;
size_t fasta_ind = sequences.size();
for (auto & record : fin)
{
trim_fasta_id(record.id());
Expand All @@ -179,6 +189,7 @@ struct metadata
sequences.push_back(seq);
fasta_ind++;
}
std::stable_sort(sequences.begin(), sequences.end(), length_order());
}

void add_segment(size_t const ind, uint64_t const s, uint64_t const l)
Expand All @@ -193,6 +204,41 @@ struct metadata
segments.push_back(seg);
}

void add_segment(size_t const i, std::vector<size_t> const & bin_seq_ids, uint64_t const s, uint64_t const l)
{
segment_stats seg(i, bin_seq_ids, s, l);
segments.push_back(seg);
}

/**
* @brief Function that scans over a metagenome database to extract sequences and segments.
*
* @param bin_path Path to input file.
*/
void scan_metagenome_bin(std::vector<std::string> const & bin_files)
{
using traits_type = seqan3::sequence_file_input_default_traits_dna;
size_t segment_id = segments.size();
uint64_t bin_len{0};
std::vector<size_t> bin_seq_ids;
for (auto & bin_file : bin_files)
{
seqan3::sequence_file_input<traits_type> fin{bin_file};
size_t fasta_ind = sequences.size();
for (auto & record : fin)
{
trim_fasta_id(record.id());
sequence_stats seq(record.id(), fasta_ind, record.sequence().size());
total_len += seq.len;
bin_len += seq.len;
bin_seq_ids.push_back(fasta_ind);
sequences.push_back(seq);
fasta_ind++;
}
}
add_segment(segment_id, bin_seq_ids, 0, bin_len);
}

/**
* @brief Function that splits the database into n partially overlapping segments.
*
Expand Down Expand Up @@ -343,28 +389,20 @@ struct metadata
*/
metadata(split_arguments const & arguments)
{
for (auto & bin_files : arguments.bin_path)
{
//!TODO: handle case where multiple input files per bin
for (auto & bin_file : bin_files)
{
scan_database_file(bin_file); // for a clustered metagenome database segments == sequences
}
}
seq_count = sequences.size();

if (arguments.metagenome)
{
for (sequence_stats & seq : sequences)
for (auto & bin_files : arguments.bin_path)
{
add_segment(seq.ind, seq.ind, 0u, seq.len);
scan_metagenome_bin(bin_files);
}
}
else
{
std::stable_sort(sequences.begin(), sequences.end(), length_order());
scan_database_file(arguments.bin_path);
scan_database_sequences(arguments);
}

seq_count = sequences.size();
seg_count = segments.size();
}

Expand Down
14 changes: 11 additions & 3 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,16 @@ void run_split(sharg::parser & parser)
{
if (!line.empty())
{
sequence_file_validator(line);
arguments.bin_path.emplace_back(std::vector<std::string>{line});
std::stringstream linestrm(line);
std::string filepath;
std::vector<std::string> bin_paths;
while (std::getline(linestrm, filepath, '\t'))
{
sequence_file_validator(filepath);
bin_paths.emplace_back(filepath);
}

arguments.bin_path.emplace_back(bin_paths);
}
}
}
Expand All @@ -108,7 +116,7 @@ void run_split(sharg::parser & parser)
arguments.meta_out.replace_extension("meta");
}

if (!arguments.split_index && arguments.metagenome)
if (arguments.metagenome)
arguments.split_index = true;

if (!arguments.split_index && !arguments.only_split && !parser.is_option_set("ref-meta"))
Expand Down
55 changes: 55 additions & 0 deletions test/cli/valik_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,61 @@

#include "cli_test.hpp"

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////// valik split preprocess clusters ///////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

struct valik_split_clusters : public valik_base {};

TEST_F(valik_split_clusters, split_metagenome_clusters)
{
{
std::ofstream one_per_bin{"single_seq_bin_paths.txt"};
for (size_t i{0}; i < 8; i++)
{
std::string file_path = cli_test::data("bin_" + std::to_string(i) + ".fasta");
one_per_bin << file_path << '\n';
}

std::ofstream two_per_bin{"multi_seq_bin_paths.txt"};
for (size_t i{0}; i < 8; i=i+2)
{
std::string file_path = cli_test::data("bin_" + std::to_string(i) + ".fasta");
two_per_bin << file_path << '\t';
file_path = cli_test::data("bin_" + std::to_string(i+1) + ".fasta");
two_per_bin << file_path << '\n';
}
}

cli_test_result const result_one_per_bin = execute_app("valik", "split",
"--metagenome",
"--split-index",
"--out single_seq_meta.bin",
"single_seq_bin_paths.txt");
EXPECT_EQ(result_one_per_bin.exit_code, 0);
EXPECT_EQ(result_one_per_bin.out, std::string{});
EXPECT_EQ(result_one_per_bin.err, std::string{});

auto one_per_bin_meta = valik::metadata("single_seq_meta.bin");
EXPECT_EQ(one_per_bin_meta.seq_count, 16);
EXPECT_EQ(one_per_bin_meta.seg_count, 8);

cli_test_result const result_two_per_bin = execute_app("valik", "split",
"--metagenome",
"--split-index",
"--out multi_seq_meta.bin",
"multi_seq_bin_paths.txt");
EXPECT_EQ(result_two_per_bin.exit_code, 0);
EXPECT_EQ(result_two_per_bin.out, std::string{});
EXPECT_EQ(result_two_per_bin.err, std::string{});

auto two_per_bin_meta = valik::metadata("multi_seq_meta.bin");
EXPECT_EQ(two_per_bin_meta.seq_count, 16);
EXPECT_EQ(two_per_bin_meta.seg_count, 4);

EXPECT_EQ(one_per_bin_meta.total_len, two_per_bin_meta.total_len)
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////// valik split index bins /////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down

0 comments on commit 2c4d3bc

Please sign in to comment.