Skip to content

Commit

Permalink
Metagenome metadata struct
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 15, 2024
1 parent 2269934 commit 4877d5c
Show file tree
Hide file tree
Showing 48 changed files with 197 additions and 1,391 deletions.
2 changes: 1 addition & 1 deletion include/valik/search/search_distributed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ bool search_distributed(search_arguments const & arguments, search_time_statisti
auto seg = ref_meta->segment_from_bin(bin_id);
process_args.insert(process_args.end(), {index.bin_path()[0][0], std::string(cart_queries_path),
"--referenceLength", std::to_string(ref_len),
"--sequenceOfInterest", std::to_string(seg.seq_ind),
"--sequenceOfInterest", std::to_string(seg.seq_vec[0]),
"--segmentBegin", std::to_string(seg.start),
"--segmentEnd", std::to_string(seg.start + seg.len)});
}
Expand Down
3 changes: 2 additions & 1 deletion include/valik/search/search_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ bool search_local(search_arguments const & arguments, search_time_statistics & t
threadOptions.referenceLength = refLen;
threadOptions.searchSegment = true;
auto seg = ref_meta.segment_from_bin(bin_id);
threadOptions.binSequences.emplace_back(seg.seq_ind);
//!TODO: deal with metagenome database
threadOptions.binSequences.emplace_back(seg.seq_vec[0]);
threadOptions.segmentBegin = seg.start;
threadOptions.segmentEnd = seg.start + seg.len;

Expand Down
2 changes: 1 addition & 1 deletion include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ struct split_arguments
{
std::vector<std::vector<std::string>> bin_path{};
std::filesystem::path db_file{};
std::filesystem::path meta_out{"metadata.txt"};
std::filesystem::path meta_out{"metadata.bin"};

size_t pattern_size{150};
uint32_t seg_count{64};
Expand Down
160 changes: 89 additions & 71 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <utilities/threshold/param_set.hpp>
#include <valik/shared.hpp>

#include <algorithm>
#include <iostream>
#include <fstream>
#include <ranges>
Expand Down Expand Up @@ -46,36 +47,58 @@ void trim_fasta_id(id_t & id)
*/
struct metadata
{
/** !\brief a metadata struct that represents a sequence file.
*
* \param id Numerical file id.
* \param path Input fasta file path.
*/
struct sequence_file
{
size_t id;
std::string path;

sequence_file() noexcept = default;
sequence_file(sequence_file const &) noexcept = default;
sequence_file & operator=(sequence_file const &) noexcept = default;
sequence_file & operator=(sequence_file &&) noexcept = default;
~sequence_file() noexcept = default;

sequence_file(size_t const i, std::string const p) : id(i), path(p) { }

template <class Archive>
void serialize(Archive & archive)
{
archive(id, path);
}
};

/** !\brief a metadata struct that represents a single sequence.
*
* \param id The FASTA id.
* \param ind 0-based index in the input FASTA file.
* \param len Sequence length.
* \param file_id Numerical file id (sequence_file::id).
* \param id The FASTA id.
* \param ind 0-based index in the input FASTA file.
* \param len Sequence length.
*/
struct sequence_stats
{
size_t file_id;
std::string id;
size_t ind;
uint64_t len;

sequence_stats() noexcept = default;
sequence_stats(sequence_stats const &) noexcept = default;
sequence_stats(sequence_stats &&) noexcept = default;
sequence_stats & operator=(sequence_stats const &) noexcept = default;
sequence_stats & operator=(sequence_stats &&) noexcept = default;
~sequence_stats() noexcept = default;

sequence_stats(std::string const fasta_id, size_t const fasta_ind, uint64_t const seq_length)
{
id = fasta_id;
ind = fasta_ind;
len = seq_length;
}
sequence_stats(size_t const seq_file_id, std::string const fasta_id, size_t const fasta_ind, uint64_t const seq_length) :
file_id(seq_file_id), id(fasta_id), ind(fasta_ind), len(seq_length) {}

template <class Archive>
void serialize(Archive & archive)
{
archive(id, ind, len);
archive(file_id, id, ind, len);
}
};

Expand All @@ -87,63 +110,56 @@ struct metadata
}
};

/** !\brief a struct that represents a single segment.
/** !\brief a struct that represents a single segment of a reference or query database.
*
* All indices and positions are 0-based.
*
* \param id Segment id.
* \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to sequence_stats::ind.
* \param bin_seq_inds List of sequences (sequence_stats::ind) for a metagenome database.
* \param start Segment start position in sequence.
* \param id Numerical segment id.
* \param seq_vec List of sequences (numerical ids corresponding to sequence_stats::ind) associated with this segment.
* \param start Segment start position in sequence if segment consists of a single subsequence. 0 for metagenome bin.
* \param len Segment length.
*/
struct segment_stats
{
size_t id;
size_t seq_ind;
std::vector<size_t> bin_seq_inds;
uint64_t start;
std::vector<size_t> seq_vec{};
uint64_t start{0};
uint64_t len;

segment_stats() noexcept = default;
segment_stats(segment_stats const &) noexcept = default;
segment_stats(segment_stats &&) noexcept = default;
segment_stats & operator=(segment_stats const &) noexcept = default;
segment_stats & operator=(segment_stats &&) noexcept = default;
~segment_stats() noexcept = default;

segment_stats(size_t const i, size_t const ind, uint64_t const s, uint64_t const l)
segment_stats(size_t const ind, uint64_t const s, uint64_t const l) : start(s), len(l)
{
id = i;
seq_ind = ind;
start = s;
len = l;
seq_vec.push_back(ind);
}

segment_stats(size_t const i, std::vector<size_t> const inds, uint64_t const s, uint64_t const l)
{
id = i;
bin_seq_inds = inds;
start = s;
len = l;
}

segment_stats(size_t const ind, uint64_t const s, uint64_t const l)
{
seq_ind = ind;
start = s;
len = l;
segment_stats(size_t const & i, std::vector<size_t> & ind_vec, uint64_t const l) : id(i), len(l)
{
seq_vec = std::move(ind_vec);
}

std::string unique_id()
{
return std::to_string(seq_ind) + "_" + std::to_string(start) + "_" + std::to_string(len);
std::string str_id{};
for (auto seq_ind : seq_vec)
{
str_id += std::to_string(seq_ind);
str_id += "_";
}
str_id += std::to_string(start);
str_id += "_";
str_id += std::to_string(len);
return str_id;
}

template <class Archive>
void serialize(Archive & archive)
{
archive(id, seq_ind, start, len);
archive(id, seq_vec, start, len);
}
};

Expand All @@ -156,14 +172,18 @@ struct metadata

inline bool operator() (segment_stats const & left, segment_stats const & right)
{
return (left.seq_ind < right.seq_ind);
if (left.seq_vec.size() > 1 || right.seq_vec.size() > 1)
throw std::runtime_error("Can't order sets of sets of sequences.");
return (left.seq_vec[0] < right.seq_vec[0]);
}
};

uint64_t total_len{0};
size_t seq_count;
size_t seg_count;
size_t pattern_size;

std::vector<sequence_file> files;
std::vector<sequence_stats> sequences;

private:
Expand All @@ -178,35 +198,28 @@ struct metadata
void scan_database_file(std::vector<std::vector<std::string>> const & bin_path)
{
using traits_type = seqan3::sequence_file_input_default_traits_dna;
files.emplace_back(0, bin_path[0][0]);
seqan3::sequence_file_input<traits_type> fin{bin_path[0][0]}; // single input file
size_t fasta_ind = sequences.size();
for (auto & record : fin)
{
trim_fasta_id(record.id());
sequence_stats seq(record.id(), fasta_ind, record.sequence().size());
sequence_stats seq(0, record.id(), fasta_ind, record.sequence().size()); // there is only a single sequence file
total_len += seq.len;
sequences.push_back(seq);
fasta_ind++;
}
std::stable_sort(sequences.begin(), sequences.end(), length_order());
}

void add_segment(size_t const ind, uint64_t const s, uint64_t const l)
void add_segment(size_t const seq_id, uint64_t const s, uint64_t const l)
{
segment_stats seg(ind, s, l);
segments.push_back(seg);
segments.emplace_back(seq_id, s, l);
}

void add_segment(size_t const i, size_t const ind, uint64_t const s, uint64_t const l)
void add_segment(size_t const i, std::vector<size_t> & bin_seq_ids, uint64_t const l)
{
segment_stats seg(i, ind, s, l);
segments.push_back(seg);
}

void add_segment(size_t const i, std::vector<size_t> const & bin_seq_ids, uint64_t const s, uint64_t const l)
{
segment_stats seg(i, bin_seq_ids, s, l);
segments.push_back(seg);
segments.emplace_back(i, bin_seq_ids, l);
}

/**
Expand All @@ -217,25 +230,26 @@ struct metadata
void scan_metagenome_bin(std::vector<std::string> const & bin_files)
{
using traits_type = seqan3::sequence_file_input_default_traits_dna;
size_t segment_id = segments.size();
uint64_t bin_len{0};
std::vector<size_t> bin_seq_ids;
for (auto & bin_file : bin_files)
for (size_t file_id{0}; file_id < bin_files.size(); file_id++)
{
std::string bin_file = bin_files[file_id];
files.emplace_back(file_id, bin_file);
seqan3::sequence_file_input<traits_type> fin{bin_file};
size_t fasta_ind = sequences.size();
for (auto & record : fin)
{
trim_fasta_id(record.id());
sequence_stats seq(record.id(), fasta_ind, record.sequence().size());
sequence_stats seq(file_id, record.id(), fasta_ind, record.sequence().size());
total_len += seq.len;
bin_len += seq.len;
bin_seq_ids.push_back(fasta_ind);
sequences.push_back(seq);
fasta_ind++;
}
}
add_segment(segment_id, bin_seq_ids, 0, bin_len);
add_segment(segments.size(), bin_seq_ids, bin_len);
}

/**
Expand Down Expand Up @@ -403,6 +417,7 @@ struct metadata

seq_count = sequences.size();
seg_count = segments.size();
pattern_size = arguments.pattern_size;
}

/**
Expand Down Expand Up @@ -451,7 +466,10 @@ struct metadata
if (sequences.size() <= ind)
throw std::runtime_error{"Sequence " + std::to_string(ind) + " index out of range."};

return segments | std::views::filter([ind](segment_stats const & seg) {return ind == seg.seq_ind;});
return segments | std::views::filter([ind](segment_stats const & seg)
{
return (std::find(seg.seq_vec.begin(), seg.seq_vec.end(), ind) != seg.seq_vec.end());
});
}

/**
Expand All @@ -463,7 +481,7 @@ struct metadata
{
std::ofstream os(filepath, std::ios::binary);
cereal::BinaryOutputArchive archive(os);
archive(total_len, sequences, segments);
archive(total_len, pattern_size, files, sequences, segments);
}

/**
Expand All @@ -475,7 +493,7 @@ struct metadata
{
std::ifstream is(filepath, std::ios::binary);
cereal::BinaryInputArchive archive(is);
archive(total_len, sequences, segments);
archive(total_len, pattern_size, files, sequences, segments);
seq_count = sequences.size();
seg_count = segments.size();
}
Expand All @@ -488,8 +506,14 @@ struct metadata

out_str << "$\n";

for (const auto & seg : segments)
out_str << seg.id << '\t' << seg.seq_ind << '\t' << seg.start << '\t' << seg.len << '\n';
for (size_t seg_id{0}; seg_id < segments.size(); seg_id++)
{
segment_stats seg = segments[seg_id];
out_str << seg_id << '\t';
for (size_t ind : seg.seq_vec)
out_str << ind << '\t';
out_str << seg.start << '\t' << seg.len << '\n';
}

out_str << "$\n";

Expand Down Expand Up @@ -521,12 +545,6 @@ struct metadata
return stdev / mean;
}

size_t segment_overlap() const
{
assert(segments.size() > 1);
return segments[0].len - segments[1].start;
}

/**
* @brief Probability of a k-mer appearing spuriously in a bin.
*/
Expand Down Expand Up @@ -564,7 +582,7 @@ struct metadata
the probability of 1 k-mer out of n matching is P(1 k-mer matches) = (n take 1) * p^1 * (1 - p)^(n - 1).
*/

size_t kmers_per_pattern = segment_overlap() - params.k + 1;
size_t kmers_per_pattern = pattern_size - params.k + 1;
for (uint8_t matching_kmer_count{0}; matching_kmer_count < params.t; matching_kmer_count++)
{
fpr -= combinations(matching_kmer_count, kmers_per_pattern) *
Expand All @@ -584,7 +602,7 @@ struct metadata
if (pattern_p < 9e-6) // avoid very small floating point numbers
return 100000;
size_t max_patterns_per_segment = std::round(1.0 / pattern_p) - 1;
return segment_overlap() + query_every * (std::max(max_patterns_per_segment, (size_t) 2) - 1);
return pattern_size + query_every * (std::max(max_patterns_per_segment, (size_t) 2) - 1);
}
};

Expand Down
2 changes: 1 addition & 1 deletion src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ void run_split(sharg::parser & parser)
if (!parser.is_option_set("out"))
{
arguments.meta_out = arguments.db_file;
arguments.meta_out.replace_extension("meta");
arguments.meta_out.replace_extension("bin");
}

if (arguments.metagenome)
Expand Down
4 changes: 2 additions & 2 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ void find_thresholds_for_kmer_size(metadata const & ref_meta,
std::cout << "error_rate\tthreshold_kind\tthreshold\tFNR\tFP_per_pattern\tmax_query_seg_len\n";

auto best_params = param_set(attr.k, space.max_thresh, space);
for (uint8_t errors{1}; errors <= std::ceil(ref_meta.segment_overlap() * max_err); errors++)
for (uint8_t errors{1}; errors <= std::ceil(ref_meta.pattern_size * max_err) && errors <= space.max_errors; errors++)
{
search_pattern pattern(errors, ref_meta.segment_overlap());
search_pattern pattern(errors, ref_meta.pattern_size);
std::cout << errors / (double) pattern.l << '\t';
if (kmer_lemma_threshold(pattern.l, attr.k, errors) > 1)
{
Expand Down
Loading

0 comments on commit 4877d5c

Please sign in to comment.