Skip to content

Commit

Permalink
Join metadata tables into a single file
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Sep 13, 2023
1 parent 88f91b3 commit 2b12f33
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 38 deletions.
5 changes: 2 additions & 3 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna

struct split_arguments
{
std::filesystem::path db_file{};
std::filesystem::path db_out{"reference_metadata.txt"};
std::filesystem::path seg_out{"reference_segments.txt"};
std::filesystem::path seq_file{};
std::filesystem::path meta_out{"metadata.txt"};

size_t overlap{150};
size_t seg_count{64};
Expand Down
264 changes: 264 additions & 0 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
#pragma once

#include <seqan3/argument_parser/exceptions.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/io/sequence_file/all.hpp>

#include <iostream>
#include <fstream>

namespace valik
{

/**
* @brief Struct that stores the metadata for a split database.
* \param total_len Total database length.
* \param sequences Collection of database sequences.
* \param default_seg_len Default length of a segment that is dynamically updated.
* \param segments Collection of database segments.
*/
struct metadata
{
/** !\brief a metadata struct that represents a single sequence.
*
* \param id The FASTA id.
* \param ind 0-based index in the input FASTA file.
* \param len Sequence length.
*/
struct sequence_stats
{
std::string id;
size_t ind;
size_t len;

sequence_stats(std::string const fasta_id, size_t const fasta_ind, size_t const seq_length)
{
id = fasta_id;
ind = fasta_ind;
len = seq_length;
}
};

/** !\brief a struct that represents a single segment.
*
* All indices and positions are 0-based.
*
* \param id Index of the segment in the vector of segments.
* \param seq_ind Index of the underlying sequence in the input FASTA file. Corresponds to sequence_stats::id.
* \param start Segment start position in sequence.
* \param len Segment length.
*/
struct segment_stats
{
size_t id;
size_t seq_ind;
size_t start;
size_t len;

segment_stats(size_t i, size_t ind, size_t s, size_t l)
{
id = i;
seq_ind = ind;
start = s;
len = l;
}

std::string unique_id()
{
return std::to_string(seq_ind) + "_" + std::to_string(start) + "_" + std::to_string(len);
}
};

uint64_t total_len;
std::vector<sequence_stats> sequences;
size_t default_seg_len;
std::vector<segment_stats> segments;

/**
* @brief Function that scans over a sequence file to extract metadata.
*
* @param db_path Path to input file.
*/
void scan_database_file(std::filesystem::path const & db_path)
{
using traits_type = seqan3::sequence_file_input_default_traits_dna;
seqan3::sequence_file_input<traits_type> fin{db_path};
total_len = 0;
size_t fasta_ind = 0;
for (auto & record : fin)
{
sequence_stats seq(record.id(), fasta_ind, record.sequence().size());
total_len += seq.len;
sequences.push_back(seq);
fasta_ind++;
}
}

void add_segment(size_t const i, size_t const ind, size_t const s, size_t const l)
{
segment_stats seg(i, ind, s, l);
segments.push_back(seg);
}

/**
* @brief Function that scans over the collection of sequences and greedily assigns segment positions.
*
* @param seg_count Number of segments.
* @param overlap Length of overlap between adjacent segments.
*/
void scan_database_sequences(size_t const seg_count, size_t const overlap)
{
assert(seg_count > 0);
default_seg_len = total_len / seg_count + 1;
assert(default_seg_len > overlap);

size_t remaining_db_len = total_len;
for (const auto & seq : sequences)
{
size_t start = 0;
if (seq.len < (default_seg_len / 2))
{
seqan3::debug_stream << "Sequence: " << seq.id << " is too short and will be skipped.\n";
remaining_db_len -= seq.len;
}
else if ((seq.len >= default_seg_len / 2) & (seq.len <= default_seg_len * 1.5))
{
// database sequence is single segment
add_segment(segments.size(), seq.ind, start, seq.len);
remaining_db_len -= seq.len;
}
else
{
// sequences that are contained in a single segment might not have the exact segment length
// dynamically update segment length to divide the rest of the remaining database as equally as possible among the chosen number of segments
size_t remaining_seg_count = seg_count - segments.size();
size_t updated_seg_len = remaining_db_len / remaining_seg_count;

size_t segments_per_seq = std::round( (double) seq.len / (double) updated_seg_len);
size_t actual_seg_len = seq.len / segments_per_seq + overlap + 1; // + 1 because integer division always rounded down

// divide database sequence into multiple segments
add_segment(segments.size(), seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
while (start + actual_seg_len - overlap < seq.len)
{
add_segment(segments.size(), seq.ind, start, actual_seg_len);
start = start + actual_seg_len - overlap;
}
add_segment(segments.size(), seq.ind, start, seq.len - start);

remaining_db_len -= seq.len;
}
}
assert(segments.size() == seg_count);
}

/**
* @brief Constructor that scans a sequence database to create a metadata struct.
*/
metadata(split_arguments const & arguments)
{
scan_database_file(arguments.seq_file);
scan_database_sequences(arguments.seg_count, arguments.overlap);
}

/**
* @brief Constructor that deserializes a metadata struct from file.
*/
metadata(std::filesystem::path const & filepath)
{
std::ifstream in_file(filepath);
if (in_file.is_open())
{
std::string seq_meta;
std::getline(in_file, seq_meta, '$');
std::stringstream seq_str(seq_meta);

std::string seq_id;
size_t fasta_ind, length;
total_len = 0;
while (seq_str >> seq_id)
{
seq_str >> fasta_ind;
seq_str >> length;
total_len += length;
sequences.push_back(sequence_stats(seq_id, fasta_ind, length));
}

std::string seg_meta;
std::getline(in_file, seg_meta); // newline
std::getline(in_file, seg_meta, '$');
std::stringstream seg_str(seg_meta);

size_t id, seq_ind, start;
while (seg_str >> id)
{
seg_str >> seq_ind;
seg_str >> start;
seg_str >> length;

add_segment(id, seq_ind, start, length);
}
}

in_file.close();
}

inline size_t ind_from_id(std::string const & string_id) const
{
auto it = std::find_if(sequences.begin(), sequences.end(), [&](const sequence_stats& seq) { return seq.id == string_id;});
if (it == sequences.end())
throw seqan3::validation_error{"Sequence metadata does not contain sequence from Stellar output."};
else
return (*it).ind;
}


//!TODO: isn't this just segments[id]???
/**
* @brief Function that find the segment corresponding to a numerical ID.
*
* @param id
* @return segment
*/
segment_stats segment_from_bin(size_t const id) const
{
/*
for (auto & seg : segments)
{
if (seg.id == id)
return seg;
}
*/

if (segments.size() < (id - 1))
throw std::runtime_error{"Could not find segment"};

return segments[id];
}

/**
* @brief Function that serializes the metadata struct.
*
* @param filepath Output file path.
*/
void to_file(std::filesystem::path const & filepath)
{
std::ofstream out_file;
out_file.open(filepath);

for (sequence_stats const & seq : sequences)
out_file << seq.id << '\t' << seq.ind << '\t' << seq.len << '\n';

out_file << "$\n";

for (const auto & seg : segments)
out_file << seg.id << '\t' << seg.seq_ind << '\t' << seg.start << '\t' << seg.len << '\n';

out_file << "$\n";
out_file.close();
}

};

} // namespace valik
9 changes: 3 additions & 6 deletions include/valik/split/write_seg_sequences.hpp
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
#pragma once

#include <valik/shared.hpp>
#include <valik/split/database_metadata.hpp>
#include <valik/split/database_segments.hpp>
#include <valik/split/metadata.hpp>

namespace valik
{

/** !\brief Function that writes an output FASTA file for each segment sequence. */
void write_reference_segments(database_metadata const & reference_metadata,
database_segments & segments,
void write_reference_segments(metadata & reference_metadata,
std::filesystem::path const & ref_path);

/** !\brief Function that writes segment sequences into a single FASTA file. */
void write_query_segments(database_metadata const & query_metadata,
database_segments & segments,
void write_query_segments(metadata & query_metadata,
std::filesystem::path const & query_path);

} // namespace valik
14 changes: 4 additions & 10 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,15 @@ namespace valik::app
void init_split_parser(sharg::parser & parser, split_arguments & arguments)
{
init_shared_meta(parser);
parser.add_positional_option(arguments.db_file,
parser.add_positional_option(arguments.seq_file,
sharg::config{.description = "File containing database sequences.",
.validator = sharg::input_file_validator{}});
parser.add_option(arguments.db_out,
sharg::config{.short_id = '\0',
.long_id = "db-meta",
parser.add_option(arguments.meta_out,
sharg::config{.short_id = 'o',
.long_id = "out",
.description = "Please provide a valid path to the database metadata output.",
.required = true,
.validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create}});
parser.add_option(arguments.seg_out,
sharg::config{.short_id = '\0',
.long_id = "seg-meta",
.description = "Please provide a valid path to the segment metadata output.",
.required = true,
.validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create}});
parser.add_option(arguments.overlap,
sharg::config{.short_id = '\0',
.long_id = "overlap",
Expand Down
14 changes: 6 additions & 8 deletions src/split/write_seg_sequences.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ using types = seqan3::type_list<std::vector<seqan3::dna4>, std::string>;
using fields = seqan3::fields<seqan3::field::seq, seqan3::field::id>;
using sequence_record_type = seqan3::sequence_record<types, fields>;

void write_reference_segments(database_metadata const & reference_metadata,
database_segments & segments,
void write_reference_segments(metadata & reference_metadata,
std::filesystem::path const & ref_path)
{
std::filesystem::path seg_path_file = ref_path;
Expand All @@ -23,8 +22,8 @@ void write_reference_segments(database_metadata const & reference_metadata,
for (auto && [seq] : sequence_file_t{ref_path})
{
// get the underlying sequence
auto is_segment_sequence = [&](database_segments::segment & seg) {return reference_metadata.sequences.at(i).ind == seg.seq_ind;};
for (auto & seg : segments.members | std::views::filter(is_segment_sequence))
auto is_segment_sequence = [&](metadata::segment_stats & seg) {return reference_metadata.sequences.at(i).ind == seg.seq_ind;};
for (auto & seg : reference_metadata.segments | std::views::filter(is_segment_sequence))
{
std::filesystem::path seg_file = ref_path;
std::filesystem::path seg_stem = seg_file.stem();
Expand All @@ -49,8 +48,7 @@ void write_reference_segments(database_metadata const & reference_metadata,
file_paths_out.close();
}

void write_query_segments(database_metadata const & query_metadata,
database_segments & segments,
void write_query_segments(metadata & query_metadata,
std::filesystem::path const & query_path)
{
std::filesystem::path seg_out_path = query_path;
Expand All @@ -61,8 +59,8 @@ void write_query_segments(database_metadata const & query_metadata,
for (auto && [seq] : sequence_file_t{query_path})
{
// get the underlying sequence
auto is_segment_sequence = [&](database_segments::segment & seg) {return query_metadata.sequences.at(i).ind == seg.seq_ind;};
for (auto & seg : segments.members | std::views::filter(is_segment_sequence))
auto is_segment_sequence = [&](metadata::segment_stats & seg) {return query_metadata.sequences.at(i).ind == seg.seq_ind;};
for (auto & seg : query_metadata.segments | std::views::filter(is_segment_sequence))
{
std::string id{seg.unique_id()};
seqan3::dna4_vector seg_sequence(&seq[seg.start], &seq[seg.start+seg.len]);
Expand Down
Loading

0 comments on commit 2b12f33

Please sign in to comment.