Skip to content

Commit

Permalink
Serialise metadata with cereal
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 13, 2024
1 parent 800d24f commit 7868705
Show file tree
Hide file tree
Showing 24 changed files with 103 additions and 188 deletions.
1 change: 1 addition & 0 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ struct split_arguments
uint8_t errors{0};
uint8_t kmer_size{20};
size_t threshold{};
bool metagenome{false};
std::filesystem::path ref_meta_path{};
bool write_out{false};
bool only_split{false};
Expand Down
101 changes: 51 additions & 50 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
#include <fstream>
#include <ranges>

#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp>

namespace valik
{

Expand Down Expand Up @@ -55,12 +58,25 @@ struct metadata
size_t ind;
uint64_t len;

constexpr sequence_stats() noexcept = default;
constexpr sequence_stats(sequence_stats const &) noexcept = default;
constexpr sequence_stats(sequence_stats &&) noexcept = default;
constexpr sequence_stats & operator=(sequence_stats const &) noexcept = default;
constexpr sequence_stats & operator=(sequence_stats &&) noexcept = default;
~sequence_stats() noexcept = default;

sequence_stats(std::string const fasta_id, size_t const fasta_ind, uint64_t const seq_length)
{
id = fasta_id;
ind = fasta_ind;
len = seq_length;
}

/**
 * @brief Serialisation hook: hands the members of this struct to an archive.
 *
 * The members are passed in the fixed order id, ind, len. That order defines
 * how the fields are laid out in the archive, so it must not be changed
 * without regenerating previously written metadata files.
 *
 * @tparam Archive Archive type supplied by the caller (the file includes the cereal binary archives).
 * @param archive Archive object that processes the three members.
 */
template <class Archive>
void serialize(Archive & archive)
{
archive(id, ind, len);
}
};

struct length_order
Expand All @@ -87,6 +103,13 @@ struct metadata
uint64_t start;
uint64_t len;

constexpr segment_stats() noexcept = default;
constexpr segment_stats(segment_stats const &) noexcept = default;
constexpr segment_stats(segment_stats &&) noexcept = default;
constexpr segment_stats & operator=(segment_stats const &) noexcept = default;
constexpr segment_stats & operator=(segment_stats &&) noexcept = default;
~segment_stats() noexcept = default;

segment_stats(size_t const i, size_t const ind, uint64_t const s, uint64_t const l)
{
id = i;
Expand All @@ -106,6 +129,12 @@ struct metadata
{
return std::to_string(seq_ind) + "_" + std::to_string(start) + "_" + std::to_string(len);
}

/**
 * @brief Serialisation hook: hands the members of this struct to an archive.
 *
 * The members are passed in the fixed order id, seq_ind, start, len. That
 * order defines how the fields are laid out in the archive, so it must not be
 * changed without regenerating previously written metadata files.
 *
 * @tparam Archive Archive type supplied by the caller (the file includes the cereal binary archives).
 * @param archive Archive object that processes the four members.
 */
template <class Archive>
void serialize(Archive & archive)
{
archive(id, seq_ind, start, len);
}
};

struct fasta_order
Expand Down Expand Up @@ -326,45 +355,9 @@ struct metadata
*/
metadata(std::filesystem::path const & filepath)
{
std::ifstream in_file(filepath);
if (in_file.is_open())
{
std::string seq_meta;
std::getline(in_file, seq_meta, '$');
std::stringstream seq_str(seq_meta);

std::string seq_id, fasta_ind, length;
total_len = 0;
while(std::getline(seq_str, seq_id, '\t'))
{
std::getline(seq_str, fasta_ind, '\t');
std::getline(seq_str, length, '\n');
total_len += stoi(length);
sequences.push_back(sequence_stats(seq_id, stoi(fasta_ind), stoi(length)));
}

std::string seg_meta;
std::getline(in_file, seg_meta); // newline
std::getline(in_file, seg_meta, '$');
std::stringstream seg_str(seg_meta);

size_t id, seq_ind, start;
while (seg_str >> id)
{
seg_str >> seq_ind;
seg_str >> start;
seg_str >> length;

add_segment(id, seq_ind, start, stoi(length));
}
}

in_file.close();
seq_count = sequences.size();
seg_count = segments.size();
load(filepath);
}


/**
* @brief Function that returns the numerical index of a sequence based on its fasta ID.
*
Expand Down Expand Up @@ -407,28 +400,34 @@ struct metadata
}

/**
* @brief Function that serializes the metadata struct.
* @brief Serialize the metadata struct.
*
* @param filepath Output file path.
*/
void to_file(std::filesystem::path const & filepath)
void save(std::filesystem::path const & filepath) const
{
std::ofstream out_file;
out_file.open(filepath);

stream_out(out_file);

out_file.close();
std::ofstream os(filepath, std::ios::binary);
cereal::BinaryOutputArchive archive(os);
archive(total_len, sequences, segments);
}

/**
* @brief Function that streams out the metadata table.
* @brief Deserialise the metadata struct.
*
* @param out_str Output stream.
* @param filepath Input file path.
*/
template <typename str_t>
void stream_out(str_t & out_str)
void load(std::filesystem::path const & filepath)
{
    std::ifstream is(filepath, std::ios::binary);

    // Fail early with a message that names the file. Without this check an
    // unopened stream is only detected when the archive tries to read from it,
    // which yields a generic read-failure message. The exception type is kept
    // as cereal::Exception so existing handlers continue to work.
    if (!is.is_open())
        throw cereal::Exception("Could not open metadata file for reading: " + filepath.string());

    // Members are read in the same order in which save() writes them.
    cereal::BinaryInputArchive archive(is);
    archive(total_len, sequences, segments);

    // The counts are not part of the archive; derive them from the loaded containers.
    seq_count = sequences.size();
    seg_count = segments.size();
}

std::string to_string()
{
std::stringstream out_str;
for (sequence_stats const & seq : sequences)
out_str << seq.id << '\t' << seq.ind << '\t' << seq.len << '\n';

Expand All @@ -438,6 +437,8 @@ struct metadata
out_str << seg.id << '\t' << seg.seq_ind << '\t' << seg.start << '\t' << seg.len << '\n';

out_str << "$\n";

return out_str.str();
}

double segment_length_stdev()
Expand Down
9 changes: 8 additions & 1 deletion src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
{
init_shared_meta(parser);
parser.add_positional_option(arguments.seq_file,
sharg::config{.description = "File containing database sequences.",
sharg::config{.description = "File containing database sequences. If splitting --metagenome provide a list of cluster paths.",
.validator = sharg::input_file_validator{}});
parser.add_option(arguments.meta_out,
sharg::config{.short_id = 'o',
Expand Down Expand Up @@ -41,6 +41,10 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
sharg::config{.short_id = '\0',
.long_id = "split-index",
.description = "Adjust the suggested segment count to create a multiple of 64 segments instead. This is suitable for building an IBF."});
parser.add_flag(arguments.metagenome,
sharg::config{.short_id = '\0',
.long_id = "metagenome",
.description = "Split a clustered metagenome database. Reference input is a list of cluster paths"});
parser.add_option(arguments.ref_meta_path,
sharg::config{.short_id = '\0',
.long_id = "ref-meta",
Expand Down Expand Up @@ -78,6 +82,9 @@ void run_split(sharg::parser & parser)
arguments.meta_out.replace_extension("meta");
}

if (!arguments.split_index && arguments.metagenome)
arguments.split_index = true;

if (!arguments.split_index && !arguments.only_split && !parser.is_option_set("ref-meta"))
throw sharg::parser_error{"Need to provide path to reference metadata to process a query database."};

Expand Down
7 changes: 4 additions & 3 deletions src/valik_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ void valik_split(split_arguments & arguments)
}

metadata meta(arguments);
meta.to_file(arguments.meta_out);
meta.save(arguments.meta_out);

if (!arguments.only_split)
{
// ==========================================
Expand Down Expand Up @@ -91,9 +92,9 @@ void valik_split(split_arguments & arguments)
if (arguments.write_out)
{
if (arguments.split_index)
write_reference_segments(meta, arguments.meta_out);
write_reference_segments(meta, arguments.seq_file);
else
write_query_segments(meta, arguments.meta_out);
write_query_segments(meta, arguments.seq_file);
}
}

Expand Down
32 changes: 0 additions & 32 deletions test/api/valik/split/write_seg_sequences_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,6 @@ static void const test_reference_out(size_t overlap, size_t bins)
}
}

TEST(write_ref_sequences, o0_b4)
{
size_t overlap = 0;
size_t bins = 4;

test_reference_out(overlap, bins);
}

TEST(write_ref_sequences, o20_b4)
{
size_t overlap = 20;
Expand All @@ -74,14 +66,6 @@ TEST(write_ref_sequences, o20_b4)
test_reference_out(overlap, bins);
}

TEST(write_ref_sequences, o0_b16)
{
size_t overlap = 0;
size_t bins = 16;

test_reference_out(overlap, bins);
}

TEST(write_ref_sequences, o20_b16)
{
size_t overlap = 20;
Expand Down Expand Up @@ -123,14 +107,6 @@ static void const test_query_out(size_t overlap, size_t bins)
}
}

TEST(write_query_sequences, o0_b4)
{
size_t overlap = 0;
size_t bins = 4;

test_query_out(overlap, bins);
}

TEST(write_query_sequences, o20_b4)
{
size_t overlap = 20;
Expand All @@ -139,14 +115,6 @@ TEST(write_query_sequences, o20_b4)
test_query_out(overlap, bins);
}

TEST(write_query_sequences, o0_b16)
{
size_t overlap = 0;
size_t bins = 16;

test_query_out(overlap, bins);
}

TEST(write_query_sequences, o20_b16)
{
size_t overlap = 20;
Expand Down
12 changes: 7 additions & 5 deletions test/cli/valik_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,19 @@ TEST_P(valik_split_various, split_various_lengths)

cli_test_result const result = execute_app("valik", "split",
data("various_chromosome_lengths.fasta"),
"--out reference_metadata.txt",
"--out query_metadata.txt",
"--seg-count ", std::to_string(seg_count),
"--pattern ", std::to_string(overlap),
"--ref-meta ", segment_metadata_path(150, 4));

EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, std::string{});
EXPECT_EQ(result.err, std::string{"Sequence: chr5 is too short and will be skipped.\n"});
std::string const expected_segments = string_from_file(segment_metadata_path(overlap, seg_count), std::ios::binary);
std::string const actual_segments = string_from_file("reference_metadata.txt", std::ios::binary);
EXPECT_TRUE(expected_segments == actual_segments);
EXPECT_EQ(result.err, std::string{"Sequence: chr5 is too short and will be skipped.\n"});
auto expected_segments = valik::metadata(segment_metadata_path(overlap, seg_count));
std::string expected_segment_str = expected_segments.to_string();
auto actual_segments = valik::metadata("query_metadata.txt");
std::string actual_segment_str = actual_segments.to_string();
EXPECT_TRUE(expected_segment_str == actual_segment_str);
}


Expand Down
Binary file modified test/data/consolidate/16bins50overlap_reference_metadata.tsv
Binary file not shown.
Binary file modified test/data/consolidate/8bins50overlap_reference_metadata.tsv
Binary file not shown.
10 changes: 4 additions & 6 deletions test/data/consolidate/api_test_input.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,9 @@ min_len=50

for bin in 8 16
do
valik split $ref_file --out ${bin}bins${min_len}overlap_reference_metadata.tsv --seg-count $bin --pattern $min_len --without-parameter-tuning

tail -n $((bin + 1)) ${bin}bins${min_len}overlap_reference_metadata.tsv | head -n $bin > segments.tsv
while read -r bin_id id start len;
valik split $ref_file --out ${bin}bins${min_len}overlap_reference_metadata.tsv --seg-count $bin --pattern $min_len --without-parameter-tuning --write-out
grep ">" multi_seq_ref.segments.fasta | cut -c 2- | awk -F'_' '{print $1 "\t" $2 "\t" $3}' > segments.tsv
while read -r id start len;
do
end=$(echo $start + $len | bc)
stellar -e $error_rate -l $min_len -o multi_seq_ref_${id}_${start}_${len}.gff \
Expand All @@ -78,8 +77,7 @@ do

done < segments.tsv

rm segments.tsv

rm segments.tsv multi_seq_ref.segments.fasta

cat multi_seq_ref_*.gff > ${bin}bins${min_len}overlap_dream_all.gff
rm multi_seq_ref_*
Expand Down
Loading

0 comments on commit 7868705

Please sign in to comment.