Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor the argument parser #101

Merged
merged 15 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/utilities/consolidate/consolidate_matches.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#pragma once

#include <filesystem>
#include <map>
#include <ranges>

#include <valik/shared.hpp>
#include <valik/split/metadata.hpp>
Expand Down
89 changes: 81 additions & 8 deletions include/utilities/consolidate/stellar_match.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ struct stellar_match
uint64_t dend{};
std::string percid;
bool is_forward_match{true};
std::string qname{};
uint64_t qbegin{};
uint64_t qend{};
std::string alignment_attributes{};

// Stellar GFF attributes
// 1;seq2Range=1280,1378;cigar=97M1D2M;mutations=14A,45G,58T,92C
std::string attributes{};

stellar_match(std::vector<std::string> const match_vec, metadata const & meta)
stellar_match(std::vector<std::string> const & match_vec, metadata const & meta)
{
dname = match_vec[0];

Expand All @@ -33,15 +33,59 @@ struct stellar_match
if (match_vec[6] == "-")
is_forward_match = false;

attributes = match_vec[8];
// Stellar GFF attributes
// 1;seq2Range=1280,1378;cigar=97M1D2M;mutations=14A,45G,58T,92C
// OR
// 1;seq2Range=1280,1378;eValue=4.05784e-73;cigar=97M1D2M;mutations=14A,45G,58T,92C
std::vector<std::string> attributes_vec = get_line_vector<std::string>(match_vec[8], ';');

if (attributes_vec.size() == 4 || attributes_vec.size() == 5)
{
qname = attributes_vec[0];
qbegin = stoi(attributes_vec[1].substr(attributes_vec[1].find("=") + 1,
attributes_vec[1].find(",") - attributes_vec[1].find("=") - 1));
qend = stoi(attributes_vec[1].substr(attributes_vec[1].find(",") + 1));

for (auto it = attributes_vec.begin() + 2; it < attributes_vec.end() - 1; it++)
{
alignment_attributes += *it;
alignment_attributes += ";";
}

alignment_attributes += attributes_vec[attributes_vec.size() - 1];
}
else
{
std::string attributes{};
for (auto & field : attributes_vec)
{
attributes += field;
attributes += ";";
}
if (attributes.size() > 0)
attributes.pop_back();

throw std::runtime_error("Malformed GFF record:\n" + attributes);
}
}

struct length_order
{
inline bool operator() (stellar_match const & left, stellar_match const & right)
{
return ((left.dend - left.dbegin) < (right.dend - right.dbegin));
}
};

bool operator == (stellar_match const & other) const
{
if (dname == other.dname &&
dbegin == other.dbegin &&
dend == other.dend &&
is_forward_match == other.is_forward_match)
is_forward_match == other.is_forward_match &&
qname == other.qname &&
qbegin == other.qbegin &&
percid_is_equal_to(other.percid))
return true;
else
return false;
Expand All @@ -64,6 +108,26 @@ struct stellar_match
}
}

std::string get_cigar() const
{
std::vector<std::string> attributes_vec = get_line_vector<std::string>(alignment_attributes, ';');
return attributes_vec[attributes_vec.size() - 2];
}

std::string get_mutations() const
{
return alignment_attributes.substr(alignment_attributes.find("mutations="));
}

bool percid_is_equal_to(std::string const & other) const
{
float eps{0.001};
float this_percid = std::stof(percid);
float other_percid = std::stof(other);

return abs(this_percid - other_percid) < eps;
}

std::string to_string() const
{
std::string match_str = dname;
Expand All @@ -82,7 +146,16 @@ struct stellar_match
match_str += "-";

match_str += "\t.\t";
match_str += attributes;

// 1;seq2Range=1280,1378;cigar=97M1D2M;mutations=14A,45G,58T,92C
match_str += qname;
match_str += ";";
match_str += "seq2Range=";
match_str += std::to_string(qbegin);
match_str += ",";
match_str += std::to_string(qend);
match_str += ";";
match_str += alignment_attributes;
match_str += "\n";

return match_str;
Expand Down
2 changes: 1 addition & 1 deletion include/utilities/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace valik
{

template <typename field_t>
std::vector<field_t> get_line_vector(std::string const line, char const delim)
std::vector<field_t> get_line_vector(std::string const & line, char const delim)
{
std::vector<field_t> line_vec;

Expand Down
19 changes: 0 additions & 19 deletions include/valik/argument_parsing/validators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,6 @@ struct power_of_two_validator
}
};

struct error_rate_validator
{
using option_value_type = float;

void operator()(option_value_type const & val) const
{
if ((val < 0.0) || (val > 0.2))
{
throw sharg::validation_error{"The provided error rate is not in range [0, 0.2]."};
}
}

std::string get_help_page_message() const
{
return "Error rate must be in range [0, 0.2].";
}

};

class positive_integer_validator
{
public:
Expand Down
9 changes: 1 addition & 8 deletions include/valik/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <sharg/exceptions.hpp>
#include <seqan3/search/dream_index/interleaved_bloom_filter.hpp>

#include <../lib/raptor/include/raptor/hierarchical_interleaved_bloom_filter.hpp>
#include <valik/shared.hpp>

namespace valik
Expand All @@ -21,14 +20,8 @@ namespace index_structure
using ibf = seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed>;
using ibf_compressed = seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed>;

// TODO: add support for HIBF
using hibf = raptor::hierarchical_interleaved_bloom_filter<seqan3::data_layout::uncompressed>;
using hibf_compressed = raptor::hierarchical_interleaved_bloom_filter<seqan3::data_layout::compressed>;

template <typename return_t, typename input_t>
concept compressible_from =
(std::same_as<return_t, ibf_compressed> && std::same_as<input_t, ibf>) ||
(std::same_as<return_t, hibf_compressed> && std::same_as<input_t, hibf>);
concept compressible_from = (std::same_as<return_t, ibf_compressed> && std::same_as<input_t, ibf>);

} // namespace index_structure

Expand Down
6 changes: 5 additions & 1 deletion include/valik/search/prefilter_queries_parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,19 @@ inline void prefilter_queries_parallel(seqan3::interleaved_bloom_filter<ibf_data
size_t const num_records = records.size();
size_t const records_per_thread = num_records / arguments.threads;

sync_out verbose_out(arguments.disabledQueriesFile);
for (size_t i = 0; i < arguments.threads; ++i)
{
size_t const start = records_per_thread * i;
size_t const end = i == (unsigned) (arguments.threads - 1) ? num_records : records_per_thread * (i + 1);

std::span<query_t const> records_slice{&records[start], &records[end]};

auto result_cb = [&queue](query_t const& record, std::unordered_set<size_t> const& bin_hits)
auto result_cb = [&queue,&arguments,&verbose_out,&ibf](query_t const& record, std::unordered_set<size_t> const& bin_hits)
{
if (arguments.verbose && (bin_hits.size() > std::max((size_t) 4, (size_t) std::round(ibf.bin_count() / 2.0))))
verbose_out.write_record(record, bin_hits.size());

for (size_t const bin : bin_hits)
{
queue.insert(bin, record);
Expand Down
12 changes: 7 additions & 5 deletions include/valik/search/search_distributed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ bool search_distributed(search_arguments const & arguments, search_time_statisti
std::mutex mutex;
execution_metadata exec_meta(arguments.threads);

// negative (reverse complemented) database strand
//!TODO: add switch for reverse strand
bool const reverse = true /*threadOptions.reverse && threadOptions.alphabet != "protein" && threadOptions.alphabet != "char" */;

bool error_in_search = false; // indicates if an error happen inside this lambda
auto consumerThreads = std::vector<std::jthread>{};
for (size_t threadNbr = 0; threadNbr < arguments.threads; ++threadNbr)
Expand All @@ -68,7 +64,7 @@ bool search_distributed(search_arguments const & arguments, search_time_statisti
write_cart_queries(records, cart_queries_path);

std::vector<std::string> process_args{};
process_args.insert(process_args.end(), {var_pack.stellar_exec, "--version-check", "0", "-a", "dna"});
process_args.insert(process_args.end(), {var_pack.stellar_exec, "--version-check", "0", "--verbose", "-a", "dna"});

if (ref_meta)
{
Expand Down Expand Up @@ -101,6 +97,12 @@ bool search_distributed(search_arguments const & arguments, search_time_statisti
process_args.insert(process_args.end(), {"-e", std::to_string(numEpsilon),
"-l", std::to_string(arguments.pattern_size),
"-o", std::string(cart_queries_path) + ".gff"});

process_args.insert(process_args.end(), {"--repeatPeriod", std::to_string(arguments.maxRepeatPeriod)});
process_args.insert(process_args.end(), {"--repeatLength", std::to_string(arguments.minRepeatLength)});
process_args.insert(process_args.end(), {"--verification", arguments.strVerificationMethod});
process_args.insert(process_args.end(), {"--xDrop", std::to_string(arguments.xDrop)});
process_args.insert(process_args.end(), {"--abundanceCut", std::to_string(arguments.qgramAbundanceCut)});

auto start = std::chrono::high_resolution_clock::now();
external_process process(process_args);
Expand Down
10 changes: 9 additions & 1 deletion include/valik/search/search_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ bool search_local(search_arguments const & arguments, search_time_statistics & t
stellar::StellarOptions threadOptions{};
stellar::stellar_app_runtime stellarThreadTime{};
threadOptions.alphabet = "dna"; // Possible values: dna, rna, protein, char
threadOptions.verbose = true;
threadOptions.queryFile = cart_queries_path.string();
threadOptions.prefilteredSearch = true;
threadOptions.referenceLength = refLen;
Expand All @@ -142,8 +143,15 @@ bool search_local(search_arguments const & arguments, search_time_statistics & t
threadOptions.numEpsilon = std::max(arguments.error_rate, (float) 0.00001);
threadOptions.epsilon = stellar::utils::fraction::from_double(threadOptions.numEpsilon).limit_denominator();
threadOptions.minLength = arguments.pattern_size;
threadOptions.disableThresh = arguments.disableThresh;
threadOptions.outputFile = cart_queries_path.string() + ".gff";

{
threadOptions.maxRepeatPeriod = arguments.maxRepeatPeriod;
threadOptions.minRepeatLength = arguments.minRepeatLength;
threadOptions.strVerificationMethod = arguments.strVerificationMethod;
threadOptions.xDrop = arguments.xDrop;
threadOptions.qgramAbundanceCut = arguments.qgramAbundanceCut;
}

using TDatabaseSegment = stellar::StellarDatabaseSegment<TAlphabet>;
using TQuerySegment = seqan2::Segment<seqan2::String<TAlphabet> const, seqan2::InfixSegment>;
Expand Down
18 changes: 15 additions & 3 deletions include/valik/search/sync_out.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@
#include <fstream>
#include <mutex>

#include <seqan3/core/debug_stream.hpp>
#include <seqan3/io/all.hpp>

namespace valik::app
{

class sync_out
{

public:
sync_out() = default;
sync_out(sync_out const &) = default;
Expand All @@ -20,14 +24,22 @@ class sync_out
sync_out(std::filesystem::path const & path) : file{path} {}

template <typename t>
void write(t && data)
void write_record(t && query_record, size_t const & bin_count)
{
std::string fasta_string = ">";
fasta_string += query_record.sequence_id;
fasta_string += '\n';
for (auto & n : query_record.sequence)
fasta_string += seqan3::to_char(n);
fasta_string += '\n';

std::lock_guard<std::mutex> lock(write_mutex);
file << std::forward<t>(data);
seqan3::debug_stream << "[Warning] Insufficient prefiltering. " << bin_count << " bins match query:\n" << fasta_string << "\n";
}
// outfile gets unlocked as soon as the current threads exits the write function
// outfile gets unlocked as soon as the current thread exits the write function

private:
//seqan3::sequence_file_output<fields, output_format_types> fout;
std::ofstream file;
std::mutex write_mutex;
};
Expand Down
10 changes: 6 additions & 4 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ struct split_arguments
std::filesystem::path meta_out{"metadata.txt"};

size_t overlap{150};
size_t seg_count{64};
size_t seg_count_in{64};
uint32_t seg_count{64};
uint32_t seg_count_in{64};
bool split_index{false};
bool write_ref{false};
bool write_query{false};
Expand All @@ -82,6 +82,7 @@ struct build_arguments
uint64_t bits{4096};
uint64_t hash{2};
bool compressed{false};
bool fast{false};

std::filesystem::path ref_meta_path{};
};
Expand Down Expand Up @@ -135,6 +136,8 @@ struct search_arguments final : public minimiser_threshold_arguments, public ste

bool compressed{false};
bool write_time{false};
bool fast{false};
bool verbose{false};

size_t cart_max_capacity{3}; //!TODO determine suitable values
size_t max_queued_carts{10}; //!TODO determine suitable values
Expand All @@ -145,7 +148,7 @@ struct search_arguments final : public minimiser_threshold_arguments, public ste
{
.window_size{window_size},
.shape{shape},
.pattern_size{pattern_size},
.query_length{pattern_size},
.errors{errors},
.percentage{threshold},
.p_max{p_max},
Expand All @@ -160,7 +163,6 @@ struct search_arguments final : public minimiser_threshold_arguments, public ste
std::filesystem::path ref_meta_path{};
std::filesystem::path query_meta_path{};
bool distribute{false};

};

} // namespace valik
7 changes: 4 additions & 3 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,9 @@ struct metadata
size_t seq_count;
size_t seg_count;

private:
std::vector<sequence_stats> sequences;
std::vector<sequence_stats> sequences;

private:
size_t default_seg_len;
std::vector<segment_stats> segments;

Expand Down Expand Up @@ -370,7 +371,7 @@ struct metadata
{
auto it = std::find_if(sequences.begin(), sequences.end(), [&](const sequence_stats& seq) { return seq.id == string_id;});
if (it == sequences.end())
throw seqan3::validation_error{"Sequence metadata does not contain sequence from Stellar output."};
throw seqan3::validation_error{"Sequence metadata does not contain sequence " + string_id + " from Stellar output."};
else
return (*it).ind;
}
Expand Down
2 changes: 1 addition & 1 deletion lib/raptor
Submodule raptor updated 448 files
2 changes: 1 addition & 1 deletion lib/stellar3
Submodule stellar3 updated 0 files
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ target_link_libraries ("${PROJECT_NAME}_build_lib" PUBLIC "${PROJECT_NAME}_inter

# Valik search
add_library ("raptor_threshold" STATIC
../lib/raptor/src/threshold/threshold.cpp
../lib/raptor/src/threshold/one_indirect_error_model.cpp
../lib/raptor/src/threshold/multiple_error_model.cpp
../lib/raptor/src/threshold/pascal_row.cpp
Expand Down
Loading