Skip to content

Commit

Permalink
Replace --overlap by --pattern size
Browse files Browse the repository at this point in the history
  • Loading branch information
eaasna committed Feb 5, 2024
1 parent a8bab6d commit aa4509e
Show file tree
Hide file tree
Showing 25 changed files with 1,370 additions and 291 deletions.
5 changes: 3 additions & 2 deletions include/utilities/threshold/filtering_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,16 @@ struct filtering_request
/**
* @brief An approximation of the probability of a query segment having a spurious match in a reference bin.
*/
double fpr(param_set const & params, uint64_t const & query_seg_len, uint64_t const & overlap) const
double fpr(param_set const & params, uint64_t const & query_seg_len) const
{
double pattern_p = pattern_spurious_match_prob(params);
/*
uint64_t independent_patterns_per_segment = query_seg_len / (double) l;
double independent_patterns_p = std::min(1.0, pattern_p * independent_patterns_per_segment);
std::cout << "independent_patterns_p\t" << independent_patterns_p << '\n';
*/
uint64_t total_patterns_per_segment = std::round((query_seg_len - l + 1) / (double) (l - overlap));
constexpr uint8_t query_every = 2; // query every 2nd pattern by default
uint64_t total_patterns_per_segment = std::round((query_seg_len - l + 1) / (double) query_every);
double total_patterns_p = std::min(1.0, pattern_p * total_patterns_per_segment);
std::cout << "total_patterns_p\t" << total_patterns_p << '\n';
return total_patterns_p;
Expand Down
2 changes: 1 addition & 1 deletion include/utilities/threshold/kmer_attributes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ struct kmer_attributes
template <typename str_t>
void serialize(str_t & outstr)
{
outstr << "k=" << k << '\n';
outstr << "k=" << std::to_string(k) << '\n';
size_t t = 1;
for (auto threshold_table : fn_conf_counts)
{
Expand Down
4 changes: 2 additions & 2 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,17 @@ struct split_arguments
std::filesystem::path seq_file{};
std::filesystem::path meta_out{"metadata.txt"};

size_t overlap{150};
size_t pattern_size{150};
uint32_t seg_count{64};
uint32_t seg_count_in{64};
bool split_index{false};
float error_rate{0.05};
uint8_t errors{0};
size_t pattern_size{100};
uint8_t kmer_size{20};
size_t threshold{};
std::filesystem::path ref_meta_path{};
bool write_out{false};
bool only_split{false};
bool verbose{false};
};

Expand Down
8 changes: 4 additions & 4 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,10 @@ struct metadata
void scan_database_sequences(split_arguments const & arguments)
{
default_seg_len = total_len / arguments.seg_count + 1;
if (default_seg_len <= arguments.overlap)
if (default_seg_len <= arguments.pattern_size)
{
throw std::runtime_error("Segments of length " + std::to_string(default_seg_len) + "bp can not overlap by " +
std::to_string(arguments.overlap) + "bp.\nDecrease the overlap or the number of segments.");
std::to_string(arguments.pattern_size) + "bp.\nDecrease the overlap or the number of segments.");
}

size_t len_lower_bound = default_seg_len / 10;
Expand All @@ -298,9 +298,9 @@ struct metadata
}

if (arguments.split_index)
make_exactly_n_segments(arguments.seg_count, arguments.overlap, first_long_seq);
make_exactly_n_segments(arguments.seg_count, arguments.pattern_size, first_long_seq);
else
make_equal_length_segments(arguments.seg_count, arguments.overlap, first_long_seq);
make_equal_length_segments(arguments.seg_count, arguments.pattern_size, first_long_seq);

std::stable_sort(sequences.begin(), sequences.end(), fasta_order());
std::stable_sort(segments.begin(), segments.end(), fasta_order());
Expand Down
13 changes: 9 additions & 4 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
.long_id = "out",
.description = "Please provide a valid path to the database metadata output.",
.validator = sharg::output_file_validator{sharg::output_file_open_options::open_or_create}});
parser.add_option(arguments.overlap,
parser.add_option(arguments.pattern_size,
sharg::config{.short_id = '\0',
.long_id = "overlap",
.description = "Choose how much consecutive segments overlap.",
.long_id = "pattern",
.description = "Choose how much consecutive segments overlap. This is the minimum length of a local alignment.",
.validator = positive_integer_validator{true}});
parser.add_option(arguments.error_rate,
sharg::config{.short_id = 'e',
Expand All @@ -46,6 +46,11 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
.long_id = "write-ref",
.description = "Write an output FASTA file for each reference segment or write all query segments into a single output FASTA file..",
.advanced = true});
parser.add_flag(arguments.only_split,
sharg::config{.short_id = '\0',
.long_id = "without-parameter-tuning",
.description = "Preprocess database without setting default parameters.",
.advanced = true});
parser.add_flag(arguments.verbose,
sharg::config{.short_id = 'v',
.long_id = "verbose",
Expand All @@ -68,7 +73,7 @@ void run_split(sharg::parser & parser)
arguments.meta_out.replace_extension("meta");
}

if (!arguments.split_index && !parser.is_option_set("ref-meta-path"))
if (!arguments.split_index && !arguments.only_split && !parser.is_option_set("ref-meta"))
throw sharg::parser_error{"Need to provide path to reference metadata to process a query database."};

// ==========================================
Expand Down
34 changes: 21 additions & 13 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,31 @@ param_set get_best_params(param_space const & space,
std::vector<double> kmer_fn;
std::vector<double> kmer_fp;
uint8_t k = att.k;
for (uint8_t t = 1; t <= space.max_thresh; t++)
if (k > request.l)
break;
else
{
param_set params(k, t, param_space());
auto score = param_score(request, params, att);
kmer_scores.push_back(score);
kmer_fn.push_back(att.fnr_for_param_set(request, params));
kmer_fp.push_back(request.pattern_spurious_match_prob(params));
if (score < best_score)
for (uint8_t t = 1; t <= space.max_thresh; t++)
{
best_score = score;
best_params = params;
param_set params(k, t, param_space());
auto score = param_score(request, params, att);
std::cout << std::to_string(k) << '\t' << std::to_string(t) << '\t' << score << '\n';
kmer_scores.push_back(score);
kmer_fn.push_back(att.fnr_for_param_set(request, params));
kmer_fp.push_back(request.pattern_spurious_match_prob(params));
if (score <= best_score)
{
best_score = score;
best_params = params;
}
}
scores.push_back(kmer_scores);
fn_rates.push_back(kmer_fn);
fp_rates.push_back(kmer_fp);
}
scores.push_back(kmer_scores);
fn_rates.push_back(kmer_fn);
fp_rates.push_back(kmer_fp);
}

/*
std::cout << '\t';
for (size_t i{1}; i <= space.max_thresh; i++)
std::cout << i << '\t';
Expand All @@ -75,6 +82,7 @@ param_set get_best_params(param_space const & space,
std::cout << cell << '\t';
std::cout << '\n';
}
*/

return best_params;
}
Expand Down Expand Up @@ -151,7 +159,7 @@ uint8_t find_threshold(param_space const & space,
std::cout << "threshold " << std::to_string(best_params.t) << '\n';
std::cout << "FNR " << att.fnr_for_param_set(request, best_params) << '\n';
std::cout << "segment len " << std::to_string(std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n';
std::cout << "FPR " << request.fpr(best_params, std::round(query_meta.total_len / (double) query_meta.seg_count), arguments.overlap) << '\n';
std::cout << "FPR " << request.fpr(best_params, std::round(query_meta.total_len / (double) query_meta.seg_count)) << '\n';
}

return best_params.t;
Expand Down
73 changes: 41 additions & 32 deletions src/valik_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,46 +18,55 @@ void valik_split(split_arguments & arguments)
metadata meta(arguments);
meta.to_file(arguments.meta_out);

// ==========================================
// Parameter deduction
// ==========================================
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

if (arguments.split_index)
if (!arguments.only_split)
{
size_t errors = meta.segment_overlap() * arguments.error_rate;
filtering_request request(errors, meta.segment_overlap(), meta.total_len, meta.seg_count);
auto best_params = get_best_params(space, request, attr_vec);
arguments.kmer_size = best_params.k;
// ==========================================
// Parameter deduction
// ==========================================
auto space = param_space();
std::vector<kmer_attributes> attr_vec;
if (!read_fn_confs(attr_vec))
precalc_fn_confs(attr_vec);

if (arguments.verbose)
std::cout << arguments.pattern_size << '\n';

if (arguments.split_index)
{
std::cout << "Build index for:\n";
std::cout << "db length " << meta.total_len << "bp\n";
std::cout << "min local match length " << meta.segment_overlap() << "bp\n";
std::cout << "max error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';
filtering_request request(arguments.errors, arguments.pattern_size, meta.total_len, meta.seg_count);
auto best_params = get_best_params(space, request, attr_vec);
arguments.kmer_size = best_params.k;

if (arguments.verbose)
{
std::cout << "Build index for:\n";
std::cout << "db length " << meta.total_len << "bp\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "max error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';

find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate);
find_thresholds_for_kmer_size(space, meta, attr_vec[best_params.k - std::get<0>(space.kmer_range)], arguments.error_rate);
}
}
}
else
{
if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors) <= 1)
else
{
metadata ref_meta = metadata(arguments.ref_meta_path);
std::cout.precision(3);
std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';
if (kmer_lemma_threshold(arguments.pattern_size, (size_t) arguments.kmer_size, (size_t) arguments.errors) <= 1)
{
metadata ref_meta = metadata(arguments.ref_meta_path);
auto best_threshold = find_threshold(space, ref_meta, meta, arguments, attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]);
if (arguments.verbose)
{
std::cout.precision(3);
std::cout << "Can not search database with an exact k-mer lemma threshold with parameters\n";
std::cout << "min local match length " << arguments.pattern_size << "bp\n";
std::cout << "error rate " << arguments.error_rate << "\n";
std::cout << "kmer size " << std::to_string(arguments.kmer_size) << '\n';

std::cout << "Applying heuristic threshold\n";
std::cout << "use threshold " << find_threshold(space, ref_meta, meta, arguments, attr_vec[arguments.kmer_size - std::get<0>(space.kmer_range)]);
std::cout << "Applying heuristic threshold\n";
std::cout << "use threshold " << best_threshold;
}
}
}

}

if (arguments.write_out)
Expand Down
5 changes: 2 additions & 3 deletions test/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ target_use_datasources (valik_test FILES bin_5.fasta)
target_use_datasources (valik_test FILES bin_6.fasta)
target_use_datasources (valik_test FILES bin_7.fasta)

target_use_datasources (valik_test FILES query.fastq)
target_use_datasources (valik_test FILES query.fasta)
target_use_datasources (valik_test FILES ref.fasta)
target_use_datasources (valik_test FILES various_chromosome_lengths.fasta)
target_use_datasources (valik_test FILES 0overlap4bins.txt)
Expand All @@ -37,7 +37,6 @@ target_use_datasources (valik_test FILES 0overlap16bins.txt)
target_use_datasources (valik_test FILES 20overlap16bins.txt)

target_use_datasources (valik_test FILES single_reference.fasta)
target_use_datasources (valik_test FILES single_query.fq)

target_use_datasources (valik_test FILES 150overlap4bins.txt)
target_use_datasources (valik_test FILES 150overlap4bins13window.ibf)
Expand Down Expand Up @@ -77,7 +76,7 @@ target_use_datasources (dream_test FILES 16bins15window1error.gff)
target_use_datasources (dream_test FILES 4bins13window1error.gff)
target_use_datasources (dream_test FILES 4bins15window1error.gff)
target_use_datasources (dream_test FILES dummy_reads.fastq)
target_use_datasources (dream_test FILES query.fastq)
target_use_datasources (dream_test FILES query.fasta)
target_use_datasources (dream_test FILES ref.fasta)
target_use_datasources (dream_test FILES ref_meta.txt)
target_use_datasources (dream_test FILES seg_meta150overlap16bins.txt)
Expand Down
3 changes: 2 additions & 1 deletion test/cli/dream_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ TEST_P(dream_split_search, split_shared_mem)
data("query.fastq"),
"--out", query_meta_path,
"--seg-count ", std::to_string(query_seg_count),
"--overlap 0");
"--ref-meta", ref_meta_path,
"--pattern 0");
EXPECT_EQ(split_query.exit_code, 0);

cli_test_result const build = execute_app("valik", "build",
Expand Down
Loading

0 comments on commit aa4509e

Please sign in to comment.