Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Increase max bound for search arguments (#126)" #129

Merged
merged 1 commit into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/utilities/threshold/basics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,17 @@ inline size_t kmer_lemma_threshold(size_t const l, uint8_t const k, uint8_t cons
struct param_space
{
constexpr static uint8_t max_errors{15};
uint16_t max_thresh{20};
uint8_t max_thresh{20};
constexpr static size_t max_len{150};
constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 35};
constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 21};

param_space() noexcept = default;
param_space(param_space const &) noexcept = default;
param_space & operator=(param_space const &) noexcept = default;
param_space & operator=(param_space &&) noexcept = default;
~param_space() noexcept = default;

param_space(uint16_t thresh) : max_thresh(thresh) {}
param_space(uint8_t thresh) : max_thresh(thresh) {}

uint8_t min_k() const
{
Expand Down
12 changes: 2 additions & 10 deletions include/utilities/threshold/kmer_loss.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp>

#include <seqan3/core/debug_stream.hpp>

namespace valik
{

Expand Down Expand Up @@ -83,14 +81,8 @@ struct kmer_loss
if (kmer_lemma_threshold(pattern.l, params.k, pattern.e) >= params.t)
return 0.0;
if (params.t > fn_conf_counts.size())
throw std::runtime_error("Calculated configuration count table for threshold=[1, " + std::to_string(fn_conf_counts.size()) + "]. "
"Can't find FNR for threshold=" + std::to_string(params.t));
if (pattern.e >= fn_conf_counts[0].size())
throw std::runtime_error("Calculated configuration count table for errors=[0, " + std::to_string(fn_conf_counts[0].size()) + "]. "
"Can't find FNR for errors=" + std::to_string(pattern.e));
if (pattern.l >= fn_conf_counts[0][0].size())
throw std::runtime_error("Calculated configuration count table for min_len=[1, " + std::to_string(fn_conf_counts[0][0].size()) + "]. "
"Can't find FNR for min_len=" + std::to_string(pattern.l));
throw std::runtime_error("Calculated configuration count table for t=[1, " + std::to_string(fn_conf_counts.size()) + "]. "
"Can't find FNR for t=" + std::to_string(params.t));
return fn_conf_counts[params.t - 1][pattern.e][pattern.l] / (double) pattern.total_conf_count();
}

Expand Down
11 changes: 2 additions & 9 deletions include/utilities/threshold/param_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,31 +18,24 @@ namespace valik
struct param_set
{
uint8_t k;
uint16_t t;
uint8_t t;

param_set() noexcept = default;
param_set(param_set const &) noexcept = default;
param_set & operator=(param_set const &) noexcept = default;
param_set & operator=(param_set &&) noexcept = default;
~param_set() noexcept = default;

param_set(uint8_t const kmer_size, uint16_t const thresh, param_space const & space) : k(kmer_size), t(thresh)
param_set(uint8_t const kmer_size, uint8_t const thresh, param_space const & space) : k(kmer_size), t(thresh)
{
if ((kmer_size < std::get<0>(space.kmer_range)) | (kmer_size > std::get<1>(space.kmer_range)))
{
throw std::runtime_error{"k=" + std::to_string(kmer_size) + " out of range [" +
std::to_string(std::get<0>(space.kmer_range)) + ", " +
std::to_string(std::get<1>(space.kmer_range)) + "]"};
}

if (thresh > space.max_thresh)
{
throw std::runtime_error{"t=" + std::to_string(thresh) + " out of range [0, " + std::to_string(space.max_thresh) + "]"};
}
}

param_set(uint8_t const kmer_size, uint16_t const thresh) : k(kmer_size), t(thresh) {}

template <class Archive>
void serialize(Archive & archive)
{
Expand Down
2 changes: 1 addition & 1 deletion include/valik/search/search_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st
{
search_pattern pattern(arguments.errors, arguments.pattern_size);
param_space space;
param_set params(arguments.shape_size, arguments.threshold);
param_set params(arguments.shape_size, arguments.threshold, space);
filtering_request request(pattern, ref_meta, query_meta.value());
if ((request.fpr(params) > 0.2) && (arguments.search_type != search_kind::STELLAR))
std::cerr << "WARNING: Prefiltering will be inefficient for a high error rate.\n";
Expand Down
2 changes: 1 addition & 1 deletion lib/seqan
5 changes: 2 additions & 3 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ namespace valik::app

void init_split_parser(sharg::parser & parser, split_arguments & arguments)
{
param_space space{};
init_shared_meta(parser);
parser.add_positional_option(arguments.db_file,
sharg::config{.description = "File containing database sequences. If splitting --metagenome provide a text file with a list of cluster paths.",
Expand All @@ -22,7 +21,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
sharg::config{.short_id = '\0',
.long_id = "pattern",
.description = "Choose how much consecutive segments overlap. This is the minimum length of a local alignment.",
.validator = sharg::arithmetic_range_validator{10u, 1000u}});
.validator = positive_integer_validator{true}});
parser.add_option(arguments.error_rate,
sharg::config{.short_id = 'e',
.long_id = "error-rate",
Expand All @@ -37,7 +36,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
sharg::config{.short_id = 'k',
.long_id = "kmer",
.description = "Choose the kmer size.",
.validator = sharg::arithmetic_range_validator{space.min_k(), space.max_k()}});
.validator = positive_integer_validator{true}});
parser.add_option(arguments.seg_count_in,
sharg::config{.short_id = 'n',
.long_id = "seg-count",
Expand Down
64 changes: 25 additions & 39 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ param_set get_best_params(search_pattern const & pattern,
bool const verbose)
{
param_space space = fn_attr.space;

if (kmer_lemma_threshold(pattern.l, space.max_k(), pattern.e) > space.max_thresh)
return param_set(space.max_k(), kmer_lemma_threshold(pattern.l, space.max_k(), pattern.e));

param_set best_params(space.min_k(), 1, space);
auto best_score = score(fn_attr.get_kmer_loss(space.min_k()), pattern, best_params, ref_meta, PATTERNS_PER_SEGMENT);

Expand Down Expand Up @@ -98,51 +94,41 @@ search_kmer_profile find_thresholds_for_kmer_size(metadata const & ref_meta,
kmer_loss attr,
uint8_t const max_errors)
{
constexpr param_space space{};
param_space space{};
search_kmer_profile kmer_thresh{attr.k, ref_meta.pattern_size};
for (uint8_t errors{0}; errors <= max_errors; errors++)
for (uint8_t errors{0}; errors <= max_errors && errors <= space.max_errors; errors++)
{
search_pattern pattern(errors, ref_meta.pattern_size);
search_kind search_type{search_kind::LEMMA};
auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors), space);
auto try_params = best_params;

if (kmer_lemma_threshold(pattern.l, attr.k, errors) > space.max_thresh)
if ((best_params.t < THRESH_LOWER) ||
segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER)
{
auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors));
double fnr{0}; // == attr.fnr_for_param_set(pattern, best_params)
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});
}
else
{
auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors), space);
auto try_params = best_params;
if ((best_params.t < THRESH_LOWER) ||
segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER)
search_type = search_kind::STELLAR;
try_params.t++;
while ((try_params.t <= space.max_thresh) &&
(attr.fnr_for_param_set(pattern, try_params) < FNR_UPPER))
{
search_type = search_kind::STELLAR;
try_params.t++;
while ((try_params.t <= space.max_thresh) &&
(attr.fnr_for_param_set(pattern, try_params) < FNR_UPPER))
if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER)
{
if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER)
{
best_params = std::move(try_params);
search_type = search_kind::HEURISTIC;
}
try_params.t++;
best_params = std::move(try_params);
search_type = search_kind::HEURISTIC;
}
try_params.t++;
}

if (search_type == search_kind::STELLAR)
{
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type});
}
else
{
double fnr = attr.fnr_for_param_set(pattern, best_params);
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});
}
}

if (search_type == search_kind::STELLAR)
{
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type});
}
else
{
double fnr = attr.fnr_for_param_set(pattern, best_params);
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});
}
}

Expand Down
12 changes: 4 additions & 8 deletions test/api/utilities/threshold/kmer_loss_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,11 @@ TEST(kmer_loss, exhaustive_comparison)
}
}

void try_fnr(uint8_t e, size_t l, uint8_t k, uint16_t t, double expected_fnr)
void try_fnr(uint8_t e, size_t l, uint8_t k, uint8_t t, double expected_fnr)
{
valik::param_space space{};
valik::search_pattern pattern(e, l);
valik::param_set param(k, t);
valik::param_set param(k, t, space);

valik::fn_confs fn_attr(space);

Expand All @@ -187,10 +187,6 @@ TEST(false_negative, try_kmer_lemma_thershold)
try_fnr(0u, (size_t) 50, 16u, 35u, 0.0);
try_fnr(1u, (size_t) 50, 16u, 19u, 0.0);
try_fnr(2u, (size_t) 50, 16u, 3u, 0.0);

try_fnr(0u, (size_t) 1000, 35u, 966u, 0.0);
try_fnr(1u, (size_t) 1000, 35u, 931u, 0.0);
try_fnr(2u, (size_t) 1000, 35u, 896u, 0.0);
}

TEST(false_negative, try_threshold_above_kmer_lemma)
Expand All @@ -204,8 +200,8 @@ TEST(false_negative, try_threshold_above_kmer_lemma)
}
catch( const std::runtime_error& e )
{
EXPECT_STREQ( ("Calculated configuration count table for threshold=[1, " + std::to_string(space.max_thresh) + "]. "
"Can't find FNR for threshold=" + std::to_string(t)).c_str(), e.what() );
EXPECT_STREQ( ("Calculated configuration count table for t=[1, " + std::to_string(space.max_thresh) + "]. "
"Can't find FNR for t=" + std::to_string(t)).c_str(), e.what() );
throw;
}
}, std::runtime_error );
Expand Down
6 changes: 3 additions & 3 deletions test/api/utilities/threshold/param_set_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ TEST(error_handling, kmer_size)
}
catch( const std::runtime_error& e )
{
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 35]").c_str(), e.what() );
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 21]").c_str(), e.what() );
throw;
}
}, std::runtime_error );
Expand All @@ -24,15 +24,15 @@ TEST(error_handling, kmer_size)
valik::param_set(kmer_size, thresh, space);
});

kmer_size = 36;
kmer_size = 22;
EXPECT_THROW({
try
{
valik::param_set(kmer_size, thresh, space);
}
catch( const std::runtime_error& e )
{
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 35]").c_str(), e.what() );
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 21]").c_str(), e.what() );
throw;
}
}, std::runtime_error );
Expand Down
4 changes: 2 additions & 2 deletions test/cli/valik_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ struct split_options : public valik_base {};
TEST_F(split_options, too_few_segments)
{
size_t n = 30;
size_t o = 10;
size_t o = 0;
cli_test_result const result = execute_app("valik", "split", data("query.fasta"),
"--seg-count", std::to_string(n),
"--pattern", std::to_string(o),
Expand All @@ -197,7 +197,7 @@ TEST_F(split_options, too_few_segments)
TEST_F(split_options, overlap_too_large)
{
size_t n = 30;
size_t o = 1000;
size_t o = 2000;
cli_test_result const result = execute_app("valik", "split", data("query.fasta"),
"--seg-count", std::to_string(n),
"--pattern", std::to_string(o),
Expand Down
Loading