Skip to content

Commit

Permalink
Redo "Increase max bound for search arguments (#126)""
Browse files Browse the repository at this point in the history
This reverts commit 08b6a7e.
  • Loading branch information
eaasna committed Nov 7, 2024
1 parent 08b6a7e commit fcffcae
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 45 deletions.
6 changes: 3 additions & 3 deletions include/utilities/threshold/basics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,17 @@ inline size_t kmer_lemma_threshold(size_t const l, uint8_t const k, uint8_t cons
struct param_space
{
constexpr static uint8_t max_errors{15};
uint8_t max_thresh{20};
uint16_t max_thresh{20};
constexpr static size_t max_len{150};
constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 21};
constexpr static std::pair<uint8_t, uint8_t> kmer_range{9, 35};

param_space() noexcept = default;
param_space(param_space const &) noexcept = default;
param_space & operator=(param_space const &) noexcept = default;
param_space & operator=(param_space &&) noexcept = default;
~param_space() noexcept = default;

param_space(uint8_t thresh) : max_thresh(thresh) {}
param_space(uint16_t thresh) : max_thresh(thresh) {}

uint8_t min_k() const
{
Expand Down
12 changes: 10 additions & 2 deletions include/utilities/threshold/kmer_loss.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <cereal/archives/binary.hpp>
#include <cereal/types/vector.hpp>

#include <seqan3/core/debug_stream.hpp>

namespace valik
{

Expand Down Expand Up @@ -81,8 +83,14 @@ struct kmer_loss
if (kmer_lemma_threshold(pattern.l, params.k, pattern.e) >= params.t)
return 0.0;
if (params.t > fn_conf_counts.size())
throw std::runtime_error("Calculated configuration count table for t=[1, " + std::to_string(fn_conf_counts.size()) + "]. "
"Can't find FNR for t=" + std::to_string(params.t));
throw std::runtime_error("Calculated configuration count table for threshold=[1, " + std::to_string(fn_conf_counts.size()) + "]. "
"Can't find FNR for threshold=" + std::to_string(params.t));
if (pattern.e >= fn_conf_counts[0].size())
throw std::runtime_error("Calculated configuration count table for errors=[0, " + std::to_string(fn_conf_counts[0].size()) + "]. "
"Can't find FNR for errors=" + std::to_string(pattern.e));
if (pattern.l >= fn_conf_counts[0][0].size())
throw std::runtime_error("Calculated configuration count table for min_len=[1, " + std::to_string(fn_conf_counts[0][0].size()) + "]. "
"Can't find FNR for min_len=" + std::to_string(pattern.l));
return fn_conf_counts[params.t - 1][pattern.e][pattern.l] / (double) pattern.total_conf_count();
}

Expand Down
11 changes: 9 additions & 2 deletions include/utilities/threshold/param_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,31 @@ namespace valik
struct param_set
{
uint8_t k;
uint8_t t;
uint16_t t;

param_set() noexcept = default;
param_set(param_set const &) noexcept = default;
param_set & operator=(param_set const &) noexcept = default;
param_set & operator=(param_set &&) noexcept = default;
~param_set() noexcept = default;

param_set(uint8_t const kmer_size, uint8_t const thresh, param_space const & space) : k(kmer_size), t(thresh)
param_set(uint8_t const kmer_size, uint16_t const thresh, param_space const & space) : k(kmer_size), t(thresh)
{
if ((kmer_size < std::get<0>(space.kmer_range)) | (kmer_size > std::get<1>(space.kmer_range)))
{
throw std::runtime_error{"k=" + std::to_string(kmer_size) + " out of range [" +
std::to_string(std::get<0>(space.kmer_range)) + ", " +
std::to_string(std::get<1>(space.kmer_range)) + "]"};
}

if (thresh > space.max_thresh)
{
throw std::runtime_error{"t=" + std::to_string(thresh) + " out of range [0, " + std::to_string(space.max_thresh) + "]"};
}
}

param_set(uint8_t const kmer_size, uint16_t const thresh) : k(kmer_size), t(thresh) {}

template <class Archive>
void serialize(Archive & archive)
{
Expand Down
2 changes: 1 addition & 1 deletion include/valik/search/search_local.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ bool search_local(search_arguments & arguments, search_time_statistics & time_st
{
search_pattern pattern(arguments.errors, arguments.pattern_size);
param_space space;
param_set params(arguments.shape_size, arguments.threshold, space);
param_set params(arguments.shape_size, arguments.threshold);
filtering_request request(pattern, ref_meta, query_meta.value());
if ((request.fpr(params) > 0.2) && (arguments.search_type != search_kind::STELLAR))
std::cerr << "WARNING: Prefiltering will be inefficient for a high error rate.\n";
Expand Down
2 changes: 1 addition & 1 deletion lib/seqan
5 changes: 3 additions & 2 deletions src/argument_parsing/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ namespace valik::app

void init_split_parser(sharg::parser & parser, split_arguments & arguments)
{
param_space space{};
init_shared_meta(parser);
parser.add_positional_option(arguments.db_file,
sharg::config{.description = "File containing database sequences. If splitting --metagenome provide a text file with a list of cluster paths.",
Expand All @@ -21,7 +22,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
sharg::config{.short_id = '\0',
.long_id = "pattern",
.description = "Choose how much consecutive segments overlap. This is the minimum length of a local alignment.",
.validator = positive_integer_validator{true}});
.validator = sharg::arithmetic_range_validator{10u, 1000u}});
parser.add_option(arguments.error_rate,
sharg::config{.short_id = 'e',
.long_id = "error-rate",
Expand All @@ -36,7 +37,7 @@ void init_split_parser(sharg::parser & parser, split_arguments & arguments)
sharg::config{.short_id = 'k',
.long_id = "kmer",
.description = "Choose the kmer size.",
.validator = positive_integer_validator{true}});
.validator = sharg::arithmetic_range_validator{space.min_k(), space.max_k()}});
parser.add_option(arguments.seg_count_in,
sharg::config{.short_id = 'n',
.long_id = "seg-count",
Expand Down
64 changes: 39 additions & 25 deletions src/threshold/find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ param_set get_best_params(search_pattern const & pattern,
bool const verbose)
{
param_space space = fn_attr.space;

if (kmer_lemma_threshold(pattern.l, space.max_k(), pattern.e) > space.max_thresh)
return param_set(space.max_k(), kmer_lemma_threshold(pattern.l, space.max_k(), pattern.e));

param_set best_params(space.min_k(), 1, space);
auto best_score = score(fn_attr.get_kmer_loss(space.min_k()), pattern, best_params, ref_meta, PATTERNS_PER_SEGMENT);

Expand Down Expand Up @@ -94,41 +98,51 @@ search_kmer_profile find_thresholds_for_kmer_size(metadata const & ref_meta,
kmer_loss attr,
uint8_t const max_errors)
{
param_space space{};
constexpr param_space space{};
search_kmer_profile kmer_thresh{attr.k, ref_meta.pattern_size};
for (uint8_t errors{0}; errors <= max_errors && errors <= space.max_errors; errors++)
for (uint8_t errors{0}; errors <= max_errors; errors++)
{
search_pattern pattern(errors, ref_meta.pattern_size);
search_kind search_type{search_kind::LEMMA};
auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors), space);
auto try_params = best_params;

if ((best_params.t < THRESH_LOWER) ||
segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER)
if (kmer_lemma_threshold(pattern.l, attr.k, errors) > space.max_thresh)
{
search_type = search_kind::STELLAR;
try_params.t++;
while ((try_params.t <= space.max_thresh) &&
(attr.fnr_for_param_set(pattern, try_params) < FNR_UPPER))
auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors));
double fnr{0}; // == attr.fnr_for_param_set(pattern, best_params)
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});
}
else
{
auto best_params = param_set(attr.k, kmer_lemma_threshold(pattern.l, attr.k, errors), space);
auto try_params = best_params;
if ((best_params.t < THRESH_LOWER) ||
segment_fpr(ref_meta.pattern_spurious_match_prob(best_params), PATTERNS_PER_SEGMENT) > FPR_UPPER)
{
if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER)
search_type = search_kind::STELLAR;
try_params.t++;
while ((try_params.t <= space.max_thresh) &&
(attr.fnr_for_param_set(pattern, try_params) < FNR_UPPER))
{
best_params = std::move(try_params);
search_type = search_kind::HEURISTIC;
if (segment_fpr(ref_meta.pattern_spurious_match_prob(try_params), PATTERNS_PER_SEGMENT) <= FPR_UPPER)
{
best_params = std::move(try_params);
search_type = search_kind::HEURISTIC;
}
try_params.t++;
}
try_params.t++;
}
}

if (search_type == search_kind::STELLAR)
{
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type});
}
else
{
double fnr = attr.fnr_for_param_set(pattern, best_params);
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});

if (search_type == search_kind::STELLAR)
{
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type});
}
else
{
double fnr = attr.fnr_for_param_set(pattern, best_params);
double fp_per_pattern = ref_meta.pattern_spurious_match_prob(best_params);
kmer_thresh.add_error_rate(errors, {best_params, pattern, search_type, fnr, fp_per_pattern});
}
}
}

Expand Down
12 changes: 8 additions & 4 deletions test/api/utilities/threshold/kmer_loss_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,11 @@ TEST(kmer_loss, exhaustive_comparison)
}
}

void try_fnr(uint8_t e, size_t l, uint8_t k, uint8_t t, double expected_fnr)
void try_fnr(uint8_t e, size_t l, uint8_t k, uint16_t t, double expected_fnr)
{
valik::param_space space{};
valik::search_pattern pattern(e, l);
valik::param_set param(k, t, space);
valik::param_set param(k, t);

valik::fn_confs fn_attr(space);

Expand All @@ -187,6 +187,10 @@ TEST(false_negative, try_kmer_lemma_thershold)
try_fnr(0u, (size_t) 50, 16u, 35u, 0.0);
try_fnr(1u, (size_t) 50, 16u, 19u, 0.0);
try_fnr(2u, (size_t) 50, 16u, 3u, 0.0);

try_fnr(0u, (size_t) 1000, 35u, 966u, 0.0);
try_fnr(1u, (size_t) 1000, 35u, 931u, 0.0);
try_fnr(2u, (size_t) 1000, 35u, 896u, 0.0);
}

TEST(false_negative, try_threshold_above_kmer_lemma)
Expand All @@ -200,8 +204,8 @@ TEST(false_negative, try_threshold_above_kmer_lemma)
}
catch( const std::runtime_error& e )
{
EXPECT_STREQ( ("Calculated configuration count table for t=[1, " + std::to_string(space.max_thresh) + "]. "
"Can't find FNR for t=" + std::to_string(t)).c_str(), e.what() );
EXPECT_STREQ( ("Calculated configuration count table for threshold=[1, " + std::to_string(space.max_thresh) + "]. "
"Can't find FNR for threshold=" + std::to_string(t)).c_str(), e.what() );
throw;
}
}, std::runtime_error );
Expand Down
6 changes: 3 additions & 3 deletions test/api/utilities/threshold/param_set_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ TEST(error_handling, kmer_size)
}
catch( const std::runtime_error& e )
{
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 21]").c_str(), e.what() );
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 35]").c_str(), e.what() );
throw;
}
}, std::runtime_error );
Expand All @@ -24,15 +24,15 @@ TEST(error_handling, kmer_size)
valik::param_set(kmer_size, thresh, space);
});

kmer_size = 22;
kmer_size = 36;
EXPECT_THROW({
try
{
valik::param_set(kmer_size, thresh, space);
}
catch( const std::runtime_error& e )
{
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 21]").c_str(), e.what() );
EXPECT_STREQ( ("k=" + std::to_string(kmer_size) + " out of range [9, 35]").c_str(), e.what() );
throw;
}
}, std::runtime_error );
Expand Down
4 changes: 2 additions & 2 deletions test/cli/valik_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ struct split_options : public valik_base {};
TEST_F(split_options, too_few_segments)
{
size_t n = 30;
size_t o = 0;
size_t o = 10;
cli_test_result const result = execute_app("valik", "split", data("query.fasta"),
"--seg-count", std::to_string(n),
"--pattern", std::to_string(o),
Expand All @@ -197,7 +197,7 @@ TEST_F(split_options, too_few_segments)
TEST_F(split_options, overlap_too_large)
{
size_t n = 30;
size_t o = 2000;
size_t o = 1000;
cli_test_result const result = execute_app("valik", "split", data("query.fasta"),
"--seg-count", std::to_string(n),
"--pattern", std::to_string(o),
Expand Down

0 comments on commit fcffcae

Please sign in to comment.