From 152d90259bd87e626a42b4e0941daf67289da880 Mon Sep 17 00:00:00 2001 From: Evelin Date: Wed, 25 Sep 2024 14:01:03 +0200 Subject: [PATCH 1/2] Remove raptor data simulaiton submodule --- .gitmodules | 3 --- lib/raptor_data_simulation | 1 - test/data/README.md | 2 +- test/data/simulate_input.sh | 4 ++-- 4 files changed, 3 insertions(+), 7 deletions(-) delete mode 160000 lib/raptor_data_simulation diff --git a/.gitmodules b/.gitmodules index 533f40dd..c6e0fce8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -8,9 +8,6 @@ [submodule "lib/raptor"] path = lib/raptor url = https://github.com/seqan/raptor.git -[submodule "lib/raptor_data_simulation"] - path = lib/raptor_data_simulation - url = git@github.com:eaasna/raptor_data_simulation.git [submodule "lib/seqan"] path = lib/seqan url = git@github.com:seqan/seqan.git diff --git a/lib/raptor_data_simulation b/lib/raptor_data_simulation deleted file mode 160000 index 5046323f..00000000 --- a/lib/raptor_data_simulation +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5046323f5d516b35d10e599ef2f40c0ec0035387 diff --git a/test/data/README.md b/test/data/README.md index 053aed9e..e5cc4b5e 100644 --- a/test/data/README.md +++ b/test/data/README.md @@ -5,7 +5,7 @@ Large files (>50kb) should not be stored here, but rather fetched from a webserv Register each file (from external or internal storage) in the `datasources.cmake` file. There are bash scripts that set up the test cases: -1. Simulate the input data `simulate_input.sh`. The data simulation uses [raptor_data_simulation](https://github.com/eseiler/raptor_data_simulation/blob/master/src/simulate.sh "data simulation source code") which has to be built from source. Also set the correct binary directory in the script. +1. Simulate the input data `simulate_input.sh`. The data simulation uses [raptor_data_simulation](https://github.com/eaasna/raptor_data_simulation "data simulation source code") which has to be built from source. Also set the correct binary directory in the script. 2. Call the valik application and generate ground truth output files for testing with `create_output.sh`. The valik binaries have to be in PATH. 3. Register all the input and output files in the `datasources.cmake` file with `update_datasources.sh`. diff --git a/test/data/simulate_input.sh b/test/data/simulate_input.sh index 98bfa87d..6d33b762 100755 --- a/test/data/simulate_input.sh +++ b/test/data/simulate_input.sh @@ -17,8 +17,8 @@ for exec in "${execs[@]}"; do # trying to do some guessing here: paths=(../../build/bin) - paths+=(../../lib/raptor_data_simulation/build/bin) - paths+=(../../lib/raptor_data_simulation/build/src/mason2/src/mason2-build/bin) + paths+=(../../../raptor_data_simulation/build/bin) + paths+=(../../../raptor_data_simulation/build/src/mason2/src/mason2-build/bin) paths+=(../../../stellar3/build/bin) p="" From 31fc038c2f831f4c25400127fbacf1839315096e Mon Sep 17 00:00:00 2001 From: Evelin Date: Wed, 25 Sep 2024 14:33:47 +0200 Subject: [PATCH 2/2] Incorporate raptor submodule --- .gitmodules | 6 - include/raptor/LICENSE.md | 30 +++ include/raptor/adjust_seed.hpp | 42 ++++ include/raptor/dna4_traits.hpp | 25 +++ include/raptor/file_reader.hpp | 181 ++++++++++++++++++ include/raptor/strong_types.hpp | 26 +++ .../threshold/forward_strand_minimiser.hpp | 137 +++++++++++++ include/raptor/threshold/logspace.hpp | 59 ++++++ .../raptor/threshold/multiple_error_model.hpp | 25 +++ include/raptor/threshold/one_error_model.hpp | 25 +++ .../threshold/one_indirect_error_model.hpp | 23 +++ include/raptor/threshold/pascal_row.hpp | 23 +++ .../threshold/precompute_correction.hpp | 22 +++ .../raptor/threshold/precompute_threshold.hpp | 22 +++ include/raptor/threshold/threshold.hpp | 52 +++++ .../raptor/threshold/threshold_parameters.hpp | 41 ++++ lib/raptor | 1 - lib/robin-hood-hashing | 1 - src/CMakeLists.txt | 16 +- src/raptor/threshold/LICENSE.md | 30 +++ src/raptor/threshold/multiple_error_model.cpp | 82 ++++++++ src/raptor/threshold/one_error_model.cpp | 60 ++++++ .../threshold/one_indirect_error_model.cpp | 91 +++++++++ src/raptor/threshold/pascal_row.cpp | 31 +++ .../threshold/precompute_correction.cpp | 109 +++++++++++ src/raptor/threshold/precompute_threshold.cpp | 128 +++++++++++++ src/raptor/threshold/threshold.cpp | 64 +++++++ 27 files changed, 1335 insertions(+), 17 deletions(-) create mode 100644 include/raptor/LICENSE.md create mode 100644 include/raptor/adjust_seed.hpp create mode 100644 include/raptor/dna4_traits.hpp create mode 100644 include/raptor/file_reader.hpp create mode 100644 include/raptor/strong_types.hpp create mode 100644 include/raptor/threshold/forward_strand_minimiser.hpp create mode 100644 include/raptor/threshold/logspace.hpp create mode 100644 include/raptor/threshold/multiple_error_model.hpp create mode 100644 include/raptor/threshold/one_error_model.hpp create mode 100644 include/raptor/threshold/one_indirect_error_model.hpp create mode 100644 include/raptor/threshold/pascal_row.hpp create mode 100644 include/raptor/threshold/precompute_correction.hpp create mode 100644 include/raptor/threshold/precompute_threshold.hpp create mode 100644 include/raptor/threshold/threshold.hpp create mode 100644 include/raptor/threshold/threshold_parameters.hpp delete mode 160000 lib/raptor delete mode 160000 lib/robin-hood-hashing create mode 100644 src/raptor/threshold/LICENSE.md create mode 100644 src/raptor/threshold/multiple_error_model.cpp create mode 100644 src/raptor/threshold/one_error_model.cpp create mode 100644 src/raptor/threshold/one_indirect_error_model.cpp create mode 100644 src/raptor/threshold/pascal_row.cpp create mode 100644 src/raptor/threshold/precompute_correction.cpp create mode 100644 src/raptor/threshold/precompute_threshold.cpp create mode 100644 src/raptor/threshold/threshold.cpp diff --git a/.gitmodules b/.gitmodules index c6e0fce8..1ef73a98 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,12 +2,6 @@ path = lib/seqan3 url = https://github.com/seqan/seqan3.git branch = master -[submodule "lib/robin-hood-hashing"] - path = lib/robin-hood-hashing - url = https://github.com/martinus/robin-hood-hashing.git -[submodule "lib/raptor"] - path = lib/raptor - url = https://github.com/seqan/raptor.git [submodule "lib/seqan"] path = lib/seqan url = git@github.com:seqan/seqan.git diff --git a/include/raptor/LICENSE.md b/include/raptor/LICENSE.md new file mode 100644 index 00000000..70eb996f --- /dev/null +++ b/include/raptor/LICENSE.md @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/include/raptor/adjust_seed.hpp b/include/raptor/adjust_seed.hpp new file mode 100644 index 00000000..f62bc84d --- /dev/null +++ b/include/raptor/adjust_seed.hpp @@ -0,0 +1,42 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::adjust_seed. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor +{ + +/*\brief Adjust the default seed such that it does not interfere with the IBF's hashing. + *\param kmer_size The used k-mer size. For gapped shapes, this corresponds to the number of set bits (count()). + *\details + * + * The hashing used with the IBF assumes that the input values are uniformly distributed. + * However, we use a 64 bit seed, and unless the `kmer_size` is 32, not all 64 bits of the k-mers change. + * Hence, we need to shift the seed to the right. + * + * For example, using 2-mers and a seed of length 8 bit, the values for the k-mers will only change for the last 4 bits: + * + * ``` + * seed = 1111'1011 + * kmer = 0000'XXXX + * ``` + * + * `seed XOR kmer` will then always have 4 leading ones. + */ +static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept +{ + return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); +} + +} // namespace raptor diff --git a/include/raptor/dna4_traits.hpp b/include/raptor/dna4_traits.hpp new file mode 100644 index 00000000..8336890c --- /dev/null +++ b/include/raptor/dna4_traits.hpp @@ -0,0 +1,25 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::dna4_traits. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor +{ + +struct dna4_traits : seqan3::sequence_file_input_default_traits_dna +{ + using sequence_alphabet = seqan3::dna4; +}; + +} // namespace raptor diff --git a/include/raptor/file_reader.hpp b/include/raptor/file_reader.hpp new file mode 100644 index 00000000..70535c07 --- /dev/null +++ b/include/raptor/file_reader.hpp @@ -0,0 +1,181 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::file_reader. + * \author Enrico Seiler + */ + +#pragma once + +#include +#include + +#include +#include + +namespace raptor +{ + +enum class file_types +{ + sequence, + minimiser +}; + +template +class file_reader +{}; + +template <> +class file_reader +{ +public: + file_reader() = default; + file_reader(file_reader const &) = default; + file_reader(file_reader &&) = default; // GCOVR_EXCL_LINE + file_reader & operator=(file_reader const &) = default; + file_reader & operator=(file_reader &&) = default; + ~file_reader() = default; + + explicit file_reader(seqan3::shape const shape, uint32_t const window_size) : + minimiser_view{seqan3::views::minimiser_hash(shape, + seqan3::window_size{window_size}, + seqan3::seed{adjust_seed(shape.count())})} + {} + + template it_t> + void hash_into(std::vector const & filenames, it_t target) const + { + for (auto && filename : filenames) + hash_into(filename, target); + } + + template it_t> + void hash_into(std::string const & filename, it_t target) const + { + sequence_file_t fin{filename}; + for (auto && record : fin) + std::ranges::copy(record.sequence() | minimiser_view, target); + } + + template it_t> + void hash_into_if(std::vector const & filenames, it_t target, auto && pred) const + { + for (auto && filename : filenames) + hash_into_if(filename, target, pred); + } + + template it_t> + void hash_into_if(std::string const & filename, it_t target, auto && pred) const + { + sequence_file_t fin{filename}; + for (auto && record : fin) + std::ranges::copy_if(record.sequence() | minimiser_view, target, pred); + } + + void on_hash(std::vector const & filenames, auto && callback) const + { + for (auto && filename : filenames) + on_hash(filename, callback); + } + + void on_hash(std::string const & filename, auto && callback) const + { + sequence_file_t fin{filename}; + for (auto && record : fin) + callback(record.sequence() | minimiser_view); + } + + void for_each_hash(std::vector const & filenames, auto && callback) const + { + for (auto && filename : filenames) + for_each_hash(filename, callback); + } + + void for_each_hash(std::string const & filename, auto && callback) const + { + sequence_file_t fin{filename}; + for (auto && record : fin) + std::ranges::for_each(record.sequence() | minimiser_view, callback); + } + +private: + using sequence_file_t = seqan3::sequence_file_input>; + using view_t = decltype(seqan3::views::minimiser_hash(seqan3::shape{}, seqan3::window_size{}, seqan3::seed{})); + view_t minimiser_view = seqan3::views::minimiser_hash(seqan3::shape{}, seqan3::window_size{}, seqan3::seed{}); +}; + +template <> +class file_reader +{ +public: + file_reader() = default; + file_reader(file_reader const &) = default; + file_reader(file_reader &&) = default; + file_reader & operator=(file_reader const &) = default; + file_reader & operator=(file_reader &&) = default; + ~file_reader() = default; + + explicit file_reader(seqan3::shape const, uint32_t const) + {} + + template it_t> + void hash_into(std::vector const & filenames, it_t target) const + { + for (auto && filename : filenames) + hash_into(filename, target); + } + + template it_t> + void hash_into(std::string const & filename, it_t target) const + { + std::ifstream fin{filename, std::ios::binary}; + uint64_t value; + while (fin.read(reinterpret_cast(&value), sizeof(value))) + { + *target = value; + ++target; + } + } + + template it_t> + void hash_into_if(std::vector const & filenames, it_t target, auto && pred) const + { + for (auto && filename : filenames) + hash_into_if(filename, target, pred); + } + + template it_t> + void hash_into_if(std::string const & filename, it_t target, auto && pred) const + { + std::ifstream fin{filename, std::ios::binary}; + uint64_t value; + while (fin.read(reinterpret_cast(&value), sizeof(value))) + if (pred(value)) + { + *target = value; + ++target; + } + } + + void for_each_hash(std::vector const & filenames, auto && callback) const + { + for (auto && filename : filenames) + for_each_hash(filename, callback); + } + + void for_each_hash(std::string const & filename, auto && callback) const + { + std::ifstream fin{filename, std::ios::binary}; + uint64_t value; + while (fin.read(reinterpret_cast(&value), sizeof(value))) + callback(value); + } +}; + +} // namespace raptor diff --git a/include/raptor/strong_types.hpp b/include/raptor/strong_types.hpp new file mode 100644 index 00000000..9397a3a5 --- /dev/null +++ b/include/raptor/strong_types.hpp @@ -0,0 +1,26 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::window. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor +{ + +//!\brief Strong type for passing the window size. +struct window +{ + uint32_t v{}; +}; + +} // namespace raptor diff --git a/include/raptor/threshold/forward_strand_minimiser.hpp b/include/raptor/threshold/forward_strand_minimiser.hpp new file mode 100644 index 00000000..6fc425f7 --- /dev/null +++ b/include/raptor/threshold/forward_strand_minimiser.hpp @@ -0,0 +1,137 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::forward_strand_minimiser. + * \author Enrico Seiler + */ + +#pragma once + +#include + +#include +#include + +#include +#include + +namespace raptor::threshold +{ + +// Minimiser without looking at reverse complement +struct forward_strand_minimiser +{ +private: + //!\brief The window size of the minimiser. + uint64_t window_size{}; + //!\brief The shape to use. + seqan3::shape shape{}; + //!\brief The size of the shape. + uint8_t shape_size{}; + //!\brief Random but fixed value to xor k-mers with. Counteracts consecutive minimisers. + uint64_t seed{}; + //!\brief Stores the k-mer hashes of the forward strand. + std::vector forward_hashes{}; + +public: + //!\brief Stores the begin positions of the minimisers. + std::vector minimiser_begin; + + forward_strand_minimiser() = default; //!< Defaulted + forward_strand_minimiser(forward_strand_minimiser const &) = default; //!< Defaulted + forward_strand_minimiser(forward_strand_minimiser &&) = default; //!< Defaulted + forward_strand_minimiser & operator=(forward_strand_minimiser const &) = default; //!< Defaulted + forward_strand_minimiser & operator=(forward_strand_minimiser &&) = default; //!< Defaulted + ~forward_strand_minimiser() = default; //!< Defaulted + + /*!\brief Constructs a minimiser from given k-mer, window size and a seed. + * \param[in] window_size_ The window size. + * \param[in] shape_ The shape. + */ + forward_strand_minimiser(window const window_size_, seqan3::shape const shape_) : + window_size{window_size_.v}, + shape{shape_}, + shape_size{shape.size()}, + seed{adjust_seed(shape.count())} + { + assert(window_size >= shape_size); + } + + /*!\brief Resize the minimiser. + * \param[in] window_size_ The new window size. + * \param[in] shape_ The new shape. + */ + void resize(window const window_size_, seqan3::shape const shape_) + { + window_size = window_size_.v; + shape = shape_; + shape_size = shape.size(); + seed = adjust_seed(shape.count()); + assert(window_size >= shape_size); + } + + void compute(std::vector const & text) + { + assert(window_size && shape_size && seed); // Forgot to initialise/resize? + + size_t const text_length = text.size(); + assert(shape_size <= text_length); + assert(window_size <= text_length); + + uint64_t const max_number_of_minimiser = text_length - window_size + 1u; + uint64_t const kmers_per_window = window_size - shape_size + 1u; + + minimiser_begin.clear(); + minimiser_begin.reserve(max_number_of_minimiser); + + // Compute all k-mer hashes. + auto apply_xor = [this](uint64_t const value) + { + return value ^ seed; + }; + auto kmer_view = text | seqan3::views::kmer_hash(shape) | std::views::transform(apply_xor); + forward_hashes.assign(kmer_view.begin(), kmer_view.end()); + + // Stores hash and begin for all k-mers in the window + std::deque> window_hashes; + + // Initialisation. We need to compute all hashes for the first window. + for (uint64_t i = 0; i < kmers_per_window; ++i) + window_hashes.emplace_back(forward_hashes[i], i); + + // The minimum hash is the minimiser. Store the begin position. + auto min = std::min_element(std::begin(window_hashes), std::end(window_hashes)); + minimiser_begin.push_back(min->second); + + // For the following windows, we remove the first window k-mer (is now not in window) and add the new k-mer + // that results from the window shifting. + for (uint64_t i = kmers_per_window; i < max_number_of_minimiser; ++i) + { + // Already store the new hash without removing the first one. + uint64_t const new_hash{forward_hashes[i + kmers_per_window - 1]}; // Already did kmers_per_window - 1 many + window_hashes.emplace_back(new_hash, i); + + if (new_hash < min->second) // New hash is the minimum. + { + min = std::prev(std::end(window_hashes)); + minimiser_begin.push_back(min->second); + } + else if (min == std::begin(window_hashes)) // Minimum is the yet-to-be-removed begin of the window. + { + // The first hash will be removed, the last one is caught by the previous if. + min = std::min_element(++std::begin(window_hashes), std::prev(std::end(window_hashes))); + minimiser_begin.push_back(min->second); + } + + window_hashes.pop_front(); // Remove the first k-mer. + } + return; + } +}; + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/logspace.hpp b/include/raptor/threshold/logspace.hpp new file mode 100644 index 00000000..1f3209fb --- /dev/null +++ b/include/raptor/threshold/logspace.hpp @@ -0,0 +1,59 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides functionality for working with probabilities in logspace. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor::logspace +{ + +constexpr double ln_2{0.693147180559945309417232121458176568L}; +constexpr double negative_inf{-std::numeric_limits::infinity()}; + +//!\brief The log of a sum of two log terms. +// Produces correct result if either term is -inf. +// Needs a check for the case where both terms are -inf. +[[nodiscard]] inline double add(double const log_x, double const log_y) noexcept +{ + double const max{std::max(log_x, log_y)}; + return max == negative_inf ? negative_inf : max + std::log1p(std::exp(-std::abs(log_x - log_y))); +} + +//!\brief The log of a sum of multiple log terms. +template +[[nodiscard]] double add(double const log_x, double const log_y, types... logs) noexcept +{ + return add(add(log_y, log_x), logs...); +} + +//!\brief The log of a difference of two log terms. (log_x - log_y) +// expm1 is more accurate than using exp if the difference is close to 0. +// std::log1p(-std::exp(difference)) = std::log(1 + (-std::exp(difference))) +// = std::log(1 - std::exp(difference)) +// std::log(-std::expm1(difference)) = std::log(-(std::exp(difference) - 1)) +// = std::log(1 - std::exp(difference)) +[[nodiscard]] inline double substract(double const log_x, double const log_y) noexcept +{ + double const difference{log_y - log_x}; + return log_x + difference > -ln_2 ? std::log(-std::expm1(difference)) : std::log1p(-std::exp(difference)); +} + +struct add_fn +{ + [[nodiscard]] double operator()(double const log_x, double const log_y) const noexcept + { + return add(log_x, log_y); + }; +}; + +} // namespace raptor::logspace diff --git a/include/raptor/threshold/multiple_error_model.hpp b/include/raptor/threshold/multiple_error_model.hpp new file mode 100644 index 00000000..6b3a7154 --- /dev/null +++ b/include/raptor/threshold/multiple_error_model.hpp @@ -0,0 +1,25 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::multiple_error_model. + * \author Enrico Seiler + */ + +#pragma once + +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector multiple_error_model(size_t const number_of_minimisers, + size_t const errors, + std::vector const & affected_by_one_error_prob); + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/one_error_model.hpp b/include/raptor/threshold/one_error_model.hpp new file mode 100644 index 00000000..ff2078a0 --- /dev/null +++ b/include/raptor/threshold/one_error_model.hpp @@ -0,0 +1,25 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::one_error_model. + * \author Enrico Seiler + */ + +#pragma once + +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector one_error_model(size_t const kmer_size, + double const p_mean, + std::vector const & affected_by_one_error_indirectly_prob); + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/one_indirect_error_model.hpp b/include/raptor/threshold/one_indirect_error_model.hpp new file mode 100644 index 00000000..e58f620e --- /dev/null +++ b/include/raptor/threshold/one_indirect_error_model.hpp @@ -0,0 +1,23 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::one_indirect_error_model. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector +one_indirect_error_model(size_t const query_length, size_t const window_size, seqan3::shape const shape); + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/pascal_row.hpp b/include/raptor/threshold/pascal_row.hpp new file mode 100644 index 00000000..58bef5a8 --- /dev/null +++ b/include/raptor/threshold/pascal_row.hpp @@ -0,0 +1,23 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::pascal_row. + * \author Enrico Seiler + */ + +#pragma once + +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector pascal_row(size_t const n); + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/precompute_correction.hpp b/include/raptor/threshold/precompute_correction.hpp new file mode 100644 index 00000000..1f1b83ad --- /dev/null +++ b/include/raptor/threshold/precompute_correction.hpp @@ -0,0 +1,22 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::precompute_correction. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector precompute_correction(threshold_parameters const & arguments); + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/precompute_threshold.hpp b/include/raptor/threshold/precompute_threshold.hpp new file mode 100644 index 00000000..e3fc1a3b --- /dev/null +++ b/include/raptor/threshold/precompute_threshold.hpp @@ -0,0 +1,22 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::precompute_threshold. + * \author Enrico Seiler + */ + +#pragma once + +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector precompute_threshold(threshold_parameters const & arguments); + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/threshold.hpp b/include/raptor/threshold/threshold.hpp new file mode 100644 index 00000000..2a06acd1 --- /dev/null +++ b/include/raptor/threshold/threshold.hpp @@ -0,0 +1,52 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::threshold. + * \author Enrico Seiler + */ + +#pragma once + +#include +#include + +namespace raptor::threshold +{ + +class threshold +{ +public: + threshold() = default; + threshold(threshold const &) = default; + threshold & operator=(threshold const &) = default; + threshold(threshold &&) = default; + threshold & operator=(threshold &&) = default; + ~threshold() = default; + + threshold(threshold_parameters const & arguments); + + size_t get(size_t const minimiser_count) const noexcept; + +private: + enum class threshold_kinds + { + probabilistic, + lemma, + percentage + }; + + threshold_kinds threshold_kind{threshold_kinds::probabilistic}; + std::vector precomp_correction{}; + std::vector precomp_thresholds{}; + size_t kmer_lemma{}; + size_t minimal_number_of_minimizers{}; + size_t maximal_number_of_minimizers{}; + double threshold_percentage{}; +}; + +} // namespace raptor::threshold diff --git a/include/raptor/threshold/threshold_parameters.hpp b/include/raptor/threshold/threshold_parameters.hpp new file mode 100644 index 00000000..0f4764f3 --- /dev/null +++ b/include/raptor/threshold/threshold_parameters.hpp @@ -0,0 +1,41 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides raptor::threshold::threshold_parameters. + * \author Enrico Seiler + */ + +#pragma once + +#include + +#include + +namespace raptor::threshold +{ + +struct threshold_parameters +{ + // Basic. + uint32_t window_size{}; + seqan3::shape shape{}; + uint64_t query_length{}; + + // Threshold. + uint8_t errors{}; // threshold_kinds::(probabilistic|lemma) + double percentage{std::numeric_limits::quiet_NaN()}; // threshold_kinds::percentage + double p_max{}; // threshold_kinds::probabilistic + double fpr{}; // threshold_kinds::probabilistic + double tau{}; // threshold_kinds::probabilistic + + // Cache results. + bool cache_thresholds{}; + std::filesystem::path output_directory{}; +}; + +} // namespace raptor::threshold diff --git a/lib/raptor b/lib/raptor deleted file mode 160000 index 591433cc..00000000 --- a/lib/raptor +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 591433cce10c88c31ea9060a69c50d9f002dadc0 diff --git a/lib/robin-hood-hashing b/lib/robin-hood-hashing deleted file mode 160000 index 9145f963..00000000 --- a/lib/robin-hood-hashing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9145f963d80d6a02f0f96a47758050a89184a3ed diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae564ce0..62d2be49 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,10 +9,8 @@ target_link_libraries ("${PROJECT_NAME}_interface" INTERFACE seqan3::seqan3) target_link_libraries ("${PROJECT_NAME}_interface" INTERFACE sharg::sharg) target_include_directories ("${PROJECT_NAME}_interface" INTERFACE ../include) # SYSTEM keyword turns off compiler warnings for subprojects -target_include_directories ("${PROJECT_NAME}_interface" SYSTEM INTERFACE ../lib/raptor/include) target_include_directories ("${PROJECT_NAME}_interface" SYSTEM INTERFACE ../lib/stellar3/include) target_include_directories ("${PROJECT_NAME}_interface" SYSTEM INTERFACE ../lib/seqan/include) -target_include_directories ("${PROJECT_NAME}_interface" SYSTEM INTERFACE ../lib/robin-hood-hashing/src/include) target_compile_options ("${PROJECT_NAME}_interface" INTERFACE "-pedantic" "-Wall" "-Wextra" "-Wno-interference-size") # IBF size @@ -36,13 +34,13 @@ target_link_libraries ("${PROJECT_NAME}_build_lib" PUBLIC "${PROJECT_NAME}_inter # Valik search add_library ("raptor_threshold" STATIC - ../lib/raptor/src/threshold/threshold.cpp - ../lib/raptor/src/threshold/one_indirect_error_model.cpp - ../lib/raptor/src/threshold/multiple_error_model.cpp - ../lib/raptor/src/threshold/pascal_row.cpp - ../lib/raptor/src/threshold/one_error_model.cpp - ../lib/raptor/src/threshold/precompute_correction.cpp - ../lib/raptor/src/threshold/precompute_threshold.cpp + raptor/threshold/threshold.cpp + raptor/threshold/one_indirect_error_model.cpp + raptor/threshold/multiple_error_model.cpp + raptor/threshold/pascal_row.cpp + raptor/threshold/one_error_model.cpp + raptor/threshold/precompute_correction.cpp + raptor/threshold/precompute_threshold.cpp ) target_link_libraries ("raptor_threshold" PUBLIC "${PROJECT_NAME}_interface") diff --git a/src/raptor/threshold/LICENSE.md b/src/raptor/threshold/LICENSE.md new file mode 100644 index 00000000..70eb996f --- /dev/null +++ b/src/raptor/threshold/LICENSE.md @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/raptor/threshold/multiple_error_model.cpp b/src/raptor/threshold/multiple_error_model.cpp new file mode 100644 index 00000000..275854f1 --- /dev/null +++ b/src/raptor/threshold/multiple_error_model.cpp @@ -0,0 +1,82 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::multiple_error_model. + * \author Enrico Seiler + */ + +#include + +#include +#include + +namespace raptor::threshold +{ + +// affected_by_error [2, 0, 1] => 3 errors. First affects two minimisers, second none, third one minimiser. +// current_error: All possible error configs are enumerated. current_error describes which error is to be enumerated. +void impl(size_t const minimisers_to_affect, + std::vector const & affected_by_one_error_prob, + std::vector affected_by_error, + size_t const current_error, + double & result) +{ + if (!minimisers_to_affect) + { + // Determine the probability that this comination of errors occurs. + double current_prob{}; + + // The probability that the errors affect minimisers in this specific way. + for (size_t i = 0; i < current_error; ++i) + current_prob += affected_by_one_error_prob[affected_by_error[i]]; + // Then the other errors must not affect any minimisers. + for (size_t i = current_error; i < affected_by_error.size(); ++i) + current_prob += affected_by_one_error_prob[0]; + + result = logspace::add(result, current_prob); + return; + } + + if (current_error >= affected_by_error.size()) + return; + + // Enumerate. Can't use too many minimisers and can't destroy more than proba.size() with one error. + for (size_t i = 0; i <= minimisers_to_affect && i < affected_by_one_error_prob.size(); ++i) + { + affected_by_error[current_error] = i; + impl(minimisers_to_affect - i, affected_by_one_error_prob, affected_by_error, current_error + 1, result); + } +} + +[[nodiscard]] std::vector multiple_error_model(size_t const number_of_minimisers, + size_t const errors, + std::vector const & affected_by_one_error_prob) +{ + size_t const window_size{affected_by_one_error_prob.size() - 1}; + size_t const max_affected{std::clamp(errors * window_size, 0u, number_of_minimisers)}; + std::vector affected_by_e_errors(max_affected + 1, 0); + + double sum{logspace::negative_inf}; + + // Enumerate all combinations which lead to i many affected minimisers using e errors. + for (size_t i = 0; i <= max_affected; ++i) + { + double result{logspace::negative_inf}; + impl(i, affected_by_one_error_prob, std::vector(errors, 0), 0, result); + affected_by_e_errors[i] = result; + sum = logspace::add(sum, result); + } + + // Normalise. + for (auto & x : affected_by_e_errors) + x -= sum; + + return affected_by_e_errors; +} + +} // namespace raptor::threshold diff --git a/src/raptor/threshold/one_error_model.cpp b/src/raptor/threshold/one_error_model.cpp new file mode 100644 index 00000000..f5a25d3c --- /dev/null +++ b/src/raptor/threshold/one_error_model.cpp @@ -0,0 +1,60 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::one_error_model. + * \author Enrico Seiler + */ + +#include +#include + +#include +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector one_error_model(size_t const kmer_size, + double const p_mean, + std::vector const & affected_by_one_error_indirectly_prob) +{ + size_t const window_size{affected_by_one_error_indirectly_prob.size() - 1}; + std::vector const coefficients{pascal_row(kmer_size)}; + // Probabilities that i minimisers are affected by one error. + std::vector probabilities(window_size + 1, logspace::negative_inf); + double const inv_p_mean{logspace::substract(0, p_mean)}; + + // One error can affect between 0 and w minimisers. Direct errors 0 to k. + for (size_t i = 0; i <= kmer_size; ++i) + { + // Probability that i minimisers are directly modified by one error. + double const p_direct{coefficients[i] + i * p_mean + (kmer_size - i) * inv_p_mean}; + + // Incorporate indirect errors: + // p_direct = probability that one error affects i minimisers directly + // affected_by_one_error_indirectly_prob[j] = probability that one error affects j minimisers indirectly + // At most w many minimisers can be affected: i + j <= window_size + // indirect and direct errors occur independently + // The for loops will enumerate all combinations of i and j for achieving 0 to w many affected minimiser. + for (size_t j = 0; i + j <= window_size; ++j) + probabilities[i + j] = + logspace::add(probabilities[i + j], p_direct + affected_by_one_error_indirectly_prob[j]); + } + + // Normalise probabilities. + double const p_sum = + std::accumulate(probabilities.begin(), probabilities.end(), logspace::negative_inf, logspace::add_fn{}); + + for (double & x : probabilities) + x -= p_sum; + + return probabilities; +} + +} // namespace raptor::threshold diff --git a/src/raptor/threshold/one_indirect_error_model.cpp b/src/raptor/threshold/one_indirect_error_model.cpp new file mode 100644 index 00000000..992c41e1 --- /dev/null +++ b/src/raptor/threshold/one_indirect_error_model.cpp @@ -0,0 +1,91 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::one_indirect_error_model. + * \author Enrico Seiler + */ + +#include + +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector +one_indirect_error_model(size_t const query_length, size_t const window_size, seqan3::shape const shape) +{ + uint8_t const kmer_size{shape.size()}; + size_t const max_number_of_minimiser{query_length - window_size + 1}; + size_t const iterations{10'000}; + + std::mt19937_64 gen{0x1D2B8284D988C4D0}; + std::uniform_int_distribution random_error_position{0u, query_length - 1u}; + std::uniform_int_distribution random_dna4_rank{0u, 3u}; + auto random_dna = [&random_dna4_rank, &gen]() + { + return seqan3::assign_rank_to(random_dna4_rank(gen), seqan3::dna4{}); + }; + + std::vector sequence(query_length); + + // Minimiser begin positions of original sequence + std::vector minimiser_positions(max_number_of_minimiser, false); + // Minimiser begin positions after introducing one error into the sequence + std::vector minimiser_positions_error(max_number_of_minimiser, false); + // In the worst case, one error can indirectly affect w minimisers + std::vector result(window_size + 1, 0.0); + forward_strand_minimiser fwd_minimiser{window{static_cast(window_size)}, shape}; + + for (size_t iteration = 0; iteration < iterations; ++iteration) + { + std::ranges::fill(minimiser_positions, 0u); + std::ranges::fill(minimiser_positions_error, 0u); + std::ranges::generate(sequence, random_dna); + + // Minimiser begin positions of original sequence + fwd_minimiser.compute(sequence); + for (auto pos : fwd_minimiser.minimiser_begin) + minimiser_positions[pos] = true; + + // Introduce one error + size_t const error_position = random_error_position(gen); + uint8_t new_rank{random_dna4_rank(gen)}; + while (new_rank == seqan3::to_rank(sequence[error_position])) + new_rank = random_dna4_rank(gen); + sequence[error_position] = seqan3::assign_rank_to(new_rank, seqan3::dna4{}); + + // Minimiser begin positions after introducing one error into the sequence + fwd_minimiser.compute(sequence); + for (auto pos : fwd_minimiser.minimiser_begin) + minimiser_positions_error[pos] = true; + + // Determine number of affected minimisers + size_t affected_minimiser{}; + // An error destroyed a minimiser indirectly iff + // (1) A minimiser begin position changed and + // (2) The error occurs before the window or after the window + for (size_t i = 0; i < max_number_of_minimiser; ++i) + { + affected_minimiser += (minimiser_positions[i] != minimiser_positions_error[i]) && // (1) + ((error_position < i) || (i + kmer_size < error_position)); // (2) + } + + ++result[affected_minimiser]; + } + + // Convert counts to log probabilities + double const log_iterations{std::log(iterations)}; + for (double & x : result) + x = std::log(x) - log_iterations; + + return result; +} + +} // namespace raptor::threshold diff --git a/src/raptor/threshold/pascal_row.cpp b/src/raptor/threshold/pascal_row.cpp new file mode 100644 index 00000000..08c5c01c --- /dev/null +++ b/src/raptor/threshold/pascal_row.cpp @@ -0,0 +1,31 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::pascal_row. + * \author Enrico Seiler + */ + +#include +#include + +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::vector pascal_row(size_t const n) +{ + std::vector result(n + 1); + + for (size_t i = 1; i <= n; ++i) + result[i] = result[i - 1] + std::log((n + 1 - i) / i); + + return result; +} + +} // namespace raptor::threshold diff --git a/src/raptor/threshold/precompute_correction.cpp b/src/raptor/threshold/precompute_correction.cpp new file mode 100644 index 00000000..287cbdd7 --- /dev/null +++ b/src/raptor/threshold/precompute_correction.cpp @@ -0,0 +1,109 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::precompute_correction. + * \author Enrico Seiler + */ + +#include + +#include + +#include +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::string const correction_filename(threshold_parameters const & arguments) +{ + std::stringstream stream{}; + stream << "correction_" << std::hex << arguments.query_length << '_' << arguments.window_size << '_' + << arguments.shape.to_ulong() << '_' << arguments.p_max << '_' << arguments.fpr << ".bin"; + std::string result = stream.str(); + if (auto it = result.find("0."); it != std::string::npos) + result.replace(it, 2, ""); + if (auto it = result.find("0."); it != std::string::npos) + result.replace(it, 2, ""); + return result; +} + +void write_correction(std::vector const & vec, threshold_parameters const & arguments) +{ + if (!arguments.cache_thresholds) + return; + + std::filesystem::path filename = arguments.output_directory / correction_filename(arguments); + std::ofstream os{filename, std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(vec); +} + +bool read_correction(std::vector & vec, threshold_parameters const & arguments) +{ + std::filesystem::path filename = arguments.output_directory / correction_filename(arguments); + if (!arguments.cache_thresholds || !std::filesystem::exists(filename)) + return false; + + std::ifstream is{filename, std::ios::binary}; + cereal::BinaryInputArchive iarchive{is}; + iarchive(vec); + return true; +} + +[[nodiscard]] std::vector precompute_correction(threshold_parameters const & arguments) +{ + uint8_t const kmer_size{arguments.shape.size()}; + assert(arguments.window_size != kmer_size); // Use k-mer lemma. + assert(std::isnan(arguments.percentage)); // Use percentage. + + std::vector correction; + + if (read_correction(correction, arguments)) + return correction; + + double const fpr{std::log(arguments.fpr)}; + double const inv_fpr{std::log(1.0 - arguments.fpr)}; + double const log_p_max{std::log(arguments.p_max)}; + size_t const kmers_per_window{arguments.window_size - kmer_size + 1}; + size_t const kmers_per_pattern{arguments.query_length - kmer_size + 1}; + size_t const minimal_number_of_minimisers{kmers_per_pattern / kmers_per_window}; + size_t const maximal_number_of_minimisers{arguments.query_length - arguments.window_size + 1}; + + correction.reserve(maximal_number_of_minimisers - minimal_number_of_minimisers + 1); + + auto binom = [&fpr, &inv_fpr](std::vector const & binom_coeff, + size_t const number_of_minimisers, + size_t const number_of_fp) + { + return binom_coeff[number_of_fp] + number_of_fp * fpr + (number_of_minimisers - number_of_fp) * inv_fpr; + }; + + // Iterate over the possible number of minimisers. + for (size_t number_of_minimisers = minimal_number_of_minimisers; + number_of_minimisers <= maximal_number_of_minimisers; + ++number_of_minimisers) + { + size_t number_of_fp{1u}; + std::vector const binom_coeff{pascal_row(number_of_minimisers)}; + // How many FPs to expect for a given fpr and number of minimisers? + // The probability of seeing this many FP must be below p_max. + while (binom(binom_coeff, number_of_minimisers, number_of_fp) >= log_p_max) + ++number_of_fp; // GCOVR_EXCL_LINE + + correction.push_back(number_of_fp - 1); + } + assert(correction.size() != 0); + + write_correction(correction, arguments); + + return correction; +} + +} // namespace raptor::threshold diff --git a/src/raptor/threshold/precompute_threshold.cpp b/src/raptor/threshold/precompute_threshold.cpp new file mode 100644 index 00000000..369aab26 --- /dev/null +++ b/src/raptor/threshold/precompute_threshold.cpp @@ -0,0 +1,128 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::precompute_threshold. + * \author Enrico Seiler + */ + +#include + +#include + +#include +#include +#include +#include +#include + +namespace raptor::threshold +{ + +[[nodiscard]] std::string const threshold_filename(threshold_parameters const & arguments) +{ + std::stringstream stream{}; + stream << "threshold_" << std::hex << arguments.query_length << '_' << arguments.window_size << '_' + << arguments.shape.to_ulong() << '_' << static_cast(arguments.errors) << '_' << arguments.tau + << ".bin"; + std::string result = stream.str(); + if (auto it = result.find("0."); it != std::string::npos) + result.replace(it, 2, ""); + return result; +} + +void write_thresholds(std::vector const & vec, threshold_parameters const & arguments) +{ + if (!arguments.cache_thresholds) + return; + + std::filesystem::path filename = arguments.output_directory / threshold_filename(arguments); + std::ofstream os{filename, std::ios::binary}; + cereal::BinaryOutputArchive oarchive{os}; + oarchive(vec); +} + +bool read_thresholds(std::vector & vec, threshold_parameters const & arguments) +{ + std::filesystem::path filename = arguments.output_directory / threshold_filename(arguments); + if (!arguments.cache_thresholds || !std::filesystem::exists(filename)) + return false; + + std::ifstream is{filename, std::ios::binary}; + cereal::BinaryInputArchive iarchive{is}; + iarchive(vec); + return true; +} + +[[nodiscard]] std::vector precompute_threshold(threshold_parameters const & arguments) +{ + uint8_t const kmer_size{arguments.shape.size()}; + assert(arguments.window_size != kmer_size); // Use k-mer lemma. + assert(std::isnan(arguments.percentage)); // Use percentage. + + std::vector thresholds; + + if (read_thresholds(thresholds, arguments)) + return thresholds; + + double const log_tau{std::log(arguments.tau)}; + size_t const kmers_per_window{arguments.window_size - kmer_size + 1}; + size_t const kmers_per_pattern{arguments.query_length - kmer_size + 1}; + size_t const minimal_number_of_minimisers{kmers_per_pattern / kmers_per_window}; + size_t const maximal_number_of_minimisers{arguments.query_length - arguments.window_size + 1}; + + thresholds.reserve(maximal_number_of_minimisers - minimal_number_of_minimisers + 1); + + // Probability that i minimisers are indirectly affected by one error. + std::vector const affected_by_one_error_indirectly_prob{ + one_indirect_error_model(arguments.query_length, arguments.window_size, arguments.shape)}; + + // Iterate over the possible number of minimisers. + for (size_t number_of_minimisers = minimal_number_of_minimisers; + number_of_minimisers <= maximal_number_of_minimisers; + ++number_of_minimisers) + { + // Probability that a minimiser starts at index i. Uniform => Number of minimisers / possible indices. + double const uniform_start_index_prob{std::log(number_of_minimisers) - std::log(kmers_per_pattern)}; + + // Probability that i minimisers are affected by one error (directly or indirectly). + std::vector const affected_by_one_error_prob{ + one_error_model(kmer_size, uniform_start_index_prob, affected_by_one_error_indirectly_prob)}; + + // Probability that i minimisers are affected e errors. + std::vector const affected_by_e_errors_prob{ + multiple_error_model(number_of_minimisers, arguments.errors, affected_by_one_error_prob)}; + + // Max number of affected minimisers as predicted by `multiple_error_model`. Used for a check when adding + // the probabilities. + // This check is not strictly necessary, but in case of floating point number inaccuracies, it prevents + // adding all probabilities in `affected_by_e_errors_prob`. + // While `affected_by_e_errors_prob` computes all probabilities according to a theoretical worst case, + // in practice, there are probabilites of 0 starting at a certain number of affected minimisers. + size_t const max_affected = + std::ranges::find(affected_by_e_errors_prob, logspace::negative_inf) - affected_by_e_errors_prob.begin(); + + // The fraction of covered cases. + double cumulative_prob{affected_by_e_errors_prob[0]}; + // How many minimisers are affected at most... + size_t affected_minimisers{}; + // such that threshold holds with a probability of at least (1 - tau)? + while (cumulative_prob < log_tau && affected_minimisers < max_affected) + cumulative_prob = logspace::add(cumulative_prob, affected_by_e_errors_prob[++affected_minimisers]); + + assert(affected_minimisers <= number_of_minimisers); + // Hence, there are at least this many left unaffected (threshold). + thresholds.push_back(number_of_minimisers - affected_minimisers); + } + assert(thresholds.size() == maximal_number_of_minimisers - minimal_number_of_minimisers + 1); + + write_thresholds(thresholds, arguments); + + return thresholds; +} + +} // namespace raptor::threshold diff --git a/src/raptor/threshold/threshold.cpp b/src/raptor/threshold/threshold.cpp new file mode 100644 index 00000000..1d57418c --- /dev/null +++ b/src/raptor/threshold/threshold.cpp @@ -0,0 +1,64 @@ +// -------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md +// -------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Implements raptor::threshold::threshold. + * \author Enrico Seiler + */ + +#include + +namespace raptor::threshold +{ + +threshold::threshold(threshold_parameters const & arguments) +{ + uint8_t const kmer_size{arguments.shape.size()}; + size_t const kmers_per_window = arguments.window_size - kmer_size + 1; + + if (!std::isnan(arguments.percentage)) + { + threshold_kind = threshold_kinds::percentage; + threshold_percentage = arguments.percentage; + } + else if (kmers_per_window == 1u) + { + threshold_kind = threshold_kinds::lemma; + size_t const kmer_lemma_minuend = arguments.query_length + 1u; + size_t const kmer_lemma_subtrahend = (arguments.errors + 1u) * kmer_size; + kmer_lemma = kmer_lemma_minuend > kmer_lemma_subtrahend ? kmer_lemma_minuend - kmer_lemma_subtrahend : 1u; + } + else + { + threshold_kind = threshold_kinds::probabilistic; + size_t const kmers_per_pattern = arguments.query_length - kmer_size + 1; + minimal_number_of_minimizers = kmers_per_pattern / kmers_per_window; + maximal_number_of_minimizers = arguments.query_length - arguments.window_size + 1; + precomp_correction = precompute_correction(arguments); + precomp_thresholds = precompute_threshold(arguments); + } +} + +size_t threshold::get(size_t const minimiser_count) const noexcept +{ + switch (threshold_kind) + { + case threshold_kinds::lemma: + return kmer_lemma; + case threshold_kinds::percentage: + return std::max(1u, minimiser_count * threshold_percentage); + default: + { + assert(threshold_kind == threshold_kinds::probabilistic); + size_t const index = std::clamp(minimiser_count, minimal_number_of_minimizers, maximal_number_of_minimizers) + - minimal_number_of_minimizers; + return std::max(1u, precomp_thresholds[index] + precomp_correction[index]); + } + } +} + +} // namespace raptor::threshold