diff --git a/CMakeLists.txt b/CMakeLists.txt index 86717c759..8211fb370 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,6 +192,7 @@ set(HEYOKA_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/setup_variational_ics.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp" # NOTE: this will be an empty file in case we are not # building with support for real. "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp" diff --git a/include/heyoka/detail/aligned_buffer.hpp b/include/heyoka/detail/aligned_buffer.hpp new file mode 100644 index 000000000..1b68bc867 --- /dev/null +++ b/include/heyoka/detail/aligned_buffer.hpp @@ -0,0 +1,39 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef HEYOKA_DETAIL_ALIGNED_BUFFER_HPP +#define HEYOKA_DETAIL_ALIGNED_BUFFER_HPP + +#include +#include + +#include + +HEYOKA_BEGIN_NAMESPACE + +namespace detail +{ + +// Utilities to create and destroy tape arrays for compiled functions +// and/or Taylor integrators in compact mode. These may have custom alignment requirements due +// to the use of SIMD instructions, hence we need to use aligned new/delete +// and a custom deleter for the unique ptr. 
+struct aligned_buffer_deleter { + std::align_val_t al{}; + void operator()(void *ptr) const noexcept; +}; + +using aligned_buffer_t = std::unique_ptr; + +aligned_buffer_t make_aligned_buffer(std::size_t, std::size_t); + +} // namespace detail + +HEYOKA_END_NAMESPACE + +#endif diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp index 3c60deaa3..cbc4e0395 100644 --- a/include/heyoka/detail/llvm_helpers.hpp +++ b/include/heyoka/detail/llvm_helpers.hpp @@ -67,6 +67,8 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t); HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *); +HEYOKA_DLL_PUBLIC llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *); + HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *); HEYOKA_DLL_PUBLIC std::uint64_t get_alignment(llvm::Module &, llvm::Type *); diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp index 2758a14c5..a5c099c12 100644 --- a/include/heyoka/expression.hpp +++ b/include/heyoka/expression.hpp @@ -698,8 +698,8 @@ auto cfunc_common_opts(const KwArgs &...kw_args) template std::tuple, std::vector>> -make_multi_cfunc(const llvm_state &, const std::string &, const std::vector &, - const std::vector &, std::uint32_t, bool, bool, long long); +make_multi_cfunc(llvm_state, const std::string &, const std::vector &, const std::vector &, + std::uint32_t, bool, bool, long long); } // namespace detail diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index 5916cf99d..93882b47c 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #endif +#include #include #include #include @@ -59,60 +59,6 @@ HEYOKA_BEGIN_NAMESPACE -namespace detail -{ - -namespace -{ - -// Utilities to create and destroy tape arrays for compiled functions -// in compact mode. 
These may have custom alignment requirements due -// to the use of SIMD instructions, hence we need to use aligned new/delete -// and a custom deleter for the unique ptr. -struct aligned_array_deleter { - std::align_val_t al{}; - void operator()(void *ptr) const noexcept - { - // NOTE: here we are using directly the delete operator (which does not invoke destructors), - // rather than a delete expression (which would also invoke destructors). However, because - // ptr points to a bytes array, we do not need to explicitly call the destructor here, deallocation will be - // sufficient. - ::operator delete[](ptr, al); - } -}; - -using aligned_array_t = std::unique_ptr; - -aligned_array_t make_aligned_array(std::size_t sz, std::size_t al) -{ - assert(al > 0u); - assert((al & (al - 1u)) == 0u); - - if (sz == 0u) { - return {}; - } else { -#if defined(_MSC_VER) - // MSVC workaround for this issue: - // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320 - - // Allocate the raw memory. - auto *buf = ::operator new[](sz, std::align_val_t{al}); - - // Formally construct the bytes array. - auto *ptr = ::new (buf) std::byte[sz]; - - // Construct and return the unique ptr. - return aligned_array_t{ptr, {.al = std::align_val_t{al}}}; -#else - return aligned_array_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}}; -#endif - } -} - -} // namespace - -} // namespace detail - template struct cfunc::impl { // The compiled function types. @@ -124,7 +70,7 @@ struct cfunc::impl { using c_cfunc_ptr_s_t = void (*)(T *, const T *, const T *, const T *, void *, std::size_t) noexcept; // Thread-local storage for parallel operations. 
- using ets_item_t = detail::aligned_array_t; + using ets_item_t = detail::aligned_buffer_t; using ets_t = oneapi::tbb::enumerable_thread_specific, oneapi::tbb::ets_key_usage_type::ets_key_per_instance>; @@ -135,7 +81,7 @@ struct cfunc::impl { std::uint32_t m_batch_size = 0; std::vector m_dc; std::vector> m_tape_sa; - std::vector m_tapes; + std::vector m_tapes; std::variant m_fptr_scal; std::variant m_fptr_scal_s; std::variant m_fptr_batch_s; @@ -223,7 +169,7 @@ struct cfunc::impl { assert(m_tapes.empty()); for (const auto [sz, al] : m_tape_sa) { - m_tapes.push_back(detail::make_aligned_array(sz, al)); + m_tapes.push_back(detail::make_aligned_buffer(sz, al)); } } @@ -260,8 +206,8 @@ struct cfunc::impl { if (compact_mode) { // Build the multi cfunc, and assign the internal members. - std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc(s, "cfunc", m_fn, m_vars, m_batch_size, - high_accuracy, m_parallel_mode, prec); + std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc( + std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec); // Compile. std::get<1>(m_states).compile(); @@ -845,8 +791,8 @@ void cfunc::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional typename impl::ets_t ets_batch([this, batch_size]() { // NOTE: the batch-mode tape is at index 1 only if the batch // size is > 1, otherwise we are using the scalar tape. - return detail::make_aligned_array(m_impl->m_tape_sa[batch_size > 1u][0], - m_impl->m_tape_sa[batch_size > 1u][1]); + return detail::make_aligned_buffer(m_impl->m_tape_sa[batch_size > 1u][0], + m_impl->m_tape_sa[batch_size > 1u][1]); }); oneapi::tbb::parallel_invoke( @@ -865,7 +811,7 @@ void cfunc::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional // will block as execution in the parallel region of the cfunc begins. 
The // blocked thread could then grab another task from the parallel for loop // we are currently in, and it would then start writing for a second time - // into the same tape it already begun writing into, leading to UB. + // into the same tape it has already begun writing into. oneapi::tbb::this_task_arena::isolate( [&]() { batch_iter.template operator()(range, tape_ptr); }); }); diff --git a/src/detail/aligned_buffer.cpp b/src/detail/aligned_buffer.cpp new file mode 100644 index 000000000..5e9ba6eb0 --- /dev/null +++ b/src/detail/aligned_buffer.cpp @@ -0,0 +1,58 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include +#include + +#include +#include + +HEYOKA_BEGIN_NAMESPACE + +namespace detail +{ + +void aligned_buffer_deleter::operator()(void *ptr) const noexcept +{ + // NOTE: here we are using directly the delete operator (which does not invoke destructors), + // rather than a delete expression (which would also invoke destructors). However, because + // ptr points to a bytes array, we do not need to explicitly call the destructor here, deallocation will be + // sufficient. + ::operator delete[](ptr, al); +} + +aligned_buffer_t make_aligned_buffer(std::size_t sz, std::size_t al) +{ + assert(al > 0u); + assert((al & (al - 1u)) == 0u); + + if (sz == 0u) { + return {}; + } else { +#if defined(_MSC_VER) + // MSVC workaround for this issue: + // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320 + + // Allocate the raw memory. + auto *buf = ::operator new[](sz, std::align_val_t{al}); + + // Formally construct the bytes array. 
+ auto *ptr = ::new (buf) std::byte[sz]; + + // Construct and return the unique ptr. + return aligned_buffer_t{ptr, {.al = std::align_val_t{al}}}; +#else + return aligned_buffer_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}}; +#endif + } +} + +} // namespace detail + +HEYOKA_END_NAMESPACE diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index 010ed98fc..c5a4afc2c 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -3359,6 +3359,83 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) } } +// Utility to create an identical copy of the type tp in the context of the state s. +// NOTE: although it may sound like this is a read-only operation on tp, it is not, +// since we are potentially poking into the context of tp during operations. Thus, this +// function cannot be called concurrently from multiple threads on the same tp object, +// or even on different tp objects defined in the same context. +// NOTE: this handles only floating-point (vector) types at this time, extending +// to integral types should be fairly easy. +// NOTE: perhaps this function could be made more generic for arbitrary struct types +// by (recursively) reading the struct layout and then reproducing it in the target +// context. Like this, we could avoid special casing for the mppp::real types. +llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp) +{ + assert(tp != nullptr); + + // Fetch the target context. + auto &ctx = s.context(); + + // Construct the scalar type first, then we will convert + // to a vector if needed. + auto *tp_scal = tp->getScalarType(); + llvm::Type *ret_scal_t = nullptr; + +#define HEYOKA_LLVM_CLONE_TYPE_IMPL(tid) \ + case llvm::Type::tid##TyID: \ + ret_scal_t = llvm::Type::get##tid##Ty(ctx); \ + break + + // NOTE: gcov seems to get a bit confused by the macro usage. 
+ // LCOV_EXCL_START + switch (tp_scal->getTypeID()) { + HEYOKA_LLVM_CLONE_TYPE_IMPL(Float); + HEYOKA_LLVM_CLONE_TYPE_IMPL(Double); + HEYOKA_LLVM_CLONE_TYPE_IMPL(X86_FP80); + HEYOKA_LLVM_CLONE_TYPE_IMPL(FP128); + default: { + +#if defined(HEYOKA_HAVE_REAL) + + if (const auto prec = llvm_is_real(tp_scal); prec != 0) { + // tp_scal is the internal counterpart of mppp::real. + ret_scal_t = to_internal_llvm_type(s, prec); + break; + } else if (tp_scal == to_external_llvm_type(tp_scal->getContext())) { + // tp_scal is mppp::real. + ret_scal_t = to_external_llvm_type(ctx); + break; + } + +#endif + + throw std::invalid_argument( + fmt::format("Cannot clone the LLVM type '{}' to another context", llvm_type_name(tp))); + } + } + +#undef HEYOKA_LLVM_CLONE_TYPE_IMPL + // LCOV_EXCL_STOP + + assert(ret_scal_t != nullptr); + + if (tp->isVectorTy()) { + // tp is a vector type. + if (const auto *vtp = llvm::dyn_cast(tp)) [[likely]] { + return make_vector_type(ret_scal_t, boost::numeric_cast(vtp->getNumElements())); + } else { + // LCOV_EXCL_START + throw std::invalid_argument(fmt::format("Cannot clone the LLVM type '{}' to another context - the type is " + "a vector type whose size is not fixed", + llvm_type_name(tp))); + // LCOV_EXCL_STOP + } + } else { + // tp is a scalar type. + return ret_scal_t; + } +} + } // namespace detail HEYOKA_END_NAMESPACE diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 00a8c5aa0..5eeb93538 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -1568,14 +1568,15 @@ namespace // implemented in distinct llvm_state objects. // // states is the current list of states (to which more will be added by this function), and the last state -// in the list is the "main" state. s_dc is the segmented decomposition of the function to be compiled. +// in the list is the "main" state. main_fp_t is the internal scalar floating-point type as defined in the main state. 
+// s_dc is the segmented decomposition of the function to be compiled. // base_name is the name of the main function from which the drivers are to be invoked. main_eval_arr, // main_par_ptr, main_time_ptr and main_stride are, respectively, the pointer to the evaluation tape, // the pointer to the parameter values, the pointer to time coordinate(s) and the stride - these are all // defined in the main state and they are passed to the driver functions invocations. -template -void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_dc, std::uint32_t nvars, - std::uint32_t batch_size, bool high_accuracy, long long prec, +template +void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list &states, const SDC &s_dc, + std::uint32_t nvars, std::uint32_t batch_size, bool high_accuracy, const std::string &base_name, llvm::Value *main_eval_arr, llvm::Value *main_par_ptr, llvm::Value *main_time_ptr, llvm::Value *main_stride) { @@ -1608,6 +1609,11 @@ void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_d llvm_func_name_compare>; // Push back a new state and use it as initial current state. + // NOTE: like this, we always end up creating at least one driver + // function and a state, even in the degenerate case of an empty decomposition, + // which is suboptimal performance-wise. + // I do not think however that it is worth it to complicate the code to avoid + // this corner-case pessimisation. states.push_back(main_state.make_similar()); auto *cur_state = &states.back(); @@ -1729,7 +1735,7 @@ void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_d } // Fetch the internal fp type and its vector counterpart for the current state. - auto *fp_t = to_internal_llvm_type(*cur_state, prec); + auto *fp_t = llvm_clone_type(*cur_state, main_fp_t); auto *fp_vec_type = make_vector_type(fp_t, batch_size); // Fetch the current builder. 
@@ -1955,17 +1961,13 @@ void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_d // LCOV_EXCL_STOP } -// NOTE: here we are forced to use a templated function, rather than passing in the -// LLVM type fp_t as usual, because we need to re-create the type for every context -// in every state, and there seems not to be an easy way to transfer/copy a type -// from one context to the other. -template -std::array -add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm::Value *in_ptr, llvm::Value *par_ptr, - llvm::Value *time_ptr, llvm::Value *stride, const std::vector &dc, std::uint32_t nvars, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t nuvars, std::uint32_t batch_size, bool high_accuracy, long long prec, - const std::string &base_name, llvm::Value *eval_arr) +std::array add_multi_cfunc_impl(llvm::Type *fp_t, std::list &states, llvm::Value *out_ptr, + llvm::Value *in_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr, + llvm::Value *stride, const std::vector &dc, + std::uint32_t nvars, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + std::uint32_t nuvars, std::uint32_t batch_size, bool high_accuracy, + const std::string &base_name, llvm::Value *eval_arr) { // Fetch the main state, module, etc. auto &main_state = states.back(); @@ -1973,7 +1975,10 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: auto &main_builder = main_state.builder(); // Fetch the fp types for the main state. - auto *main_fp_t = to_internal_llvm_type(main_state, prec); + // NOTE: cloning is safe here, as even though this function is being invoked + // in parallel from multiple threads, we have made sure that each invocation + // gets its own cloned copy of fp_t. 
+ auto *main_fp_t = llvm_clone_type(main_state, fp_t); auto *main_ext_fp_t = make_external_llvm_type(main_fp_t); auto *fp_vec_type = make_vector_type(main_fp_t, batch_size); @@ -1991,7 +1996,7 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: const auto al = boost::numeric_cast(get_alignment(main_md, fp_vec_type)); // NOTE: eval_arr is used as temporary storage for the current function, - // but it provided externally from dynamically-allocated memory in order to avoid stack overflow. + // but it is provided externally from dynamically-allocated memory in order to avoid stack overflow. // This creates a situation in which LLVM cannot elide stores into eval_arr // (even if it figures out a way to avoid storing intermediate results into // eval_arr) because LLVM must assume that some other function may @@ -2015,8 +2020,8 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: }); // Generate the code for the evaluation of all segments. - multi_cfunc_evaluate_segments(states, s_dc, nvars, batch_size, high_accuracy, prec, base_name, eval_arr, par_ptr, - time_ptr, stride); + multi_cfunc_evaluate_segments(main_fp_t, states, s_dc, nvars, batch_size, high_accuracy, base_name, eval_arr, + par_ptr, time_ptr, stride); // Write the results to the output pointer. cfunc_c_write_outputs(main_state, main_fp_t, out_ptr, cout_gl, eval_arr, par_ptr, stride, batch_size); @@ -2027,32 +2032,10 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: return {sz, al}; } -} // namespace - -// This function will compile several versions of the input function fn, with input variables vars, in compact mode. -// -// The compiled functions are implemented across several llvm_states which are collated together and returned as -// a single llvm_multi_state (this is the first element of the return tuple). If batch_size is 1, -// then 2 compiled functions are created - a scalar strided and a scalar unstrided version. 
-// If batch size is > 1, then an additional batch-mode strided compiled function is returned. -// The function names are created using "name" as base name and then mangling in the strided/unstrided -// property and the batch size. -// -// The second element of the return tuple is the decomposition of fn. -// -// The third element of the return tuple is a vector of pairs, each pair containing the size and alignment requirements -// for the externally-provided storage for the evaluation tape. If batch_size is 1, then only a single -// pair is returned, representing the size/alignment requirements for the scalar-mode evaluation tape. -// If batch_size > 1, then an additional pair is appended representing the size/alignment requirements -// for the batch-mode evaluation tape. -// -// NOTE: there is a bunch of boilerplate logic overlap here with add_cfunc_impl(). Make sure to -// coordinate changes between the two functions. -template std::tuple, std::vector>> -make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vector &fn, - const std::vector &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode, - long long prec) +make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::string &name, + const std::vector &fn, const std::vector &vars, std::uint32_t batch_size, + bool high_accuracy, bool parallel_mode) { if (batch_size == 0u) [[unlikely]] { throw std::invalid_argument("The batch size of a compiled function cannot be zero"); @@ -2062,27 +2045,6 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec throw std::invalid_argument("Parallel mode has not been implemented yet"); } -#if defined(HEYOKA_ARCH_PPC) - if constexpr (std::is_same_v) { - throw not_implemented_error("'long double' computations are not supported on PowerPC"); - } -#endif - -#if defined(HEYOKA_HAVE_REAL) - - if constexpr (std::is_same_v) { - const auto sprec = boost::numeric_cast(prec); - - if (sprec < 
mppp::real_prec_min() || sprec > mppp::real_prec_max()) [[unlikely]] { - throw std::invalid_argument( - fmt::format("An invalid precision value of {} was passed to make_multi_cfunc() (the " - "value must be in the [{}, {}] range)", - sprec, mppp::real_prec_min(), mppp::real_prec_max())); - } - } - -#endif - if (name.empty()) [[unlikely]] { throw std::invalid_argument("A non-empty function name is required when invoking make_multi_cfunc()"); } @@ -2126,9 +2088,26 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec tape_size_align.resize(2); } - // Helper to create a cfunc. + // NOTE: this is ugly, but needed. Cloning an LLVM type into another + // context is not a thread-safe operation as we might be poking into + // the context of the original type. Thus, we first make 2 or 3 clones + // of fp_t each associated to a different llvm_state without any multithreading, + // and then we use these clones for further cloning while parallel invoking + // create_cfunc(). + std::vector> fp_t_clones; + fp_t_clones.reserve(3); + for (auto i = 0; i < (batch_size == 1u ? 2 : 3); ++i) { + // Create a new state and clone fp_t into it. + auto new_state = tplt.make_similar(); + auto *new_fp_t = llvm_clone_type(new_state, fp_t); + + fp_t_clones.emplace_back(std::move(new_state), new_fp_t); + } + + // Helper to create a single cfunc. auto create_cfunc = [&states_lists, &tape_size_align, &tplt, &name, &dc = std::as_const(dc), nvars, nuvars, - high_accuracy, prec](bool strided, std::uint32_t cur_batch_size) { + high_accuracy, + &fp_t_clones = std::as_const(fp_t_clones)](bool strided, std::uint32_t cur_batch_size) { // NOTE: the batch unstrided variant is not supposed to be requested. assert(strided || cur_batch_size == 1u); @@ -2145,6 +2124,9 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec assert(states.empty()); + // Fetch the local cloned fp_t. 
+ auto *loc_fp_t = fp_t_clones[sidx].second; + // Add a new state and fetch it. states.push_back(tplt.make_similar()); auto &s = states.back(); @@ -2228,8 +2210,8 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec builder.SetInsertPoint(bb); // Create the body of the function. - const auto tape_sa = add_multi_cfunc_impl(states, out_ptr, in_ptr, par_ptr, time_ptr, stride, dc, nvars, - nuvars, cur_batch_size, high_accuracy, prec, cur_name, tape_ptr); + const auto tape_sa = add_multi_cfunc_impl(loc_fp_t, states, out_ptr, in_ptr, par_ptr, time_ptr, stride, dc, + nvars, nuvars, cur_batch_size, high_accuracy, cur_name, tape_ptr); // Add the size/alignment requirements for the tape storage. // NOTE: there's no difference in requirements between strided and @@ -2258,7 +2240,8 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec // into a thread-safe tbb vector. // // At the moment though it looks like the practical gains from such further parallelisation - // would not be worth it, perhaps we can reconsider in the future. + // would not be worth it, perhaps we can reconsider in the future. It is also not clear how + // to deal with thread-unsafe type cloning in this hypothetical scenario. if (batch_size == 1u) { oneapi::tbb::parallel_invoke([&create_cfunc]() { create_cfunc(false, 1); }, [&create_cfunc]() { create_cfunc(true, 1); }); @@ -2285,11 +2268,66 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec std::move(dc), std::move(tape_size_align)); } +} // namespace + +// This function will compile several versions of the input function fn, with input variables vars, in compact mode. +// +// The compiled functions are implemented across several llvm_states which are collated together and returned as +// a single llvm_multi_state (this is the first element of the return tuple). 
If batch_size is 1, +// then 2 compiled functions are created - a scalar strided and a scalar unstrided version. +// If batch size is > 1, then an additional batch-mode strided compiled function is returned. +// The function names are created using "name" as base name and then mangling in the strided/unstrided +// property and the batch size. +// +// The second element of the return tuple is the decomposition of fn. +// +// The third element of the return tuple is a vector of pairs, each pair containing the size and alignment requirements +// for the externally-provided storage for the evaluation tape. If batch_size is 1, then only a single +// pair is returned, representing the size/alignment requirements for the scalar-mode evaluation tape. +// If batch_size > 1, then an additional pair is appended representing the size/alignment requirements +// for the batch-mode evaluation tape. +// +// NOTE: there is a bunch of boilerplate logic overlap here with add_cfunc_impl(). Make sure to +// coordinate changes between the two functions. +template +std::tuple, std::vector>> +make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector &fn, + const std::vector &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode, + long long prec) +{ +#if defined(HEYOKA_ARCH_PPC) + if constexpr (std::is_same_v) { + throw not_implemented_error("'long double' computations are not supported on PowerPC"); + } +#endif + +#if defined(HEYOKA_HAVE_REAL) + + if constexpr (std::is_same_v) { + const auto sprec = boost::numeric_cast(prec); + + if (sprec < mppp::real_prec_min() || sprec > mppp::real_prec_max()) [[unlikely]] { + throw std::invalid_argument( + fmt::format("An invalid precision value of {} was passed to make_multi_cfunc() (the " + "value must be in the [{}, {}] range)", + sprec, mppp::real_prec_min(), mppp::real_prec_max())); + } + } + +#endif + + // Fetch the internal scalar fp type from the template state. 
We will be cloning + // this throughout the rest of the implementation. + auto *fp_t = to_internal_llvm_type(tplt, prec); + + return make_multi_cfunc_impl(fp_t, tplt, name, fn, vars, batch_size, high_accuracy, parallel_mode); +} + // Explicit instantiations. #define HEYOKA_MAKE_MULTI_CFUNC_INST(T) \ template HEYOKA_DLL_PUBLIC \ std::tuple, std::vector>> \ - make_multi_cfunc(const llvm_state &, const std::string &, const std::vector &, \ + make_multi_cfunc(llvm_state, const std::string &, const std::vector &, \ const std::vector &, std::uint32_t, bool, bool, long long); HEYOKA_MAKE_MULTI_CFUNC_INST(float) diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 9bfdd5238..130cdda90 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -671,7 +671,6 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect // Helper for the computation of a jet of derivatives in compact mode, // used in taylor_compute_jet(). -// NOTE: order0, par_ptr and time_ptr are external pointers. std::pair taylor_compute_jet_compact_mode( // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) llvm_state &s, llvm::Type *fp_type, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp index 38cf97974..3a8bb1bab 100644 --- a/src/taylor_adaptive.cpp +++ b/src/taylor_adaptive.cpp @@ -437,8 +437,7 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, // Add the function for the computation of // the dense output. 
- detail::taylor_add_d_out_function(m_llvm, detail::internal_llvm_type_like(m_llvm, m_state[0]), m_dim, m_order, 1, - high_accuracy); + detail::taylor_add_d_out_function(m_llvm, fp_t, m_dim, m_order, 1, high_accuracy); detail::get_logger()->trace("Taylor dense output runtime: {}", sw); sw.reset(); diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp index a8e0f6046..e97b4d0df 100644 --- a/src/taylor_adaptive_batch.cpp +++ b/src/taylor_adaptive_batch.cpp @@ -284,8 +284,7 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta // Add the function for the computation of // the dense output. - detail::taylor_add_d_out_function(m_llvm, detail::to_external_llvm_type(m_llvm.context()), m_dim, m_order, - m_batch_size, high_accuracy); + detail::taylor_add_d_out_function(m_llvm, ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy); detail::get_logger()->trace("Taylor batch dense output runtime: {}", sw); sw.reset(); diff --git a/test/llvm_helpers.cpp b/test/llvm_helpers.cpp index a09f40f1a..ec13bd327 100644 --- a/test/llvm_helpers.cpp +++ b/test/llvm_helpers.cpp @@ -3030,3 +3030,36 @@ TEST_CASE("switch") #endif } + +TEST_CASE("clone type") +{ + using detail::llvm_clone_type; + + auto tester = [](fp_t) { + llvm_state source, dest; + + auto *tp_source = detail::to_external_llvm_type(source.context()); + auto *tp_dest = llvm_clone_type(dest, tp_source); + REQUIRE(tp_dest == detail::to_external_llvm_type(dest.context())); + + auto *vec_tp_source = detail::make_vector_type(tp_source, 4); + auto *vec_tp_dest = llvm_clone_type(dest, vec_tp_source); + REQUIRE(vec_tp_dest == detail::make_vector_type(tp_dest, 4)); + }; + + tuple_for_each(fp_types, tester); + +#if defined(HEYOKA_HAVE_REAL) + + llvm_state source, dest; + + auto *tp_ext_source = detail::to_external_llvm_type(source.context()); + auto *tp_ext_dest = llvm_clone_type(dest, tp_ext_source); + REQUIRE(tp_ext_dest == detail::to_external_llvm_type(dest.context())); + + auto 
*tp_int_source = detail::to_internal_llvm_type(source, 11); + auto *tp_int_dest = llvm_clone_type(dest, tp_int_source); + REQUIRE(tp_int_dest == detail::to_internal_llvm_type(dest, 11)); + +#endif +}