From 27da677b1deda8cfddd1ae503ad6e68bd3d5f5de Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 21 Aug 2024 09:11:19 +0200 Subject: [PATCH 01/11] Minor. --- src/taylor_02.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 9bfdd5238..130cdda90 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -671,7 +671,6 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect // Helper for the computation of a jet of derivatives in compact mode, // used in taylor_compute_jet(). -// NOTE: order0, par_ptr and time_ptr are external pointers. std::pair taylor_compute_jet_compact_mode( // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) llvm_state &s, llvm::Type *fp_type, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, From 85230389298883622d48a629e7e490799f9830a2 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 21 Aug 2024 10:15:18 +0200 Subject: [PATCH 02/11] Rename and relocate the utilities for aligned arrays. --- CMakeLists.txt | 1 + include/heyoka/detail/aligned_buffer.hpp | 39 ++++++++++++++ src/cfunc_class.cpp | 66 +++--------------------- src/detail/aligned_buffer.cpp | 58 +++++++++++++++++++++ 4 files changed, 104 insertions(+), 60 deletions(-) create mode 100644 include/heyoka/detail/aligned_buffer.hpp create mode 100644 src/detail/aligned_buffer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 86717c759..8211fb370 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,6 +192,7 @@ set(HEYOKA_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/setup_variational_ics.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp" # NOTE: this will be an empty file in case we are not # building with support for real. 
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp" diff --git a/include/heyoka/detail/aligned_buffer.hpp b/include/heyoka/detail/aligned_buffer.hpp new file mode 100644 index 000000000..1b68bc867 --- /dev/null +++ b/include/heyoka/detail/aligned_buffer.hpp @@ -0,0 +1,39 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef HEYOKA_DETAIL_ALIGNED_BUFFER_HPP +#define HEYOKA_DETAIL_ALIGNED_BUFFER_HPP + +#include +#include + +#include + +HEYOKA_BEGIN_NAMESPACE + +namespace detail +{ + +// Utilities to create and destroy tape arrays for compiled functions +// and/or Taylor integrators in compact mode. These may have custom alignment requirements due +// to the use of SIMD instructions, hence we need to use aligned new/delete +// and a custom deleter for the unique ptr. +struct aligned_buffer_deleter { + std::align_val_t al{}; + void operator()(void *ptr) const noexcept; +}; + +using aligned_buffer_t = std::unique_ptr; + +aligned_buffer_t make_aligned_buffer(std::size_t, std::size_t); + +} // namespace detail + +HEYOKA_END_NAMESPACE + +#endif diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index 5916cf99d..021b563da 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #endif +#include #include #include #include @@ -59,60 +59,6 @@ HEYOKA_BEGIN_NAMESPACE -namespace detail -{ - -namespace -{ - -// Utilities to create and destroy tape arrays for compiled functions -// in compact mode. 
These may have custom alignment requirements due -// to the use of SIMD instructions, hence we need to use aligned new/delete -// and a custom deleter for the unique ptr. -struct aligned_array_deleter { - std::align_val_t al{}; - void operator()(void *ptr) const noexcept - { - // NOTE: here we are using directly the delete operator (which does not invoke destructors), - // rather than a delete expression (which would also invoke destructors). However, because - // ptr points to a bytes array, we do not need to explicitly call the destructor here, deallocation will be - // sufficient. - ::operator delete[](ptr, al); - } -}; - -using aligned_array_t = std::unique_ptr; - -aligned_array_t make_aligned_array(std::size_t sz, std::size_t al) -{ - assert(al > 0u); - assert((al & (al - 1u)) == 0u); - - if (sz == 0u) { - return {}; - } else { -#if defined(_MSC_VER) - // MSVC workaround for this issue: - // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320 - - // Allocate the raw memory. - auto *buf = ::operator new[](sz, std::align_val_t{al}); - - // Formally construct the bytes array. - auto *ptr = ::new (buf) std::byte[sz]; - - // Construct and return the unique ptr. - return aligned_array_t{ptr, {.al = std::align_val_t{al}}}; -#else - return aligned_array_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}}; -#endif - } -} - -} // namespace - -} // namespace detail - template struct cfunc::impl { // The compiled function types. @@ -124,7 +70,7 @@ struct cfunc::impl { using c_cfunc_ptr_s_t = void (*)(T *, const T *, const T *, const T *, void *, std::size_t) noexcept; // Thread-local storage for parallel operations. 
- using ets_item_t = detail::aligned_array_t; + using ets_item_t = detail::aligned_buffer_t; using ets_t = oneapi::tbb::enumerable_thread_specific, oneapi::tbb::ets_key_usage_type::ets_key_per_instance>; @@ -135,7 +81,7 @@ struct cfunc::impl { std::uint32_t m_batch_size = 0; std::vector m_dc; std::vector> m_tape_sa; - std::vector m_tapes; + std::vector m_tapes; std::variant m_fptr_scal; std::variant m_fptr_scal_s; std::variant m_fptr_batch_s; @@ -223,7 +169,7 @@ struct cfunc::impl { assert(m_tapes.empty()); for (const auto [sz, al] : m_tape_sa) { - m_tapes.push_back(detail::make_aligned_array(sz, al)); + m_tapes.push_back(detail::make_aligned_buffer(sz, al)); } } @@ -845,8 +791,8 @@ void cfunc::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional typename impl::ets_t ets_batch([this, batch_size]() { // NOTE: the batch-mode tape is at index 1 only if the batch // size is > 1, otherwise we are using the scalar tape. - return detail::make_aligned_array(m_impl->m_tape_sa[batch_size > 1u][0], - m_impl->m_tape_sa[batch_size > 1u][1]); + return detail::make_aligned_buffer(m_impl->m_tape_sa[batch_size > 1u][0], + m_impl->m_tape_sa[batch_size > 1u][1]); }); oneapi::tbb::parallel_invoke( diff --git a/src/detail/aligned_buffer.cpp b/src/detail/aligned_buffer.cpp new file mode 100644 index 000000000..5e9ba6eb0 --- /dev/null +++ b/src/detail/aligned_buffer.cpp @@ -0,0 +1,58 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include +#include +#include + +#include +#include + +HEYOKA_BEGIN_NAMESPACE + +namespace detail +{ + +void aligned_buffer_deleter::operator()(void *ptr) const noexcept +{ + // NOTE: here we are using directly the delete operator (which does not invoke destructors), + // rather than a delete expression (which would also invoke destructors). However, because + // ptr points to a bytes array, we do not need to explicitly call the destructor here, deallocation will be + // sufficient. + ::operator delete[](ptr, al); +} + +aligned_buffer_t make_aligned_buffer(std::size_t sz, std::size_t al) +{ + assert(al > 0u); + assert((al & (al - 1u)) == 0u); + + if (sz == 0u) { + return {}; + } else { +#if defined(_MSC_VER) + // MSVC workaround for this issue: + // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320 + + // Allocate the raw memory. + auto *buf = ::operator new[](sz, std::align_val_t{al}); + + // Formally construct the bytes array. + auto *ptr = ::new (buf) std::byte[sz]; + + // Construct and return the unique ptr. + return aligned_buffer_t{ptr, {.al = std::align_val_t{al}}}; +#else + return aligned_buffer_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}}; +#endif + } +} + +} // namespace detail + +HEYOKA_END_NAMESPACE From e6d8b3368d6490966c39b91fc7c7d778f212dd0d Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 21 Aug 2024 10:17:01 +0200 Subject: [PATCH 03/11] Minor. --- src/cfunc_class.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index 021b563da..ee662a27c 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -811,7 +811,7 @@ void cfunc::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional // will block as execution in the parallel region of the cfunc begins. 
The // blocked thread could then grab another task from the parallel for loop // we are currently in, and it would then start writing for a second time - // into the same tape it already begun writing into, leading to UB. + // into the same tape it already begun writing into. oneapi::tbb::this_task_arena::isolate( [&]() { batch_iter.template operator()(range, tape_ptr); }); }); From b5f475f620aca579e4a86d42b0b5304f3f1ed1e3 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 21 Aug 2024 13:03:50 +0200 Subject: [PATCH 04/11] Introduce helper to clone LLVM types from one context to the other. --- include/heyoka/detail/llvm_helpers.hpp | 4 +- include/heyoka/detail/real_helpers.hpp | 2 +- src/detail/llvm_helpers.cpp | 67 +++++++++++++++++++++++++- src/detail/real_helpers.cpp | 4 +- 4 files changed, 72 insertions(+), 5 deletions(-) diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp index 3c60deaa3..c5e3fcf42 100644 --- a/include/heyoka/detail/llvm_helpers.hpp +++ b/include/heyoka/detail/llvm_helpers.hpp @@ -67,6 +67,8 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t); HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *); +llvm::Type *llvm_clone_type(llvm_state &, const llvm::Type *); + HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *); HEYOKA_DLL_PUBLIC std::uint64_t get_alignment(llvm::Module &, llvm::Type *); @@ -121,7 +123,7 @@ HEYOKA_DLL_PUBLIC void llvm_if_then_else(llvm_state &, llvm::Value *, const std: HEYOKA_DLL_PUBLIC void llvm_switch_u32(llvm_state &, llvm::Value *, const std::function &, const std::map> &); -HEYOKA_DLL_PUBLIC std::string llvm_type_name(llvm::Type *); +HEYOKA_DLL_PUBLIC std::string llvm_type_name(const llvm::Type *); void llvm_append_block(llvm::Function *, llvm::BasicBlock *); diff --git a/include/heyoka/detail/real_helpers.hpp b/include/heyoka/detail/real_helpers.hpp index c64a239de..57f19d5dc 100644 --- 
a/include/heyoka/detail/real_helpers.hpp +++ b/include/heyoka/detail/real_helpers.hpp @@ -28,7 +28,7 @@ namespace detail // The integral type corresponding to the mpfr_rnd_t enum. using real_rnd_t = std::underlying_type_t; -mpfr_prec_t llvm_is_real(llvm::Type *); +mpfr_prec_t llvm_is_real(const llvm::Type *); llvm::Value *llvm_real_fneg(llvm_state &, llvm ::Value *); llvm::Function *real_nary_op(llvm_state &, llvm::Type *, const std::string &, unsigned); diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index 010ed98fc..5cc7a48cb 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -1492,7 +1492,7 @@ void llvm_loop_u32(llvm_state &s, llvm::Value *begin, llvm::Value *end, const st // Small helper to fetch a string representation // of an LLVM type. -std::string llvm_type_name(llvm::Type *t) +std::string llvm_type_name(const llvm::Type *t) { assert(t != nullptr); @@ -3359,6 +3359,71 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) } } +// Utility to create an identical copy of the type tp in the context of the state s. +llvm::Type *llvm_clone_type(llvm_state &s, const llvm::Type *tp) +{ + assert(tp != nullptr); + + // Fetch the target context. + auto &ctx = s.context(); + + // Construct the scalar type first, then we will convert + // to a vector if needed. + const auto *tp_scal = tp->getScalarType(); + llvm::Type *ret_scal_t = nullptr; + +#define HEYOKA_LLVM_CLONE_TYPE_IMPL(tid) \ + case llvm::Type::tid##TyID: \ + ret_scal_t = llvm::Type::get##tid##Ty(ctx); \ + break + + switch (tp_scal->getTypeID()) { + HEYOKA_LLVM_CLONE_TYPE_IMPL(Float); + HEYOKA_LLVM_CLONE_TYPE_IMPL(Double); + HEYOKA_LLVM_CLONE_TYPE_IMPL(X86_FP80); + HEYOKA_LLVM_CLONE_TYPE_IMPL(FP128); + default: { +#if defined(HEYOKA_HAVE_REAL) + + if (const auto prec = llvm_is_real(tp_scal); prec != 0) { + // tp_scal is the internal counterpart of mppp::real. 
+ ret_scal_t = to_internal_llvm_type(s, prec); + break; + } else if (tp_scal == to_external_llvm_type(tp_scal->getContext())) { + // tp_scal is mppp::real. + ret_scal_t = to_external_llvm_type(ctx); + break; + } + +#endif + // LCOV_EXCL_START + throw std::invalid_argument( + fmt::format("Cannot clone the LLVM type '{}' to another context", llvm_type_name(tp))); + // LCOV_EXCL_STOP + } + } + +#undef HEYOKA_LLVM_CLONE_TYPE_IMPL + + assert(ret_scal_t != nullptr); + + if (tp->isVectorTy()) { + // tp is a vector type. + if (const auto *vtp = llvm::dyn_cast(tp)) [[likely]] { + return make_vector_type(ret_scal_t, boost::numeric_cast(vtp->getNumElements())); + } else { + // LCOV_EXCL_START + throw std::invalid_argument(fmt::format("Cannot clone the LLVM type '{}' to another context - the type is " + "a vector type whose size is not fixed", + llvm_type_name(tp))); + // LCOV_EXCL_STOP + } + } else { + // tp is a scalar type. + return ret_scal_t; + } +} + } // namespace detail HEYOKA_END_NAMESPACE diff --git a/src/detail/real_helpers.cpp b/src/detail/real_helpers.cpp index 100684190..58457e034 100644 --- a/src/detail/real_helpers.cpp +++ b/src/detail/real_helpers.cpp @@ -78,9 +78,9 @@ llvm::AttributeList get_mpfr_attr_list(llvm::LLVMContext &context) // Determine if the input type is heyoka.real.N, // and, in such case, return N. Otherwise, return 0. -mpfr_prec_t llvm_is_real(llvm::Type *t) +mpfr_prec_t llvm_is_real(const llvm::Type *t) { - if (auto *ptr = llvm::dyn_cast(t)) { + if (const auto *ptr = llvm::dyn_cast(t)) { const auto sname = ptr->getStructName(); if ( From 9d46115ffb705b5da0ad5cc7663c67d547e46a6e Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 21 Aug 2024 16:19:21 +0200 Subject: [PATCH 05/11] De-template the implementation of make_multi_cfunc(). 
--- include/heyoka/detail/llvm_helpers.hpp | 4 +- include/heyoka/detail/real_helpers.hpp | 2 +- include/heyoka/expression.hpp | 4 +- src/cfunc_class.cpp | 4 +- src/detail/llvm_helpers.cpp | 9 +- src/detail/real_helpers.cpp | 4 +- src/expression_cfunc.cpp | 175 +++++++++++++++---------- 7 files changed, 119 insertions(+), 83 deletions(-) diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp index c5e3fcf42..cbb671415 100644 --- a/include/heyoka/detail/llvm_helpers.hpp +++ b/include/heyoka/detail/llvm_helpers.hpp @@ -67,7 +67,7 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t); HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *); -llvm::Type *llvm_clone_type(llvm_state &, const llvm::Type *); +llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *); HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *); @@ -123,7 +123,7 @@ HEYOKA_DLL_PUBLIC void llvm_if_then_else(llvm_state &, llvm::Value *, const std: HEYOKA_DLL_PUBLIC void llvm_switch_u32(llvm_state &, llvm::Value *, const std::function &, const std::map> &); -HEYOKA_DLL_PUBLIC std::string llvm_type_name(const llvm::Type *); +HEYOKA_DLL_PUBLIC std::string llvm_type_name(llvm::Type *); void llvm_append_block(llvm::Function *, llvm::BasicBlock *); diff --git a/include/heyoka/detail/real_helpers.hpp b/include/heyoka/detail/real_helpers.hpp index 57f19d5dc..c64a239de 100644 --- a/include/heyoka/detail/real_helpers.hpp +++ b/include/heyoka/detail/real_helpers.hpp @@ -28,7 +28,7 @@ namespace detail // The integral type corresponding to the mpfr_rnd_t enum. 
using real_rnd_t = std::underlying_type_t; -mpfr_prec_t llvm_is_real(const llvm::Type *); +mpfr_prec_t llvm_is_real(llvm::Type *); llvm::Value *llvm_real_fneg(llvm_state &, llvm ::Value *); llvm::Function *real_nary_op(llvm_state &, llvm::Type *, const std::string &, unsigned); diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp index 2758a14c5..a5c099c12 100644 --- a/include/heyoka/expression.hpp +++ b/include/heyoka/expression.hpp @@ -698,8 +698,8 @@ auto cfunc_common_opts(const KwArgs &...kw_args) template std::tuple, std::vector>> -make_multi_cfunc(const llvm_state &, const std::string &, const std::vector &, - const std::vector &, std::uint32_t, bool, bool, long long); +make_multi_cfunc(llvm_state, const std::string &, const std::vector &, const std::vector &, + std::uint32_t, bool, bool, long long); } // namespace detail diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index ee662a27c..93882b47c 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -206,8 +206,8 @@ struct cfunc::impl { if (compact_mode) { // Build the multi cfunc, and assign the internal members. - std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc(s, "cfunc", m_fn, m_vars, m_batch_size, - high_accuracy, m_parallel_mode, prec); + std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc( + std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec); // Compile. std::get<1>(m_states).compile(); diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index 5cc7a48cb..7c8fdb99d 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -1492,7 +1492,7 @@ void llvm_loop_u32(llvm_state &s, llvm::Value *begin, llvm::Value *end, const st // Small helper to fetch a string representation // of an LLVM type. 
-std::string llvm_type_name(const llvm::Type *t) +std::string llvm_type_name(llvm::Type *t) { assert(t != nullptr); @@ -3360,7 +3360,10 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) } // Utility to create an identical copy of the type tp in the context of the state s. -llvm::Type *llvm_clone_type(llvm_state &s, const llvm::Type *tp) +// NOTE: although it may look like this is a read-only operation on tp, it is not, +// since we are potentially poking into the context of tp during operations. Thus, this +// function cannot be called concurrently from multiple threads on the same tp object. +llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp) { assert(tp != nullptr); @@ -3369,7 +3372,7 @@ llvm::Type *llvm_clone_type(llvm_state &s, const llvm::Type *tp) // Construct the scalar type first, then we will convert // to a vector if needed. - const auto *tp_scal = tp->getScalarType(); + auto *tp_scal = tp->getScalarType(); llvm::Type *ret_scal_t = nullptr; #define HEYOKA_LLVM_CLONE_TYPE_IMPL(tid) \ diff --git a/src/detail/real_helpers.cpp b/src/detail/real_helpers.cpp index 58457e034..100684190 100644 --- a/src/detail/real_helpers.cpp +++ b/src/detail/real_helpers.cpp @@ -78,9 +78,9 @@ llvm::AttributeList get_mpfr_attr_list(llvm::LLVMContext &context) // Determine if the input type is heyoka.real.N, // and, in such case, return N. Otherwise, return 0. -mpfr_prec_t llvm_is_real(const llvm::Type *t) +mpfr_prec_t llvm_is_real(llvm::Type *t) { - if (const auto *ptr = llvm::dyn_cast(t)) { + if (auto *ptr = llvm::dyn_cast(t)) { const auto sname = ptr->getStructName(); if ( diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 00a8c5aa0..549b26204 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -1568,14 +1568,15 @@ namespace // implemented in distinct llvm_state objects. 
// // states is the current list of states (to which more will be added by this function), and the last state -// in the list is the "main" state. s_dc is the segmented decomposition of the function to be compiled. +// in the list is the "main" state. main_fp_t is the internal scalar floating-point type as defined in the main state. +// s_dc is the segmented decomposition of the function to be compiled. // base_name is the name of the main function from which the drivers are to be invoked. main_eval_arr, // main_par_ptr, main_time_ptr and main_stride are, respectively, the pointer to the evaluation tape, // the pointer to the parameter values, the pointer to time coordinate(s) and the stride - these are all // defined in the main state and they are passed to the driver functions invocations. -template -void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_dc, std::uint32_t nvars, - std::uint32_t batch_size, bool high_accuracy, long long prec, +template +void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list &states, const SDC &s_dc, + std::uint32_t nvars, std::uint32_t batch_size, bool high_accuracy, const std::string &base_name, llvm::Value *main_eval_arr, llvm::Value *main_par_ptr, llvm::Value *main_time_ptr, llvm::Value *main_stride) { @@ -1729,7 +1730,7 @@ void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_d } // Fetch the internal fp type and its vector counterpart for the current state. - auto *fp_t = to_internal_llvm_type(*cur_state, prec); + auto *fp_t = llvm_clone_type(*cur_state, main_fp_t); auto *fp_vec_type = make_vector_type(fp_t, batch_size); // Fetch the current builder. 
@@ -1955,17 +1956,13 @@ void multi_cfunc_evaluate_segments(std::list &states, const SDC &s_d // LCOV_EXCL_STOP } -// NOTE: here we are forced to use a templated function, rather than passing in the -// LLVM type fp_t as usual, because we need to re-create the type for every context -// in every state, and there seems not to be an easy way to transfer/copy a type -// from one context to the other. -template -std::array -add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm::Value *in_ptr, llvm::Value *par_ptr, - llvm::Value *time_ptr, llvm::Value *stride, const std::vector &dc, std::uint32_t nvars, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t nuvars, std::uint32_t batch_size, bool high_accuracy, long long prec, - const std::string &base_name, llvm::Value *eval_arr) +std::array add_multi_cfunc_impl(llvm::Type *fp_t, std::list &states, llvm::Value *out_ptr, + llvm::Value *in_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr, + llvm::Value *stride, const std::vector &dc, + std::uint32_t nvars, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + std::uint32_t nuvars, std::uint32_t batch_size, bool high_accuracy, + const std::string &base_name, llvm::Value *eval_arr) { // Fetch the main state, module, etc. auto &main_state = states.back(); @@ -1973,7 +1970,10 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: auto &main_builder = main_state.builder(); // Fetch the fp types for the main state. - auto *main_fp_t = to_internal_llvm_type(main_state, prec); + // NOTE: cloning is safe here, as even though this function is being invoked + // in parallel from multiple threads, we have made sure that each invocation + // gets its own cloned copy of fp_t. 
+ auto *main_fp_t = llvm_clone_type(main_state, fp_t); auto *main_ext_fp_t = make_external_llvm_type(main_fp_t); auto *fp_vec_type = make_vector_type(main_fp_t, batch_size); @@ -2015,8 +2015,8 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: }); // Generate the code for the evaluation of all segments. - multi_cfunc_evaluate_segments(states, s_dc, nvars, batch_size, high_accuracy, prec, base_name, eval_arr, par_ptr, - time_ptr, stride); + multi_cfunc_evaluate_segments(main_fp_t, states, s_dc, nvars, batch_size, high_accuracy, base_name, eval_arr, + par_ptr, time_ptr, stride); // Write the results to the output pointer. cfunc_c_write_outputs(main_state, main_fp_t, out_ptr, cout_gl, eval_arr, par_ptr, stride, batch_size); @@ -2027,32 +2027,10 @@ add_multi_cfunc_impl(std::list &states, llvm::Value *out_ptr, llvm:: return {sz, al}; } -} // namespace - -// This function will compile several versions of the input function fn, with input variables vars, in compact mode. -// -// The compiled functions are implemented across several llvm_states which are collated together and returned as -// a single llvm_multi_state (this is the first element of the return tuple). If batch_size is 1, -// then 2 compiled functions are created - a scalar strided and a scalar unstrided version. -// If batch size is > 1, then an additional batch-mode strided compiled function is returned. -// The function names are created using "name" as base name and then mangling in the strided/unstrided -// property and the batch size. -// -// The second element of the return tuple is the decomposition of fn. -// -// The third element of the return tuple is a vector of pairs, each pair containing the size and alignment requirements -// for the externally-provided storage for the evaluation tape. If batch_size is 1, then only a single -// pair is returned, representing the size/alignment requirements for the scalar-mode evaluation tape. 
-// If batch_size > 1, then an additional pair is appended representing the size/alignment requirements -// for the batch-mode evaluation tape. -// -// NOTE: there is a bunch of boilerplate logic overlap here with add_cfunc_impl(). Make sure to -// coordinate changes between the two functions. -template std::tuple, std::vector>> -make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vector &fn, - const std::vector &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode, - long long prec) +make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::string &name, + const std::vector &fn, const std::vector &vars, std::uint32_t batch_size, + bool high_accuracy, bool parallel_mode) { if (batch_size == 0u) [[unlikely]] { throw std::invalid_argument("The batch size of a compiled function cannot be zero"); @@ -2062,27 +2040,6 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec throw std::invalid_argument("Parallel mode has not been implemented yet"); } -#if defined(HEYOKA_ARCH_PPC) - if constexpr (std::is_same_v) { - throw not_implemented_error("'long double' computations are not supported on PowerPC"); - } -#endif - -#if defined(HEYOKA_HAVE_REAL) - - if constexpr (std::is_same_v) { - const auto sprec = boost::numeric_cast(prec); - - if (sprec < mppp::real_prec_min() || sprec > mppp::real_prec_max()) [[unlikely]] { - throw std::invalid_argument( - fmt::format("An invalid precision value of {} was passed to make_multi_cfunc() (the " - "value must be in the [{}, {}] range)", - sprec, mppp::real_prec_min(), mppp::real_prec_max())); - } - } - -#endif - if (name.empty()) [[unlikely]] { throw std::invalid_argument("A non-empty function name is required when invoking make_multi_cfunc()"); } @@ -2126,9 +2083,26 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec tape_size_align.resize(2); } - // Helper to create a cfunc. + // NOTE: this is ugly, but needed. 
Cloning an LLVM type into another + // context is not a thread-safe operation as we might be poking into + // the context of the original type. Thus, we first make 2 or 3 clones + // of fp_t each associated to a different llvm_state without any multithreading, + // and then we use these clones for further cloning while parallel invoking + // create_cfunc(). + std::vector> fp_t_clones; + fp_t_clones.reserve(3); + for (auto i = 0; i < (batch_size == 1u ? 2 : 3); ++i) { + // Create a new state and clone fp_t into it. + auto new_state = tplt.make_similar(); + auto *new_fp_t = llvm_clone_type(new_state, fp_t); + + fp_t_clones.emplace_back(std::move(new_state), new_fp_t); + } + + // Helper to create a single cfunc. auto create_cfunc = [&states_lists, &tape_size_align, &tplt, &name, &dc = std::as_const(dc), nvars, nuvars, - high_accuracy, prec](bool strided, std::uint32_t cur_batch_size) { + high_accuracy, + &fp_t_clones = std::as_const(fp_t_clones)](bool strided, std::uint32_t cur_batch_size) { // NOTE: the batch unstrided variant is not supposed to be requested. assert(strided || cur_batch_size == 1u); @@ -2145,6 +2119,9 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec assert(states.empty()); + // Fetch the local cloned fp_t. + auto *loc_fp_t = fp_t_clones[sidx].second; + // Add a new state and fetch it. states.push_back(tplt.make_similar()); auto &s = states.back(); @@ -2228,8 +2205,8 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec builder.SetInsertPoint(bb); // Create the body of the function. 
- const auto tape_sa = add_multi_cfunc_impl(states, out_ptr, in_ptr, par_ptr, time_ptr, stride, dc, nvars, - nuvars, cur_batch_size, high_accuracy, prec, cur_name, tape_ptr); + const auto tape_sa = add_multi_cfunc_impl(loc_fp_t, states, out_ptr, in_ptr, par_ptr, time_ptr, stride, dc, + nvars, nuvars, cur_batch_size, high_accuracy, cur_name, tape_ptr); // Add the size/alignment requirements for the tape storage. // NOTE: there's no difference in requirements between strided and @@ -2258,7 +2235,8 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec // into a thread-safe tbb vector. // // At the moment though it looks like the practical gains from such further parallelisation - // would not be worth it, perhaps we can reconsider in the future. + // would not be worth it, perhaps we can reconsider in the future. It is also not clear how + // to deal with thread-unsafe type cloning in this hypothetical scenario. if (batch_size == 1u) { oneapi::tbb::parallel_invoke([&create_cfunc]() { create_cfunc(false, 1); }, [&create_cfunc]() { create_cfunc(true, 1); }); @@ -2285,11 +2263,66 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec std::move(dc), std::move(tape_size_align)); } +} // namespace + +// This function will compile several versions of the input function fn, with input variables vars, in compact mode. +// +// The compiled functions are implemented across several llvm_states which are collated together and returned as +// a single llvm_multi_state (this is the first element of the return tuple). If batch_size is 1, +// then 2 compiled functions are created - a scalar strided and a scalar unstrided version. +// If batch size is > 1, then an additional batch-mode strided compiled function is returned. +// The function names are created using "name" as base name and then mangling in the strided/unstrided +// property and the batch size. 
+// +// The second element of the return tuple is the decomposition of fn. +// +// The third element of the return tuple is a vector of pairs, each pair containing the size and alignment requirements +// for the externally-provided storage for the evaluation tape. If batch_size is 1, then only a single +// pair is returned, representing the size/alignment requirements for the scalar-mode evaluation tape. +// If batch_size > 1, then an additional pair is appended representing the size/alignment requirements +// for the batch-mode evaluation tape. +// +// NOTE: there is a bunch of boilerplate logic overlap here with add_cfunc_impl(). Make sure to +// coordinate changes between the two functions. +template +std::tuple, std::vector>> +make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector &fn, + const std::vector &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode, + long long prec) +{ +#if defined(HEYOKA_ARCH_PPC) + if constexpr (std::is_same_v) { + throw not_implemented_error("'long double' computations are not supported on PowerPC"); + } +#endif + +#if defined(HEYOKA_HAVE_REAL) + + if constexpr (std::is_same_v) { + const auto sprec = boost::numeric_cast(prec); + + if (sprec < mppp::real_prec_min() || sprec > mppp::real_prec_max()) [[unlikely]] { + throw std::invalid_argument( + fmt::format("An invalid precision value of {} was passed to make_multi_cfunc() (the " + "value must be in the [{}, {}] range)", + sprec, mppp::real_prec_min(), mppp::real_prec_max())); + } + } + +#endif + + // Fetch the internal scalar fp type from the template state. We will be cloning + // this throughout the rest of the implementation. + auto *fp_t = to_internal_llvm_type(tplt, prec); + + return make_multi_cfunc_impl(fp_t, tplt, name, fn, vars, batch_size, high_accuracy, parallel_mode); +} + // Explicit instantiations. 
#define HEYOKA_MAKE_MULTI_CFUNC_INST(T) \ template HEYOKA_DLL_PUBLIC \ std::tuple, std::vector>> \ - make_multi_cfunc(const llvm_state &, const std::string &, const std::vector &, \ + make_multi_cfunc(llvm_state, const std::string &, const std::vector &, \ const std::vector &, std::uint32_t, bool, bool, long long); HEYOKA_MAKE_MULTI_CFUNC_INST(float) From 79f0d7751c30160a2477cad5d1c525b7a94347ee Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 22 Aug 2024 09:41:58 +0200 Subject: [PATCH 06/11] Testing for the type cloning function. --- include/heyoka/detail/llvm_helpers.hpp | 2 +- src/detail/llvm_helpers.cpp | 5 ++-- test/llvm_helpers.cpp | 33 ++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp index cbb671415..cbc4e0395 100644 --- a/include/heyoka/detail/llvm_helpers.hpp +++ b/include/heyoka/detail/llvm_helpers.hpp @@ -67,7 +67,7 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t); HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *); -llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *); +HEYOKA_DLL_PUBLIC llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *); HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *); diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index 7c8fdb99d..7b307afa5 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -3360,9 +3360,10 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) } // Utility to create an identical copy of the type tp in the context of the state s. -// NOTE: although it may look like this is a read-only operation on tp, it is not, +// NOTE: although it may sound like this is a read-only operation on tp, it is not, // since we are potentially poking into the context of tp during operations. 
Thus, this -// function cannot be called concurrently from multiple threads on the same tp object. +// function cannot be called concurrently from multiple threads on the same tp object, +// or even on different tp objects defined in the same context. llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp) { assert(tp != nullptr); diff --git a/test/llvm_helpers.cpp b/test/llvm_helpers.cpp index a09f40f1a..ec13bd327 100644 --- a/test/llvm_helpers.cpp +++ b/test/llvm_helpers.cpp @@ -3030,3 +3030,36 @@ TEST_CASE("switch") #endif } + +TEST_CASE("clone type") +{ + using detail::llvm_clone_type; + + auto tester = [](fp_t) { + llvm_state source, dest; + + auto *tp_source = detail::to_external_llvm_type(source.context()); + auto *tp_dest = llvm_clone_type(dest, tp_source); + REQUIRE(tp_dest == detail::to_external_llvm_type(dest.context())); + + auto *vec_tp_source = detail::make_vector_type(tp_source, 4); + auto *vec_tp_dest = llvm_clone_type(dest, vec_tp_source); + REQUIRE(vec_tp_dest == detail::make_vector_type(tp_dest, 4)); + }; + + tuple_for_each(fp_types, tester); + +#if defined(HEYOKA_HAVE_REAL) + + llvm_state source, dest; + + auto *tp_ext_source = detail::to_external_llvm_type(source.context()); + auto *tp_ext_dest = llvm_clone_type(dest, tp_ext_source); + REQUIRE(tp_ext_dest == detail::to_external_llvm_type(dest.context())); + + auto *tp_int_source = detail::to_internal_llvm_type(source, 11); + auto *tp_int_dest = llvm_clone_type(dest, tp_int_source); + REQUIRE(tp_int_dest == detail::to_internal_llvm_type(dest, 11)); + +#endif +} From 731cbcccde8c9d5b2b35df927268fca88f25931e Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 22 Aug 2024 14:46:01 +0200 Subject: [PATCH 07/11] Small tweaks. 
--- src/detail/llvm_helpers.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index 7b307afa5..ba766450e 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -3364,6 +3364,11 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) // since we are potentially poking into the context of tp during operations. Thus, this // function cannot be called concurrently from multiple threads on the same tp object, // or even on different tp objects defined in the same context. +// NOTE: this handles only floating-point (vector) types at this time, extending +// to intgeral types should be fairly easy. +// NOTE: perhaps this function could be made more generic for arbitrary struct types +// by (recursively) reading the struct layout and then reproducing it in the target +// context. Like this, we could avoid special casing for the mppp::real types. llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp) { assert(tp != nullptr); @@ -3381,12 +3386,15 @@ llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp) ret_scal_t = llvm::Type::get##tid##Ty(ctx); \ break + // NOTE: gcov seems to get a bit confused by the macro usage. 
+ // LCOV_EXCL_START switch (tp_scal->getTypeID()) { HEYOKA_LLVM_CLONE_TYPE_IMPL(Float); HEYOKA_LLVM_CLONE_TYPE_IMPL(Double); HEYOKA_LLVM_CLONE_TYPE_IMPL(X86_FP80); HEYOKA_LLVM_CLONE_TYPE_IMPL(FP128); default: { + #if defined(HEYOKA_HAVE_REAL) if (const auto prec = llvm_is_real(tp_scal); prec != 0) { @@ -3400,14 +3408,14 @@ llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp) } #endif - // LCOV_EXCL_START + throw std::invalid_argument( fmt::format("Cannot clone the LLVM type '{}' to another context", llvm_type_name(tp))); - // LCOV_EXCL_STOP } } #undef HEYOKA_LLVM_CLONE_TYPE_IMPL + // LCOV_EXCL_STOP assert(ret_scal_t != nullptr); From 207011cb1cbe0a1a31bdb47cf988054a6344f31c Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 22 Aug 2024 14:58:13 +0200 Subject: [PATCH 08/11] Minor. --- src/detail/llvm_helpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index ba766450e..c5a4afc2c 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -3365,7 +3365,7 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) // function cannot be called concurrently from multiple threads on the same tp object, // or even on different tp objects defined in the same context. // NOTE: this handles only floating-point (vector) types at this time, extending -// to intgeral types should be fairly easy. +// to integral types should be fairly easy. // NOTE: perhaps this function could be made more generic for arbitrary struct types // by (recursively) reading the struct layout and then reproducing it in the target // context. Like this, we could avoid special casing for the mppp::real types. From e08fa543c0cff8016ee1732156de14608201758a Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 23 Aug 2024 09:16:57 +0200 Subject: [PATCH 09/11] A coupld of small simplifications. 
--- src/taylor_adaptive.cpp | 3 +-- src/taylor_adaptive_batch.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp index 38cf97974..3a8bb1bab 100644 --- a/src/taylor_adaptive.cpp +++ b/src/taylor_adaptive.cpp @@ -437,8 +437,7 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, // Add the function for the computation of // the dense output. - detail::taylor_add_d_out_function(m_llvm, detail::internal_llvm_type_like(m_llvm, m_state[0]), m_dim, m_order, 1, - high_accuracy); + detail::taylor_add_d_out_function(m_llvm, fp_t, m_dim, m_order, 1, high_accuracy); detail::get_logger()->trace("Taylor dense output runtime: {}", sw); sw.reset(); diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp index a8e0f6046..e97b4d0df 100644 --- a/src/taylor_adaptive_batch.cpp +++ b/src/taylor_adaptive_batch.cpp @@ -284,8 +284,7 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta // Add the function for the computation of // the dense output. - detail::taylor_add_d_out_function(m_llvm, detail::to_external_llvm_type(m_llvm.context()), m_dim, m_order, - m_batch_size, high_accuracy); + detail::taylor_add_d_out_function(m_llvm, ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy); detail::get_logger()->trace("Taylor batch dense output runtime: {}", sw); sw.reset(); From 8ae04c0ffc4d3d6cce5c14395aec1f670b19f2fc Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 23 Aug 2024 14:55:52 +0200 Subject: [PATCH 10/11] Fix typo. 
--- src/expression_cfunc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 549b26204..0eb30a861 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -1991,7 +1991,7 @@ std::array add_multi_cfunc_impl(llvm::Type *fp_t, std::list(get_alignment(main_md, fp_vec_type)); // NOTE: eval_arr is used as temporary storage for the current function, - // but it provided externally from dynamically-allocated memory in order to avoid stack overflow. + // but it is provided externally from dynamically-allocated memory in order to avoid stack overflow. // This creates a situation in which LLVM cannot elide stores into eval_arr // (even if it figures out a way to avoid storing intermediate results into // eval_arr) because LLVM must assume that some other function may From bc7bf4697b1b73085929a3aff4e32d4e3a3c6a4a Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 23 Aug 2024 15:11:50 +0200 Subject: [PATCH 11/11] Internal doc bit. --- src/expression_cfunc.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 0eb30a861..5eeb93538 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -1609,6 +1609,11 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list llvm_func_name_compare>; // Push back a new state and use it as initial current state. + // NOTE: like this, we always end up creating at least one driver + // function and a state, even in the degenerate case of an empty decomposition, + // which is suboptimal performance-wise. + // I do not think however that it is worth it to complicate the code to avoid + // this corner-case pessimisation. states.push_back(main_state.make_similar()); auto *cur_state = &states.back();