From b91b2407404e57f156d818c032325b7525b76983 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 22 Aug 2024 14:58:13 +0200 Subject: [PATCH 01/30] Minor. --- src/detail/llvm_helpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index ba766450e..c5a4afc2c 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -3365,7 +3365,7 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t) // function cannot be called concurrently from multiple threads on the same tp object, // or even on different tp objects defined in the same context. // NOTE: this handles only floating-point (vector) types at this time, extending -// to intgeral types should be fairly easy. +// to integral types should be fairly easy. // NOTE: perhaps this function could be made more generic for arbitrary struct types // by (recursively) reading the struct layout and then reproducing it in the target // context. Like this, we could avoid special casing for the mppp::real types. From 3e71b6e1d1ddf2c968364f72d160f20f750e88e6 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Sun, 25 Aug 2024 15:05:45 +0200 Subject: [PATCH 02/30] Some very preliminar, non-functional code. --- include/heyoka/detail/i_data.hpp | 21 +- include/heyoka/taylor.hpp | 6 +- src/detail/i_data.cpp | 57 ++- src/taylor_00.cpp | 41 +- src/taylor_02.cpp | 776 ++++++++++++++----------------- src/taylor_adaptive.cpp | 19 +- 6 files changed, 467 insertions(+), 453 deletions(-) diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp index 3f56994d4..205e0a943 100644 --- a/include/heyoka/detail/i_data.hpp +++ b/include/heyoka/detail/i_data.hpp @@ -15,12 +15,15 @@ #endif +#include +#include #include #include #include #include #include +#include #include #include #include @@ -64,8 +67,8 @@ struct taylor_adaptive::i_data { std::vector m_state; // Time. 
detail::dfloat m_time; - // The LLVM machinery. - llvm_state m_llvm; + // The LLVM (multi)state. + std::variant m_llvm_state; // Dimension of the system. std::uint32_t m_dim{}; // Taylor decomposition. @@ -78,10 +81,18 @@ struct taylor_adaptive::i_data { bool m_high_accuracy{}; // Compact mode. bool m_compact_mode{}; - // The steppers. + // The stepper types (non-compact mode). using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept; using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept; - std::variant m_step_f; + // The stepper types (compact mode). These have an additional argument - the tape pointer. + using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept; + using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept; + // The stepper. + std::variant m_step_f; + // Size/alignment for the compact mode tape. + std::array m_tape_sa{}; + // Compact mode tape. + detail::aligned_buffer_t m_tape; // The vector of parameters. std::vector m_pars; // The vector for the Taylor coefficients. 
@@ -118,6 +129,8 @@ struct taylor_adaptive::i_data { i_data &operator=(i_data &&) noexcept = delete; ~i_data(); + + void init_cm_tape(); }; template diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp index 5282f6885..46bcc7258 100644 --- a/include/heyoka/taylor.hpp +++ b/include/heyoka/taylor.hpp @@ -109,9 +109,9 @@ taylor_dc_t taylor_add_adaptive_step(llvm_state &, llvm::Type *, llvm::Type *, c llvm::Value *taylor_c_make_sv_funcs_arr(llvm_state &, const std::vector &); std::variant, std::vector> -taylor_compute_jet(llvm_state &, llvm::Type *, llvm::Value *, llvm::Value *, llvm::Value *, const taylor_dc_t &, - const std::vector &, std::uint32_t, std::uint32_t, std::uint32_t, std::uint32_t, bool, - bool, bool); +taylor_compute_jet(llvm_state &, llvm::Type *, llvm::Value *, llvm::Value *, llvm::Value *, llvm::Value *, + const taylor_dc_t &, const std::vector &, std::uint32_t, std::uint32_t, std::uint32_t, + std::uint32_t, bool, bool, bool); std::pair> taylor_c_diff_func_name_args(llvm::LLVMContext &, llvm::Type *, const std::string &, std::uint32_t, std::uint32_t, diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp index 51f15f257..572ca7655 100644 --- a/src/detail/i_data.cpp +++ b/src/detail/i_data.cpp @@ -8,9 +8,13 @@ #include +#include #include #include #include +#include + +#include #if defined(HEYOKA_HAVE_REAL128) @@ -24,6 +28,7 @@ #endif +#include #include #include #include @@ -89,18 +94,38 @@ void serialize(Archive &ar, std::tuple &tup, un HEYOKA_BEGIN_NAMESPACE +// Helper to initialise the compact-mode tape. Assumes an empty tape. 
+template +void taylor_adaptive::i_data::init_cm_tape() +{ + assert(!m_tape); + + const auto [sz, al] = m_tape_sa; + + if (m_compact_mode) { + assert(sz != 0u); + assert(al != 0u); + + m_tape = detail::make_aligned_buffer(sz, al); + } else { + assert(sz == 0u); + assert(al == 0u); + } +} + template void taylor_adaptive::i_data::save(boost::archive::binary_oarchive &ar, unsigned) const { ar << m_state; ar << m_time; - ar << m_llvm; + ar << m_llvm_state; ar << m_dim; ar << m_dc; ar << m_order; ar << m_tol; ar << m_high_accuracy; ar << m_compact_mode; + ar << m_tape_sa; ar << m_pars; ar << m_tc; ar << m_last_h; @@ -114,13 +139,14 @@ void taylor_adaptive::i_data::load(boost::archive::binary_iarchive &ar, unsig { ar >> m_state; ar >> m_time; - ar >> m_llvm; + ar >> m_llvm_state; ar >> m_dim; ar >> m_dc; ar >> m_order; ar >> m_tol; ar >> m_high_accuracy; ar >> m_compact_mode; + ar >> m_tape_sa; ar >> m_pars; ar >> m_tc; ar >> m_last_h; @@ -129,22 +155,35 @@ void taylor_adaptive::i_data::load(boost::archive::binary_iarchive &ar, unsig ar >> m_tm_data; // Recover the function pointers. - m_d_out_f = reinterpret_cast(m_llvm.jit_lookup("d_out_f")); + m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); + + // Reconstruct the compact mode tape, if necessary. + m_tape.reset(); + init_cm_tape(); } +// NOTE: this ctor provides only partial initialisation of the data members. +// The rest of the initialisation is performed from the integrator ctor. +// NOTE: m_llvm_state is inited as a single llvm_state regardless of the use +// of compact mode. It will be converted into a multi state if needed at a +// later stage. 
template -taylor_adaptive::i_data::i_data(llvm_state s) : m_llvm(std::move(s)) +taylor_adaptive::i_data::i_data(llvm_state s) : m_llvm_state(std::move(s)) { } template taylor_adaptive::i_data::i_data(const i_data &other) - : m_state(other.m_state), m_time(other.m_time), m_llvm(other.m_llvm), m_dim(other.m_dim), m_dc(other.m_dc), - m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy), - m_compact_mode(other.m_compact_mode), m_pars(other.m_pars), m_tc(other.m_tc), m_last_h(other.m_last_h), - m_d_out(other.m_d_out), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data) + : m_state(other.m_state), m_time(other.m_time), m_llvm_state(other.m_llvm_state), m_dim(other.m_dim), + m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy), + m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), m_pars(other.m_pars), m_tc(other.m_tc), + m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data) { - m_d_out_f = reinterpret_cast(m_llvm.jit_lookup("d_out_f")); + // Recover the function pointers. + m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); + + // Init the compact mode tape, if necessary. 
+ init_cm_tape(); } template diff --git a/src/taylor_00.cpp b/src/taylor_00.cpp index 6861f7b8a..b3e322a57 100644 --- a/src/taylor_00.cpp +++ b/src/taylor_00.cpp @@ -9,7 +9,9 @@ #include #include +#include #include +#include #include #include #include @@ -689,14 +691,24 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_ } // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -taylor_dc_t taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name, - const std::vector> &sys, - std::uint32_t batch_size, bool high_accuracy, bool compact_mode, - bool parallel_mode, std::uint32_t order) +std::tuple>, std::array> +taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name, + const std::vector> &sys, std::uint32_t batch_size, + bool high_accuracy, bool compact_mode, bool parallel_mode, std::uint32_t order) { - assert(!s.is_compiled()); + assert(!tplt.is_compiled()); assert(batch_size > 0u); + // Setup the return state(s) and fetch the main state. + auto ret_states = [compact_mode, &tplt]() -> std::variant> { + if (compact_mode) { + return std::vector{tplt.make_similar()}; + } else { + return tplt.make_similar(); + } + }(); + auto &s = compact_mode ? std::get<1>(ret_states)[0] : std::get<0>(ret_states); + // Record the number of equations/variables. const auto n_eq = boost::numeric_cast(sys.size()); @@ -715,14 +727,17 @@ taylor_dc_t taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm:: auto &md = s.module(); // Prepare the function prototype. The arguments are: + // // - pointer to the current state vector (read & write), // - pointer to the parameters (read only), // - pointer to the time value(s) (read only), // - pointer to the array of max timesteps (read & write), - // - pointer to the Taylor coefficients output (write only). 
+ // - pointer to the Taylor coefficients output (write only), + // - pointer to the tape (read & write, compact mode only). + // // These pointers cannot overlap. auto *fp_vec_t = make_vector_type(fp_t, batch_size); - const std::vector fargs(5, llvm::PointerType::getUnqual(ext_fp_t)); + const std::vector fargs(compact_mode ? 6 : 5, llvm::PointerType::getUnqual(context)); // The function does not return anything. auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); assert(ft != nullptr); @@ -760,14 +775,22 @@ taylor_dc_t taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm:: tc_ptr->addAttr(llvm::Attribute::NoAlias); tc_ptr->addAttr(llvm::Attribute::WriteOnly); + llvm::Argument *tape_ptr = nullptr; + if (compact_mode) { + tape_ptr = tc_ptr + 1; + tape_ptr->setName("tape_ptr"); + tape_ptr->addAttr(llvm::Attribute::NoCapture); + tape_ptr->addAttr(llvm::Attribute::NoAlias); + } + // Create a new basic block to start insertion into. auto *bb = llvm::BasicBlock::Create(context, "entry", f); assert(bb != nullptr); builder.SetInsertPoint(bb); // Compute the jet of derivatives at the given order. - auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, dc, {}, n_eq, n_uvars, order, - batch_size, compact_mode, high_accuracy, parallel_mode); + auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, tape_ptr, dc, {}, n_eq, n_uvars, + order, batch_size, compact_mode, high_accuracy, parallel_mode); // Determine the integration timestep. 
auto *h = taylor_determine_h(s, fp_t, diff_variant, sv_funcs_dc, nullptr, h_ptr, n_eq, n_uvars, order, batch_size, diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 130cdda90..0b824b892 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -11,18 +11,20 @@ #include #include #include +#include #include #include #include +#include #include #include -#include #include #include #include #include #include +#include #include #include @@ -500,35 +502,233 @@ void taylor_c_compute_sv_diffs(llvm_state &s, llvm::Type *fp_t, }); } -// For each segment in s_dc, this function will return a dict mapping an LLVM function -// f for the computation of a Taylor derivative to a size and a vector of std::functions. For example, one entry -// in the return value will read something like: -// {f : (2, [g_0, g_1, g_2])} -// The meaning in this example is that the arity of f is 3 and it will be called with 2 different -// sets of arguments. The g_i functions are expected to be called with input argument j in [0, 1] -// to yield the value of the i-th function argument for f at the j-th invocation. -auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vector &s_dc, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t batch_size, bool high_accuracy) +// Helper to perform the computation of the Taylor derivatives in compact mode across +// multiple LLVM states. +auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *main_par_ptr, + llvm::Value *main_time_ptr, llvm::Value *main_tape_ptr, const taylor_dc_t &dc, + const std::vector &s_dc, const std::vector &sv_funcs_dc, + std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, + bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx) { - // Log runtime in trace mode. - spdlog::stopwatch sw; + // TODO implement. + (void)parallel_mode; - // Init the return value. 
- // NOTE: use maps with name-based comparison for the functions. This ensures that the order in which these - // functions are invoked in taylor_compute_jet_compact_mode() is always the same. If we used directly pointer + // Generate the global arrays for the computation of the derivatives + // of the state variables in the main state. + const auto svd_gl = taylor_c_make_sv_diff_globals(main_state, main_fp_t, dc, n_uvars); + + // Structure used to log, in trace mode, the breakdown of each segment. + // For each segment, this structure contains the number of invocations + // of each function in the segment. It will be unused if we are not tracing. + std::vector> segment_bd; + + // Are we tracing? + const auto is_tracing = get_logger()->should_log(spdlog::level::trace); + + // List of evaluation functions in a segment. + // + // This map contains a list of functions for the compact-mode evaluation of Taylor derivatives. + // Each function is mapped to a pair, containing: + // + // - the number of times the function is to be invoked, + // - a list of functors (generators) that generate the arguments for + // the invocation. + // + // NOTE: we use maps with name-based comparison for the functions. This ensures that the order in which these + // functions are invoked is always the same. If we used directly pointer // comparisons instead, the order could vary across different executions and different platforms. The name // mangling we do when creating the function names should ensure that there are no possible name collisions. - std::vector< - std::map>>, - llvm_func_name_compare>> - retval; + using seg_f_list_t + = std::map>>, + llvm_func_name_compare>; + + // Init the list of states. + // NOTE: we use lists here because it is convenient to have + // pointer/reference stability when iteratively constructing + // the set of states. + std::list states; + + // Push back a new state and use it as initial current state. 
+ // NOTE: like this, we always end up creating at least one driver + // function and a state, even in the degenerate case of an empty decomposition, + // which is suboptimal peformance-wise. + // I do not think however that it is worth it to complicate the code to avoid + // this corner-case pessimisation. + states.push_back(main_state.make_similar()); + auto *cur_state = &states.back(); + + // Index of the state we are currently operating on. + boost::safe_numerics::safe cur_state_idx = 0; + + // Helper to create and return the prototype of a driver function in the state s. + auto make_driver_proto = [](llvm_state &s, unsigned cur_idx) { + auto &builder = s.builder(); + auto &md = s.module(); + auto &ctx = s.context(); + + // The arguments to the driver are: + // - a pointer to the tape, + // - pointers to par and time, + // - the current diff order. + auto *ptr_tp = llvm::PointerType::getUnqual(ctx); + std::vector fargs{ptr_tp, ptr_tp, ptr_tp, builder.getInt32Ty()}; + + // The driver does not return anything. + auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); + assert(ft != nullptr); // LCOV_EXCL_LINE + + // Now create the driver. + const auto cur_name = fmt::format("heyoka.cm_jet.driver_{}", cur_idx); + auto *f = llvm_func_create(ft, llvm::Function::ExternalLinkage, cur_name, &md); + // NOTE: the driver cannot call itself recursively. + f->addFnAttr(llvm::Attribute::NoRecurse); + + // Add the arguments' attributes. + // NOTE: no aliasing is assumed between the pointer + // arguments. 
+ auto *tape_arg = f->args().begin(); + tape_arg->setName("tape_ptr"); + tape_arg->addAttr(llvm::Attribute::NoCapture); + tape_arg->addAttr(llvm::Attribute::NoAlias); + + auto *par_ptr_arg = tape_arg + 1; + par_ptr_arg->setName("par_ptr"); + par_ptr_arg->addAttr(llvm::Attribute::NoCapture); + par_ptr_arg->addAttr(llvm::Attribute::NoAlias); + par_ptr_arg->addAttr(llvm::Attribute::ReadOnly); + + auto *time_ptr_arg = tape_arg + 2; + time_ptr_arg->setName("time_ptr"); + time_ptr_arg->addAttr(llvm::Attribute::NoCapture); + time_ptr_arg->addAttr(llvm::Attribute::NoAlias); + time_ptr_arg->addAttr(llvm::Attribute::ReadOnly); + + return f; + }; + + // TODO doc fix. + // Helper to compute the Taylor derivatives for a block. + // func is the LLVM function for the computation of the Taylor derivative in the block, + // ncalls the number of times it must be called, gens the generators for the + // function arguments and cur_order the order of the derivative. s is the llvm state + // in which we are computing the derivatives. + auto block_diff = [n_uvars](llvm_state &s, llvm::Function *func, std::uint32_t ncalls, const auto &gens, + llvm::Value *tape_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr, + llvm::Value *cur_order, llvm::Type *fp_vec_type) { + // LCOV_EXCL_START + assert(ncalls > 0u); + assert(!gens.empty()); + assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast(f); })); + // LCOV_EXCL_STOP + + // Fetch the builder for the current state. + auto &bld = s.builder(); + + // We will be manually unrolling loops if ncalls is small enough. + // This seems to help with compilation times. + constexpr auto max_unroll_n = 5u; + + if (ncalls > max_unroll_n) { + // Loop over the number of calls. + llvm_loop_u32(s, bld.getInt32(0), bld.getInt32(ncalls), [&](llvm::Value *cur_call_idx) { + // Create the u variable index from the first generator. + auto u_idx = gens[0](cur_call_idx); + + // Initialise the vector of arguments with which func must be called. 
The following + // initial arguments are always present: + // - current Taylor order, + // - u index of the variable, + // - tape of derivatives, + // - pointer to the param values, + // - pointer to the time value(s). + std::vector args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr}; + + // Create the other arguments via the generators. + for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { + args.push_back(gens[i](cur_call_idx)); + } + + // Calculate the derivative and store the result. + taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args)); + }); + } else { + // The manually-unrolled version of the above. + for (std::uint32_t idx = 0; idx < ncalls; ++idx) { + auto *cur_call_idx = bld.getInt32(idx); + auto u_idx = gens[0](cur_call_idx); + std::vector args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr}; + + for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { + args.push_back(gens[i](cur_call_idx)); + } + + taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args)); + } + } + }; + + // NOTE: unlike in compiled functions, we cannot at the same time + // declare and invoke the drivers from the main module as the invocation + // happens from within an LLVM loop. Thus, we first define the drivers + // in the states and add their declarations in the main state, and only + // at a later stage we perform the invocation of the drivers in the + // main state. + + // Declarations of the drivers in the main state. + std::vector main_driver_decls; + // Add the declaration for the first driver. + main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx)); + + // Add the driver declaration to the current state, + // and start insertion into the driver. 
+ cur_state->builder().SetInsertPoint( + llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx))); + + // Variable to keep track of how many blocks have been codegenned + // in the current state. + boost::safe_numerics::safe n_cg_blocks = 0; + + // Limit of codegenned blocks per state. + // NOTE: this has not been really properly tuned, + // needs more investigation. + constexpr auto max_n_cg_blocks = 20u; // Variable to keep track of the u variable // on whose definition we are operating. auto cur_u_idx = n_eq; + + // Iterate over the segments in s_dc. for (const auto &seg : s_dc) { - // This structure maps an LLVM function to sets of arguments + if (n_cg_blocks > max_n_cg_blocks) { + // We have codegenned enough blocks for this state. Create the return + // value for the current driver, and move to the next one. + cur_state->builder().CreateRetVoid(); + + // Create the new current state. + states.push_back(main_state.make_similar()); + cur_state = &states.back(); + + // Reset/update the counters. + n_cg_blocks = 0; + ++cur_state_idx; + + // Add the driver declaration to the main state. + main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx)); + + // Add the driver declaration to the current state, + // and start insertion into the driver. + cur_state->builder().SetInsertPoint( + llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx))); + } + + // Fetch the internal fp type and its vector counterpart for the current state. + auto *fp_t = llvm_clone_type(*cur_state, main_fp_t); + auto *fp_vec_type = make_vector_type(fp_t, batch_size); + + // Fetch the current builder. + auto &cur_builder = cur_state->builder(); + + // This structure maps a function to sets of arguments // with which the function is to be called. For instance, if function // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map // will contain {f : [[a, b, c], [d, e, f]]}. 
@@ -547,7 +747,7 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect for (const auto &ex : seg) { // Get the function for the computation of the derivative. - auto *func = taylor_c_diff_func(s, fp_t, ex.first, n_uvars, batch_size, high_accuracy); + auto *func = taylor_c_diff_func(*cur_state, fp_t, ex.first, n_uvars, batch_size, high_accuracy); // Insert the function into tmp_map. const auto [it, is_new_func] = tmp_map.try_emplace(func); @@ -610,9 +810,8 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect } } - // Add a new entry in retval for the current segment. - retval.emplace_back(); - auto &a_map = retval.back(); + // Create the seg_f_list_t for the current segment. + seg_f_list_t seg_map; for (const auto &[func, vv] : tmp_map_transpose) { // NOTE: vv.size() is now the number of arguments. We know it cannot @@ -622,7 +821,7 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect assert(!vv.empty()); // LCOV_EXCL_LINE // Add the function. - const auto [it, ins_status] = a_map.try_emplace(func); + const auto [it, ins_status] = seg_map.try_emplace(func); assert(ins_status); // LCOV_EXCL_LINE // Set the number of calls for this function. @@ -633,447 +832,180 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect // Create the g functions for each argument. for (const auto &v : vv) { it->second.second.push_back(std::visit( - [&s, fp_t](const auto &x) { + [cur_state, fp_t](const auto &x) { using type = uncvref_t; if constexpr (std::is_same_v>) { - return cm_make_arg_gen_vidx(s, x); + return cm_make_arg_gen_vidx(*cur_state, x); } else { - return cm_make_arg_gen_vc(s, fp_t, x); + return cm_make_arg_gen_vc(*cur_state, fp_t, x); } }, v)); } } - } - get_logger()->trace("Taylor build function maps runtime: {}", sw); + // Fetch the arguments from the driver prototype. 
+ auto *driver_f = cur_builder.GetInsertBlock()->getParent(); + auto *tape_ptr = driver_f->args().begin(); + auto *par_ptr = driver_f->args().begin() + 1; + auto *time_ptr = driver_f->args().begin() + 2; + auto *cur_order = driver_f->args().begin() + 3; - // LCOV_EXCL_START - // Log a breakdown of the return value in trace mode. - if (get_logger()->should_log(spdlog::level::trace)) { - std::vector> fm_bd; + // Compute the derivatives for this segment. + for (const auto &[func, fpair] : seg_map) { + const auto &[ncalls, gens] = fpair; + + block_diff(*cur_state, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type); + } - for (const auto &m : retval) { - fm_bd.emplace_back(); + // Update the number of codegenned blocks. + n_cg_blocks += seg_map.size(); - for (const auto &p : m) { - fm_bd.back().push_back(p.second.first); + // LCOV_EXCL_START + // Update segment_bd if needed. + if (is_tracing) { + segment_bd.emplace_back(); + + for (const auto &p : seg_map) { + segment_bd.back().push_back(p.second.first); } } + // LCOV_EXCL_STOP + } - get_logger()->trace("Taylor function maps breakdown: {}", fm_bd); + // We need one last return statement for the last added state. + cur_state->builder().CreateRetVoid(); + + // LCOV_EXCL_START + // Log segment_bd, if needed. + if (is_tracing) { + get_logger()->trace("Taylor function maps breakdown: {}", segment_bd); } // LCOV_EXCL_STOP - return retval; + // Back in the main state, we begin by invoking all the drivers with order zero. + // That is, we are computing the initial values of the u variables. + auto &main_bld = main_state.builder(); + for (auto *cur_driver_f : main_driver_decls) { + main_bld.CreateCall(cur_driver_f, {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(0)}); + } + + // Next, we compute all derivatives up to order 'order - 1'. + llvm_loop_u32(main_state, main_bld.getInt32(1), main_bld.getInt32(order), [&](llvm::Value *cur_order) { + // State variables first. 
+ taylor_c_compute_sv_diffs(main_state, main_fp_t, svd_gl, main_tape_ptr, main_par_ptr, n_uvars, cur_order, + batch_size); + + // The other u variables. + for (auto *cur_driver_f : main_driver_decls) { + main_bld.CreateCall(cur_driver_f, {main_tape_ptr, main_par_ptr, main_time_ptr, cur_order}); + } + }); + + // Next, we compute the last-order derivatives for the state variables. + taylor_c_compute_sv_diffs(main_state, main_fp_t, svd_gl, main_tape_ptr, main_par_ptr, n_uvars, + main_bld.getInt32(order), batch_size); + + // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs + // correspond to u variables somewhere in the decomposition, we will have to compute the + // last-order derivatives of the u variables until we are sure all sv_funcs derivatives + // have been properly computed. + if (max_svf_idx >= n_eq) { + // Monitor the starting index of the current + // segment while iterating on the segments. + auto cur_start_u_idx = n_eq; + + for (decltype(s_dc.size()) seg_idx = 0; seg_idx < s_dc.size(); ++seg_idx) { + if (cur_start_u_idx > max_svf_idx) { + // We computed all the necessary derivatives, break out. + break; + } + + // Invoke the driver for the current segment. + main_bld.CreateCall(main_driver_decls[seg_idx], + {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(order)}); + + // Update cur_start_u_idx. + cur_start_u_idx += static_cast(s_dc[seg_idx].size()); + } + } } // Helper for the computation of a jet of derivatives in compact mode, -// used in taylor_compute_jet(). -std::pair taylor_compute_jet_compact_mode( +// used in taylor_compute_jet(). The return value are the size/alignment +// requirements for the tape of derivatives. All LLVM values and types +// passed to this function are defined in the main state. 
+std::array taylor_compute_jet_compact_mode( // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - llvm_state &s, llvm::Type *fp_type, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, - const taylor_dc_t &dc, const std::vector &sv_funcs_dc, std::uint32_t n_eq, std::uint32_t n_uvars, - std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode) + llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, + llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector &sv_funcs_dc, std::uint32_t n_eq, + std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode) { - auto &builder = s.builder(); - auto &context = s.context(); - auto &md = s.module(); + auto &main_bld = main_state.builder(); + auto &main_md = main_state.module(); + + // Determine the vector type corresponding to main_fp_t. + auto *main_fp_vec_t = make_vector_type(main_fp_t, batch_size); // Fetch the external type corresponding to fp_type. - auto *ext_fp_t = make_external_llvm_type(fp_type); + auto *main_ext_fp_t = make_external_llvm_type(main_fp_t); // Split dc into segments. const auto s_dc = taylor_segment_dc(dc, n_eq); - // Generate the function maps. - const auto f_maps = taylor_build_function_maps(s, fp_type, s_dc, n_eq, n_uvars, batch_size, high_accuracy); - - // Log the runtime of IR construction in trace mode. - spdlog::stopwatch sw; - - // Generate the global arrays for the computation of the derivatives - // of the state variables. - const auto svd_gl = taylor_c_make_sv_diff_globals(s, fp_type, dc, n_uvars); - // Determine the maximum u variable index appearing in sv_funcs_dc, or zero // if sv_funcs_dc is empty. - const auto max_svf_idx = sv_funcs_dc.empty() ? static_cast(0) - : *std::max_element(sv_funcs_dc.begin(), sv_funcs_dc.end()); + const auto max_svf_idx + = sv_funcs_dc.empty() ? 
static_cast(0) : *std::ranges::max_element(sv_funcs_dc); - // Prepare the array that will contain the jet of derivatives. + // Determine the total number of elements to be stored in the tape of derivatives. // We will be storing all the derivatives of the u variables // up to order 'order - 1', the derivatives of order // 'order' of the state variables and the derivatives // of order 'order' of the sv_funcs. - // NOTE: the array size is specified as a 64-bit integer in the - // LLVM API. - // NOTE: fp_type is the original, scalar floating-point type. - // It will be turned into a vector type (if necessary) by - // make_vector_type() below. // NOTE: if sv_funcs_dc is empty, or if all its indices are not greater // than the indices of the state variables, then we don't need additional // slots after the sv derivatives. If we need additional slots, allocate // another full column of derivatives, as it is complicated at this stage // to know exactly how many slots we will need. - auto *fp_vec_type = make_vector_type(fp_type, batch_size); - auto *diff_array_type - = llvm::ArrayType::get(fp_vec_type, (max_svf_idx < n_eq) ? (n_uvars * order + n_eq) : (n_uvars * (order + 1u))); - - // Make the global array and fetch a pointer to its first element. - // NOTE: we use a global array rather than a local one here because - // its size can grow quite large, which can lead to stack overflow issues. - // This has of course consequences in terms of thread safety, which - // we will have to document. - auto *diff_arr_gvar = make_global_zero_array(md, diff_array_type); - auto *diff_arr - = builder.CreateInBoundsGEP(diff_array_type, diff_arr_gvar, {builder.getInt32(0), builder.getInt32(0)}); - - // NOTE: diff_arr is used as temporary storage for the current function, - // but it is declared as a global variable in order to avoid stack overflow. 
- // This creates a situation in which LLVM cannot elide stores into diff_arr + // NOTE: overflow checking for this computation has been performed externally. + const auto tot_tape_N = (max_svf_idx < n_eq) ? (n_uvars * order + n_eq) : (n_uvars * (order + 1u)); + + // Total required size in bytes for the tape. + const auto tape_sz = boost::safe_numerics::safe(get_size(main_md, main_fp_vec_t)) * tot_tape_N; + + // Tape alignment. + const auto tape_al = boost::numeric_cast(get_alignment(main_md, main_fp_vec_t)); + + // Log the runtime of IR construction in trace mode. + spdlog::stopwatch sw; + + // NOTE: tape_ptr is used as temporary storage for the current function, + // but it is provided externally from dynamically-allocated memory in order to avoid stack overflow. + // This creates a situation in which LLVM cannot elide stores into tape_ptr // (even if it figures out a way to avoid storing intermediate results into - // diff_arr) because LLVM must assume that some other function may + // it) because LLVM must assume that some other function may // use these stored values later. Thus, we declare via an intrinsic that the - // lifetime of diff_arr begins here and ends at the end of the function, + // lifetime of tape_ptr begins here and ends at the end of the function, // so that LLVM can assume that any value stored in it cannot be possibly // used outside this function. - builder.CreateLifetimeStart(diff_arr, builder.getInt64(get_size(md, diff_array_type))); + main_bld.CreateLifetimeStart(tape_ptr, main_bld.getInt64(tape_sz)); // Copy over the order-0 derivatives of the state variables. // NOTE: overflow checking is already done in the parent function. - llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { + llvm_loop_u32(main_state, main_bld.getInt32(0), main_bld.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { // Fetch the pointer from order0. 
- auto *ptr - = builder.CreateInBoundsGEP(ext_fp_t, order0, builder.CreateMul(cur_var_idx, builder.getInt32(batch_size))); + auto *ptr = main_bld.CreateInBoundsGEP(main_ext_fp_t, order0, + main_bld.CreateMul(cur_var_idx, main_bld.getInt32(batch_size))); // Load as a vector. - auto *vec = ext_load_vector_from_memory(s, fp_type, ptr, batch_size); - - // Store into diff_arr. - taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, builder.getInt32(0), cur_var_idx, vec); - }); - - // NOTE: these are used only in parallel mode. - std::vector> par_funcs_ptrs; - llvm::Value *gl_par_data = nullptr; - llvm::Type *par_data_t = nullptr; - - if (parallel_mode) { - auto *ext_fp_ptr_t = llvm::PointerType::getUnqual(ext_fp_t); - - // NOTE: we will use a global variable with these fields: - // - // - int32 (current Taylor order), - // - T * (pointer to the runtime parameters), - // - T * (pointer to the time coordinate(s)), - // - // to pass the data necessary to the parallel workers. - par_data_t = llvm::StructType::get(context, {builder.getInt32Ty(), ext_fp_ptr_t, ext_fp_ptr_t}); - // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) - gl_par_data = new llvm::GlobalVariable(md, par_data_t, false, llvm::GlobalVariable::InternalLinkage, - llvm::ConstantAggregateZero::get(par_data_t)); - - // Write the par/time pointers into the global struct (unlike the current order, this needs - // to be done only once). - builder.CreateStore( - par_ptr, builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(1)})); - builder.CreateStore( - time_ptr, builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(2)})); - - // Fetch the function types for the parallel worker and the wrapper. 
- auto *worker_t - = llvm::FunctionType::get(builder.getVoidTy(), {builder.getInt32Ty(), builder.getInt32Ty()}, false); - assert(worker_t != nullptr); // LCOV_EXCL_LINE - - auto *wrapper_t = llvm::FunctionType::get(builder.getVoidTy(), {}, false); - assert(wrapper_t != nullptr); // LCOV_EXCL_LINE - - for (const auto &map : f_maps) { - par_funcs_ptrs.emplace_back(); - - for (const auto &p : map) { - // The LLVM function for the computation of the - // derivative in compact mode. - const auto &func = p.first; - - // The number of func calls. - const auto ncalls = p.second.first; - - // The generators for the arguments of func. - const auto &gens = p.second.second; - - // Fetch the current insertion block. - auto *orig_bb = builder.GetInsertBlock(); - - // Create the worker function. - auto *worker = llvm::Function::Create(worker_t, llvm::Function::InternalLinkage, "", &md); - assert(worker != nullptr); // LCOV_EXCL_LINE - - // Fetch the function arguments. - auto *b_idx = worker->args().begin(); - auto *e_idx = worker->args().begin() + 1; - - // Create a new basic block to start insertion into. - builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", worker)); - - // Load the order and par/time pointers from the global variable. - auto *cur_order = builder.CreateLoad( - builder.getInt32Ty(), - builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(0)})); - auto *par_arg = builder.CreateLoad( - ext_fp_ptr_t, - builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(1)})); - auto *time_arg = builder.CreateLoad( - ext_fp_ptr_t, - builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(2)})); - - // Iterate over the range. - llvm_loop_u32(s, b_idx, e_idx, [&](llvm::Value *cur_call_idx) { - // Create the u variable index from the first generator. - auto *u_idx = gens[0](cur_call_idx); - - // Initialise the vector of arguments with which func must be called. 
The following - // initial arguments are always present: - // - current Taylor order, - // - u index of the variable, - // - array of derivatives, - // - pointer to the param values, - // - pointer to the time value(s). - std::vector args{cur_order, u_idx, diff_arr, par_arg, time_arg}; - - // Create the other arguments via the generators. - for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { - args.push_back(gens[i](cur_call_idx)); - } - - // Calculate the derivative and store the result. - taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, cur_order, u_idx, - builder.CreateCall(func, args)); - }); - - // Return. - builder.CreateRetVoid(); - - // Create the wrapper function. This will execute multiple calls - // to the worker in parallel, until the entire range [0, ncalls) has - // been consumed. - auto *wrapper = llvm::Function::Create(wrapper_t, llvm::Function::InternalLinkage, "", &md); - assert(wrapper != nullptr); // LCOV_EXCL_LINE - - // Create a new basic block to start insertion into. - builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", wrapper)); - - // Invoke the parallel looper. - llvm_invoke_external( - s, "heyoka_cm_par_looper", builder.getVoidTy(), {builder.getInt32(ncalls), worker}, - llvm::AttributeList::get(context, llvm::AttributeList::FunctionIndex, - {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn})); - - // Return. - builder.CreateRetVoid(); - - // Restore the original insertion block. - builder.SetInsertPoint(orig_bb); - - // Add a pointer to the wrapper to par_funcs_ptrs. - auto *f_ptr = builder.CreateAlloca(wrapper->getType()); - builder.CreateStore(wrapper, f_ptr); - par_funcs_ptrs.back().push_back(f_ptr); - } - } - } - - // Helper to compute the Taylor derivatives for a block. 
- // func is the LLVM function for the computation of the Taylor derivative in the block, - // ncalls the number of times it must be called, gens the generators for the - // function arguments and cur_order the order of the derivative. - auto block_diff = [&](llvm::Function *func, std::uint32_t ncalls, const auto &gens, llvm::Value *cur_order) { - // LCOV_EXCL_START - assert(ncalls > 0u); - assert(!gens.empty()); - assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast(f); })); - // LCOV_EXCL_STOP - - // We will be manually unrolling loops if ncalls is small enough. - // This seems to help with compilation times. - constexpr auto max_unroll_n = 5u; - - if (ncalls > max_unroll_n) { - // Loop over the number of calls. - llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) { - // Create the u variable index from the first generator. - auto u_idx = gens[0](cur_call_idx); - - // Initialise the vector of arguments with which func must be called. The following - // initial arguments are always present: - // - current Taylor order, - // - u index of the variable, - // - array of derivatives, - // - pointer to the param values, - // - pointer to the time value(s). - std::vector args{cur_order, u_idx, diff_arr, par_ptr, time_ptr}; - - // Create the other arguments via the generators. - for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { - args.push_back(gens[i](cur_call_idx)); - } - - // Calculate the derivative and store the result. - taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, cur_order, u_idx, - builder.CreateCall(func, args)); - }); - } else { - // The manually-unrolled version of the above. 
- for (std::uint32_t idx = 0; idx < ncalls; ++idx) { - auto *cur_call_idx = builder.getInt32(idx); - auto u_idx = gens[0](cur_call_idx); - std::vector args{cur_order, u_idx, diff_arr, par_ptr, time_ptr}; - - for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { - args.push_back(gens[i](cur_call_idx)); - } - - taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, cur_order, u_idx, - builder.CreateCall(func, args)); - } - } - }; - - // Helper to compute concurrently all the derivatives - // in a segment using the parallel wrappers. - auto parallel_segment_diff = [&](const auto &pfptrs) { - assert(!pfptrs.empty()); // LCOV_EXCL_LINE - - // NOTE: we can invoke in parallel only up to a fixed number - // of wrappers. Thus, we process them in chunks. - - // The remaining number of wrappers to invoke. - auto rem = pfptrs.size(); - - // Starting index in pfptrs. - decltype(rem) start_idx = 0; - - while (rem != 0u) { - // Current chunk size. - const auto cur_size = std::min(static_cast(HEYOKA_CM_PAR_MAX_INVOKE_N), rem); - - // Setup the function name. - const auto fname = fmt::format("heyoka_cm_par_invoke_{}", cur_size); - - // Setup the function arguments. - std::vector args; - for (auto i = start_idx; i < start_idx + cur_size; ++i) { - assert(i < pfptrs.size()); // LCOV_EXCL_LINE - auto *ptr = pfptrs[i]; - args.push_back(builder.CreateLoad(ptr->getAllocatedType(), ptr)); - } - - // Invoke. - llvm_invoke_external(s, fname, builder.getVoidTy(), args, - llvm::AttributeList::get(context, llvm::AttributeList::FunctionIndex, - {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn})); - - // Update rem and start_idx. - rem -= cur_size; - start_idx += cur_size; - } - }; - - // Helper to compute and store the derivatives of order cur_order - // of the u variables which are not state variables. - auto compute_u_diffs = [&](llvm::Value *cur_order) { - if (parallel_mode) { - // Store the current order in the global struct. 
- builder.CreateStore(cur_order, builder.CreateInBoundsGEP(par_data_t, gl_par_data, - {builder.getInt32(0), builder.getInt32(0)})); - - // For each segment, invoke the wrapper functions concurrently. - for (const auto &pfptrs : par_funcs_ptrs) { - parallel_segment_diff(pfptrs); - } - } else { - // For each block in each segment, compute the derivatives - // of order cur_order serially. - for (const auto &map : f_maps) { - for (const auto &p : map) { - block_diff(p.first, p.second.first, p.second.second, cur_order); - } - } - } - }; - - // Compute the order-0 derivatives (i.e., the initial values) - // for all u variables which are not state variables. - compute_u_diffs(builder.getInt32(0)); - - // Compute all derivatives up to order 'order - 1'. - llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order), [&](llvm::Value *cur_order) { - // State variables first. - taylor_c_compute_sv_diffs(s, fp_type, svd_gl, diff_arr, par_ptr, n_uvars, cur_order, batch_size); + auto *vec = ext_load_vector_from_memory(main_state, main_fp_t, ptr, batch_size); - // The other u variables. - compute_u_diffs(cur_order); + // Store into tape_ptr. + taylor_c_store_diff(main_state, main_fp_vec_t, tape_ptr, n_uvars, main_bld.getInt32(0), cur_var_idx, vec); }); - // Compute the last-order derivatives for the state variables. - taylor_c_compute_sv_diffs(s, fp_type, svd_gl, diff_arr, par_ptr, n_uvars, builder.getInt32(order), batch_size); - - // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs - // correspond to u variables in the decomposition, we will have to compute the - // last-order derivatives of the u variables until we are sure all sv_funcs derivatives - // have been properly computed. - if (max_svf_idx >= n_eq) { - // Monitor the starting index of the current - // segment while iterating on the segments. - auto cur_start_u_idx = n_eq; - - if (parallel_mode) { - // Store the derivative order in the global struct. 
- builder.CreateStore( - builder.getInt32(order), - builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(0)})); - - for (decltype(f_maps.size()) i = 0; i < f_maps.size(); ++i) { - if (cur_start_u_idx > max_svf_idx) { - // We computed all the necessary derivatives, break out. - break; - } - - // Compute the derivatives for the current segment. - parallel_segment_diff(par_funcs_ptrs[i]); - - // Update cur_start_u_idx, taking advantage of the fact - // that each block in a segment processes the derivatives - // of exactly ncalls u variables. - for (const auto &p : f_maps[i]) { - const auto ncalls = p.second.first; - cur_start_u_idx += ncalls; - } - } - } else { - for (const auto &map : f_maps) { - if (cur_start_u_idx > max_svf_idx) { - // We computed all the necessary derivatives, break out. - break; - } - - // Compute the derivatives of all the blocks in the segment. - for (const auto &p : map) { - const auto ncalls = p.second.first; - - block_diff(p.first, ncalls, p.second.second, builder.getInt32(order)); - - // Update cur_start_u_idx taking advantage of the fact - // that each block in a segment processes the derivatives - // of exactly ncalls u variables. - cur_start_u_idx += ncalls; - } - } - } - } - get_logger()->trace("Taylor IR creation compact mode runtime: {}", sw); // Return the array of derivatives of the u variables and its type. @@ -1114,26 +1046,28 @@ auto taylor_load_values(llvm_state &s, llvm::Type *fp_t, llvm::Value *in, std::u // order is the max derivative order desired, batch_size the batch size. // order0 is a pointer to an array of (at least) n_eq * batch_size scalar elements // containing the derivatives of order 0. par_ptr is a pointer to an array containing -// the numerical values of the parameters, time_ptr a pointer to the time value(s). 
-// sv_funcs are the indices, in the decomposition, of the functions of state +// the numerical values of the parameters, time_ptr a pointer to the time value(s), +// tape_ptr a pointer to the tape of derivatives (only in compact mode, otherwise +// a null value). sv_funcs are the indices, in the decomposition, of the functions of state // variables. // // order0, par_ptr and time_ptr are all external pointers. // // The return value is a variant containing either: -// - in compact mode, the array containing the derivatives of all u variables, +// - in compact mode, the size/alignment requirements for the tape of derivatives, // - otherwise, the jet of derivatives of the state variables and sv_funcs // up to order 'order'. -std::variant, std::vector> +std::variant, std::vector> taylor_compute_jet(llvm_state &s, llvm::Type *fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, - const taylor_dc_t &dc, const std::vector &sv_funcs_dc, std::uint32_t n_eq, - std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool compact_mode, - bool high_accuracy, bool parallel_mode) + llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector &sv_funcs_dc, + std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, + bool compact_mode, bool high_accuracy, bool parallel_mode) { // LCOV_EXCL_START assert(batch_size > 0u); assert(n_eq > 0u); assert(order > 0u); + assert((tape_ptr != nullptr) == compact_mode); // LCOV_EXCL_STOP // Make sure we can represent n_uvars * (order + 1) as a 32-bit @@ -1155,8 +1089,8 @@ taylor_compute_jet(llvm_state &s, llvm::Type *fp_t, llvm::Value *order0, llvm::V // LCOV_EXCL_STOP if (compact_mode) { - return taylor_compute_jet_compact_mode(s, fp_t, order0, par_ptr, time_ptr, dc, sv_funcs_dc, n_eq, n_uvars, - order, batch_size, high_accuracy, parallel_mode); + return taylor_compute_jet_compact_mode(s, fp_t, order0, par_ptr, time_ptr, tape_ptr, dc, sv_funcs_dc, n_eq, + n_uvars, order, 
batch_size, high_accuracy, parallel_mode); } else { // Log the runtime of IR construction in trace mode. spdlog::stopwatch sw; diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp index 3a8bb1bab..ffaa6b5f2 100644 --- a/src/taylor_adaptive.cpp +++ b/src/taylor_adaptive.cpp @@ -180,7 +180,7 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_time); - HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_last_h); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tol); @@ -386,12 +386,15 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, m_order = detail::taylor_order_from_tol(m_tol); // Determine the external fp type. - auto *ext_fp_t = detail::to_external_llvm_type(m_llvm.context()); + auto *ext_fp_t = detail::to_external_llvm_type(std::get<0>(m_llvm_state).context()); // Determine the internal fp type. // NOTE: in case of mppp::real, we ensured earlier that the tolerance value // has the correct precision, so that internal_llvm_type_like() will yield the correct internal type. - auto *fp_t = detail::internal_llvm_type_like(m_llvm, m_tol); + auto *fp_t = detail::internal_llvm_type_like(std::get<0>(m_llvm_state), m_tol); + + // The state(s) which will be returned by the construction of the stepper function. + std::variant> states; // Add the stepper function. 
if (with_events) { @@ -404,11 +407,13 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, ee.push_back(ev.get_expression()); } - m_dc = detail::taylor_add_adaptive_step_with_events(m_llvm, ext_fp_t, fp_t, "step_e", sys, 1, compact_mode, ee, - high_accuracy, parallel_mode, m_order); + std::tie(m_dc, states) + = detail::taylor_add_adaptive_step_with_events(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step_e", sys, 1, + compact_mode, ee, high_accuracy, parallel_mode, m_order); } else { - m_dc = detail::taylor_add_adaptive_step(m_llvm, ext_fp_t, fp_t, "step", sys, 1, high_accuracy, compact_mode, - parallel_mode, m_order); + std::tie(m_dc, states) + = detail::taylor_add_adaptive_step(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step", sys, 1, high_accuracy, + compact_mode, parallel_mode, m_order); } // Fix m_pars' size, if necessary. From 06c47b4ecb2f33567d8f0f3a3a0254f49be99af1 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Mon, 26 Aug 2024 15:12:29 +0200 Subject: [PATCH 03/30] More non-functional WIP. [skip ci] --- include/heyoka/detail/i_data.hpp | 4 + include/heyoka/taylor.hpp | 20 +- src/detail/i_data.cpp | 14 +- src/taylor_00.cpp | 178 ++++---- src/taylor_02.cpp | 753 +++++++++++++++++-------------- src/taylor_adaptive.cpp | 130 ++++-- 6 files changed, 633 insertions(+), 466 deletions(-) diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp index 205e0a943..be49f7f49 100644 --- a/include/heyoka/detail/i_data.hpp +++ b/include/heyoka/detail/i_data.hpp @@ -69,6 +69,10 @@ struct taylor_adaptive::i_data { detail::dfloat m_time; // The LLVM (multi)state. std::variant m_llvm_state; + // A template LLVM state we keep around to create states + // similar to m_llvm_state as needed. This is created with the + // same settings as m_llvm_state. + llvm_state m_tplt_state; // Dimension of the system. std::uint32_t m_dim{}; // Taylor decomposition. 
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp index 46bcc7258..39498c1e1 100644 --- a/include/heyoka/taylor.hpp +++ b/include/heyoka/taylor.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -98,17 +99,19 @@ HEYOKA_DLL_PUBLIC llvm::Value *taylor_c_load_diff(llvm_state &, llvm::Type *, ll HEYOKA_DLL_PUBLIC void taylor_c_store_diff(llvm_state &, llvm::Type *, llvm::Value *, std::uint32_t, llvm::Value *, llvm::Value *, llvm::Value *); -taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &, llvm::Type *, llvm::Type *, const std::string &, - const std::vector> &, std::uint32_t, - bool, const std::vector &, bool, bool, std::uint32_t); +std::tuple, std::vector> +taylor_add_adaptive_step_with_events(llvm_state &, llvm::Type *, const std::string &, + const std::vector> &, std::uint32_t, bool, + const std::vector &, bool, bool, std::uint32_t); -taylor_dc_t taylor_add_adaptive_step(llvm_state &, llvm::Type *, llvm::Type *, const std::string &, - const std::vector> &, std::uint32_t, bool, bool, - bool, std::uint32_t); +std::tuple, std::vector> +taylor_add_adaptive_step(llvm_state &, llvm::Type *, llvm::Type *, const std::string &, + const std::vector> &, std::uint32_t, bool, bool, bool, + std::uint32_t); llvm::Value *taylor_c_make_sv_funcs_arr(llvm_state &, const std::vector &); -std::variant, std::vector> +std::variant, std::vector>, std::vector> taylor_compute_jet(llvm_state &, llvm::Type *, llvm::Value *, llvm::Value *, llvm::Value *, llvm::Value *, const taylor_dc_t &, const std::vector &, std::uint32_t, std::uint32_t, std::uint32_t, std::uint32_t, bool, bool, bool); @@ -507,6 +510,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada explicit taylor_adaptive(private_ctor_t, llvm_state); HEYOKA_DLL_LOCAL void check_variational(const char *) const; + HEYOKA_DLL_LOCAL void assign_stepper(bool); // Input type for Taylor map computation. 
using tm_input_t = mdspan>; @@ -548,7 +552,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada ~taylor_adaptive(); - [[nodiscard]] const llvm_state &get_llvm_state() const; + [[nodiscard]] const std::variant &get_llvm_state() const; [[nodiscard]] const taylor_dc_t &get_decomposition() const; diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp index 572ca7655..cbb27b9ac 100644 --- a/src/detail/i_data.cpp +++ b/src/detail/i_data.cpp @@ -119,6 +119,7 @@ void taylor_adaptive::i_data::save(boost::archive::binary_oarchive &ar, unsig ar << m_state; ar << m_time; ar << m_llvm_state; + ar << m_tplt_state; ar << m_dim; ar << m_dc; ar << m_order; @@ -140,6 +141,7 @@ void taylor_adaptive::i_data::load(boost::archive::binary_iarchive &ar, unsig ar >> m_state; ar >> m_time; ar >> m_llvm_state; + ar >> m_tplt_state; ar >> m_dim; ar >> m_dc; ar >> m_order; @@ -168,16 +170,18 @@ void taylor_adaptive::i_data::load(boost::archive::binary_iarchive &ar, unsig // of compact mode. It will be converted into a multi state if needed at a // later stage. 
template -taylor_adaptive::i_data::i_data(llvm_state s) : m_llvm_state(std::move(s)) +taylor_adaptive::i_data::i_data(llvm_state s) + : m_llvm_state(std::move(s)), m_tplt_state(std::get<0>(m_llvm_state).make_similar()) { } template taylor_adaptive::i_data::i_data(const i_data &other) - : m_state(other.m_state), m_time(other.m_time), m_llvm_state(other.m_llvm_state), m_dim(other.m_dim), - m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy), - m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), m_pars(other.m_pars), m_tc(other.m_tc), - m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data) + : m_state(other.m_state), m_time(other.m_time), m_llvm_state(other.m_llvm_state), m_tplt_state(other.m_tplt_state), + m_dim(other.m_dim), m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol), + m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), + m_pars(other.m_pars), m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_vsys(other.m_vsys), + m_tm_data(other.m_tm_data) { // Recover the function pointers. m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); diff --git a/src/taylor_00.cpp b/src/taylor_00.cpp index b3e322a57..7ddc57979 100644 --- a/src/taylor_00.cpp +++ b/src/taylor_00.cpp @@ -8,7 +8,6 @@ #include -#include #include #include #include @@ -95,20 +94,20 @@ number taylor_determine_h_rhofac(llvm_state &s, llvm::Type *fp_t, std::uint32_t } // Helper to generate the LLVM code to determine the timestep in an adaptive Taylor integrator, -// following Jorba's prescription. diff_variant is the output of taylor_compute_jet(), and it contains -// the jet of derivatives for the state variables and the sv_funcs. h_ptr is an external pointer containing -// the clamping values for the timesteps. 
svf_ptr is a pointer to the first element of an LLVM array containing the -// values in sv_funcs_dc. If max_abs_state_ptr is not nullptr, the computed norm infinity of the -// state vector (including sv_funcs, if any) will be written into it (max_abs_state_ptr is an external pointer). -llvm::Value * -taylor_determine_h(llvm_state &s, llvm::Type *fp_t, - const std::variant, std::vector> &diff_variant, - const std::vector &sv_funcs_dc, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - llvm::Value *svf_ptr, llvm::Value *h_ptr, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, - llvm::Value *max_abs_state_ptr) +// following Jorba's prescription. diff_variant is the output of taylor_compute_jet(). h_ptr is an external pointer +// containing the clamping values for the timesteps. svf_ptr is a pointer to the first element of an LLVM array +// containing the values in sv_funcs_dc. If max_abs_state_ptr is not nullptr, the computed norm infinity of the state +// vector (including sv_funcs, if any) will be written into it (max_abs_state_ptr is an external pointer). +// tape_ptr is the pointer to the tape of derivatives in compact mode, or a null pointer otherwise. +llvm::Value *taylor_determine_h(llvm_state &s, llvm::Type *fp_t, + const std::variant, std::vector>, + std::vector> &diff_variant, + const std::vector &sv_funcs_dc, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + llvm::Value *svf_ptr, llvm::Value *h_ptr, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, + std::uint32_t batch_size, llvm::Value *max_abs_state_ptr, llvm::Value *tape_ptr) { assert(batch_size != 0u); #if !defined(NDEBUG) @@ -130,7 +129,8 @@ taylor_determine_h(llvm_state &s, llvm::Type *fp_t, if (diff_variant.index() == 0u) { // Compact mode. 
- auto *diff_arr = std::get<0>(diff_variant).first; + assert(tape_ptr != nullptr); + auto *diff_arr = tape_ptr; // These will end up containing the norm infinity of the state vector + sv_funcs and the // norm infinity of the derivatives at orders order and order - 1. @@ -196,6 +196,7 @@ taylor_determine_h(llvm_state &s, llvm::Type *fp_t, max_abs_diff_om1 = builder.CreateLoad(vec_t, max_abs_diff_om1); } else { // Non-compact mode. + assert(tape_ptr == nullptr); const auto &diff_arr = std::get>(diff_variant); const auto n_sv_funcs = static_cast(sv_funcs_dc.size()); @@ -271,23 +272,24 @@ taylor_determine_h(llvm_state &s, llvm::Type *fp_t, } // Run the Horner scheme to propagate an ODE state via the evaluation of the Taylor polynomials. -// diff_var contains either the derivatives for all u variables (in compact mode) or only -// for the state variables (non-compact mode). The evaluation point (i.e., the timestep) +// diff_var is the output of taylor_compute_jet(). The evaluation point (i.e., the timestep) // is h. The evaluation is run in parallel over the polynomials of all the state -// variables. -std::variant> -taylor_run_multihorner(llvm_state &s, llvm::Type *fp_t, - const std::variant, std::vector> &diff_var, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - llvm::Value *h, std::uint32_t n_eq, std::uint32_t n_uvars, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t order, std::uint32_t batch_size) +// variables. tape_ptr is the pointer to the tape of derivatives in compact mode, or a null pointer otherwise. 
+std::variant> taylor_run_multihorner( + llvm_state &s, llvm::Type *fp_t, + const std::variant, std::vector>, std::vector> + &diff_var, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + llvm::Value *h, std::uint32_t n_eq, std::uint32_t n_uvars, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + std::uint32_t order, std::uint32_t batch_size, llvm::Value *tape_ptr) { auto &builder = s.builder(); if (diff_var.index() == 0u) { // Compact mode. - auto *diff_arr = std::get<0>(diff_var).first; + assert(tape_ptr != nullptr); + auto *diff_arr = tape_ptr; // Create the array storing the results of the evaluation. auto *fp_vec_t = make_vector_type(fp_t, batch_size); @@ -325,6 +327,7 @@ taylor_run_multihorner(llvm_state &s, llvm::Type *fp_t, return res_arr; } else { // Non-compact mode. + assert(tape_ptr == nullptr); const auto &diff_arr = std::get>(diff_var); // Init the return value, filling it with the values of the @@ -347,18 +350,21 @@ taylor_run_multihorner(llvm_state &s, llvm::Type *fp_t, // Same as taylor_run_multihorner(), but instead of the Horner scheme this implementation uses // a compensated summation over the naive evaluation of monomials. -std::variant> -taylor_run_ceval(llvm_state &s, llvm::Type *fp_t, - const std::variant, std::vector> &diff_var, - llvm::Value *h, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, bool, std::uint32_t batch_size) +std::variant> taylor_run_ceval( + llvm_state &s, llvm::Type *fp_t, + const std::variant, std::vector>, std::vector> + &diff_var, + llvm::Value *h, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, bool, std::uint32_t batch_size, + llvm::Value *tape_ptr) { auto &builder = s.builder(); if (diff_var.index() == 0u) { // Compact mode. 
- auto *diff_arr = std::get<0>(diff_var).first; + assert(tape_ptr != nullptr); + auto *diff_arr = tape_ptr; // Create the arrays storing the results of the evaluation and the running compensations. auto *fp_vec_t = make_vector_type(fp_t, batch_size); @@ -416,6 +422,7 @@ taylor_run_ceval(llvm_state &s, llvm::Type *fp_t, return res_arr; } else { // Non-compact mode. + assert(tape_ptr == nullptr); const auto &diff_arr = std::get>(diff_var); // Init the return values with the order-0 monomials, and the running @@ -453,13 +460,15 @@ taylor_run_ceval(llvm_state &s, llvm::Type *fp_t, // Helper to generate the LLVM code to store the Taylor coefficients of the state variables and // the sv funcs into an external array. The Taylor polynomials are stored in row-major order, // first the state variables and after the sv funcs. For use in the adaptive timestepper implementations. -// tc_ptr is an external pointer. -void taylor_write_tc( - llvm_state &s, llvm::Type *fp_t, - const std::variant, std::vector> &diff_variant, - const std::vector &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *tc_ptr, - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size) +// tc_ptr is an external pointer. tape_ptr is the pointer to the tape of derivatives in compact mode, or a null pointer +// otherwise. +void taylor_write_tc(llvm_state &s, llvm::Type *fp_t, + const std::variant, std::vector>, + std::vector> &diff_variant, + const std::vector &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *tc_ptr, + // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) + std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, + llvm::Value *tape_ptr) { // LCOV_EXCL_START assert(batch_size != 0u); @@ -499,8 +508,8 @@ void taylor_write_tc( if (diff_variant.index() == 0u) { // Compact mode. 
- - auto *diff_arr = std::get<0>(diff_variant).first; + assert(tape_ptr != nullptr); + auto *diff_arr = tape_ptr; // Write out the Taylor coefficients for the state variables. llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var) { @@ -546,7 +555,7 @@ void taylor_write_tc( } } else { // Non-compact mode. - + assert(tape_ptr == nullptr); const auto &diff_arr = std::get>(diff_variant); for (std::uint32_t j = 0; j < n_eq + n_sv_funcs; ++j) { @@ -578,12 +587,11 @@ void taylor_write_tc( // propagate the state of the system. Instead, its output will be the jet of derivatives // of all state variables and event equations, and the deduced timestep value(s). // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_fp_t, llvm::Type *fp_t, - const std::string &name, - const std::vector> &sys, - std::uint32_t batch_size, bool compact_mode, - const std::vector &evs, bool high_accuracy, - bool parallel_mode, std::uint32_t order) +std::tuple, std::vector> +taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *fp_t, const std::string &name, + const std::vector> &sys, + std::uint32_t batch_size, bool compact_mode, const std::vector &evs, + bool high_accuracy, bool parallel_mode, std::uint32_t order) { assert(!s.is_compiled()); assert(batch_size != 0u); @@ -603,14 +611,17 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_ auto &md = s.module(); // Prepare the function prototype. The arguments are: + // // - pointer to the output jet of derivative (write only), // - pointer to the current state vector (read only), // - pointer to the parameters (read only), // - pointer to the time value(s) (read only), // - pointer to the array of max timesteps (read & write), - // - pointer to the max_abs_state output variable (write only). 
+ // - pointer to the max_abs_state output variable (write only), + // - pointer to the tape (read & write, compact mode only). + // // These pointers cannot overlap. - const std::vector fargs(6, llvm::PointerType::getUnqual(ext_fp_t)); + const std::vector fargs(compact_mode ? 7 : 6, llvm::PointerType::getUnqual(context)); // The function does not return anything. auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); assert(ft != nullptr); @@ -655,6 +666,14 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_ max_abs_state_ptr->addAttr(llvm::Attribute::NoAlias); max_abs_state_ptr->addAttr(llvm::Attribute::WriteOnly); + llvm::Argument *tape_ptr = nullptr; + if (compact_mode) { + tape_ptr = max_abs_state_ptr + 1; + tape_ptr->setName("tape_ptr"); + tape_ptr->addAttr(llvm::Attribute::NoCapture); + tape_ptr->addAttr(llvm::Attribute::NoAlias); + } + // Create a new basic block to start insertion into. auto *bb = llvm::BasicBlock::Create(context, "entry", f); assert(bb != nullptr); // LCOV_EXCL_LINE @@ -665,50 +684,44 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_ auto *svf_ptr = compact_mode ? taylor_c_make_sv_funcs_arr(s, ev_dc) : nullptr; // Compute the jet of derivatives at the given order. - auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, dc, ev_dc, n_eq, n_uvars, order, - batch_size, compact_mode, high_accuracy, parallel_mode); + auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, tape_ptr, dc, ev_dc, n_eq, n_uvars, + order, batch_size, compact_mode, high_accuracy, parallel_mode); // Determine the integration timestep. auto *h = taylor_determine_h(s, fp_t, diff_variant, ev_dc, svf_ptr, h_ptr, n_eq, n_uvars, order, batch_size, - max_abs_state_ptr); + max_abs_state_ptr, tape_ptr); // Store h to memory. ext_store_vector_to_memory(s, h_ptr, h); // Copy the jet of derivatives to jet_ptr. 
- taylor_write_tc(s, fp_t, diff_variant, ev_dc, svf_ptr, jet_ptr, n_eq, n_uvars, order, batch_size); + taylor_write_tc(s, fp_t, diff_variant, ev_dc, svf_ptr, jet_ptr, n_eq, n_uvars, order, batch_size, tape_ptr); // End the lifetime of the array of derivatives, if we are in compact mode. if (compact_mode) { - builder.CreateLifetimeEnd(std::get<0>(diff_variant).first, - builder.getInt64(get_size(md, std::get<0>(diff_variant).second))); + const auto [sz, al] = std::get<0>(diff_variant).first; + builder.CreateLifetimeEnd(tape_ptr, builder.getInt64(boost::numeric_cast(sz))); } // Create the return value. builder.CreateRetVoid(); - return dc; + if (compact_mode) { + return {std::move(dc), std::move(std::get<0>(diff_variant).first), std::move(std::get<0>(diff_variant).second)}; + } else { + return {std::move(dc), {}, {}}; + } } // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -std::tuple>, std::array> -taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name, +std::tuple, std::vector> +taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name, const std::vector> &sys, std::uint32_t batch_size, bool high_accuracy, bool compact_mode, bool parallel_mode, std::uint32_t order) { - assert(!tplt.is_compiled()); + assert(!s.is_compiled()); assert(batch_size > 0u); - // Setup the return state(s) and fetch the main state. - auto ret_states = [compact_mode, &tplt]() -> std::variant> { - if (compact_mode) { - return std::vector{tplt.make_similar()}; - } else { - return tplt.make_similar(); - } - }(); - auto &s = compact_mode ? std::get<1>(ret_states)[0] : std::get<0>(ret_states); - // Record the number of equations/variables. 
const auto n_eq = boost::numeric_cast(sys.size()); @@ -788,18 +801,19 @@ taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Typ assert(bb != nullptr); builder.SetInsertPoint(bb); - // Compute the jet of derivatives at the given order. + // Generate the code for the computation of the jet of derivatives at the given order. auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, tape_ptr, dc, {}, n_eq, n_uvars, order, batch_size, compact_mode, high_accuracy, parallel_mode); // Determine the integration timestep. auto *h = taylor_determine_h(s, fp_t, diff_variant, sv_funcs_dc, nullptr, h_ptr, n_eq, n_uvars, order, batch_size, - nullptr); + nullptr, tape_ptr); // Evaluate the Taylor polynomials, producing the updated state of the system. auto new_state_var - = high_accuracy ? taylor_run_ceval(s, fp_t, diff_variant, h, n_eq, n_uvars, order, high_accuracy, batch_size) - : taylor_run_multihorner(s, fp_t, diff_variant, h, n_eq, n_uvars, order, batch_size); + = high_accuracy + ? taylor_run_ceval(s, fp_t, diff_variant, h, n_eq, n_uvars, order, high_accuracy, batch_size, tape_ptr) + : taylor_run_multihorner(s, fp_t, diff_variant, h, n_eq, n_uvars, order, batch_size, tape_ptr); // Store the new state. // NOTE: no need to perform overflow check on n_eq * batch_size, @@ -837,7 +851,7 @@ taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Typ [&]() { // tc_ptr is not null: copy the Taylor coefficients // for the state variables. - taylor_write_tc(s, fp_t, diff_variant, {}, nullptr, tc_ptr, n_eq, n_uvars, order, batch_size); + taylor_write_tc(s, fp_t, diff_variant, {}, nullptr, tc_ptr, n_eq, n_uvars, order, batch_size, tape_ptr); }, []() { // Taylor coefficients were not requested, @@ -846,14 +860,18 @@ taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Typ // End the lifetime of the array of derivatives, if we are in compact mode. 
if (compact_mode) { - builder.CreateLifetimeEnd(std::get<0>(diff_variant).first, - builder.getInt64(get_size(md, std::get<0>(diff_variant).second))); + const auto [sz, al] = std::get<0>(diff_variant).first; + builder.CreateLifetimeEnd(tape_ptr, builder.getInt64(boost::numeric_cast(sz))); } // Create the return value. builder.CreateRetVoid(); - return dc; + if (compact_mode) { + return {std::move(dc), std::move(std::get<0>(diff_variant).first), std::move(std::get<0>(diff_variant).second)}; + } else { + return {std::move(dc), {}, {}}; + } } } // namespace detail diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 0b824b892..0a3c03a95 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -100,8 +101,6 @@ namespace // that do not represent state variables) into parallelisable segments. Within a segment, // the definition of a u variable does not depend on any u variable defined within that segment. // NOTE: the hidden deps are not considered as dependencies. -// NOTE: the segments in the return value will contain shallow copies of the -// expressions in dc. std::vector taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t n_eq) { // Log runtime in trace mode. @@ -177,8 +176,8 @@ std::vector taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t } #if !defined(NDEBUG) - // Verify s_dc. + // Verify s_dc. decltype(dc.size()) counter = 0; for (const auto &s : s_dc) { // No segment can be empty. @@ -199,6 +198,7 @@ std::vector taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t } assert(counter == dc.size() - static_cast(n_eq) * 2u); + #endif get_logger()->debug("Taylor decomposition N of segments: {}", s_dc.size()); @@ -502,45 +502,309 @@ void taylor_c_compute_sv_diffs(llvm_state &s, llvm::Type *fp_t, }); } -// Helper to perform the computation of the Taylor derivatives in compact mode across -// multiple LLVM states. 
-auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *main_par_ptr, - llvm::Value *main_time_ptr, llvm::Value *main_tape_ptr, const taylor_dc_t &dc, - const std::vector &s_dc, const std::vector &sv_funcs_dc, - std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, - bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx) +// Helper to create and return the prototype of a driver function for +// the computation of Taylor derivatives in compact mode. s is the llvm state +// in which we are operating, cur_idx the index of the driver. +llvm::Function *taylor_cm_make_driver_proto(llvm_state &s, unsigned cur_idx) { - // TODO implement. - (void)parallel_mode; + auto &builder = s.builder(); + auto &md = s.module(); + auto &ctx = s.context(); + + // The arguments to the driver are: + // - a pointer to the tape, + // - pointers to par and time, + // - the current diff order. + auto *ptr_tp = llvm::PointerType::getUnqual(ctx); + std::vector fargs{ptr_tp, ptr_tp, ptr_tp, builder.getInt32Ty()}; + + // The driver does not return anything. + auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); + assert(ft != nullptr); // LCOV_EXCL_LINE + + // Now create the driver. + const auto cur_name = fmt::format("heyoka.cm_jet.driver_{}", cur_idx); + auto *f = llvm_func_create(ft, llvm::Function::ExternalLinkage, cur_name, &md); + // NOTE: the driver cannot call itself recursively. + f->addFnAttr(llvm::Attribute::NoRecurse); + + // Add the arguments' attributes. + // NOTE: no aliasing is assumed between the pointer + // arguments. 
+ auto *tape_arg = f->args().begin(); + tape_arg->setName("tape_ptr"); + tape_arg->addAttr(llvm::Attribute::NoCapture); + tape_arg->addAttr(llvm::Attribute::NoAlias); + + auto *par_ptr_arg = tape_arg + 1; + par_ptr_arg->setName("par_ptr"); + par_ptr_arg->addAttr(llvm::Attribute::NoCapture); + par_ptr_arg->addAttr(llvm::Attribute::NoAlias); + par_ptr_arg->addAttr(llvm::Attribute::ReadOnly); + + auto *time_ptr_arg = tape_arg + 2; + time_ptr_arg->setName("time_ptr"); + time_ptr_arg->addAttr(llvm::Attribute::NoCapture); + time_ptr_arg->addAttr(llvm::Attribute::NoAlias); + time_ptr_arg->addAttr(llvm::Attribute::ReadOnly); + + auto *order_arg = tape_arg + 3; + order_arg->setName("order"); + + return f; +} - // Generate the global arrays for the computation of the derivatives - // of the state variables in the main state. - const auto svd_gl = taylor_c_make_sv_diff_globals(main_state, main_fp_t, dc, n_uvars); +// Helper to codegen the computation of the Taylor derivatives for a block. +// +// s is the llvm state in which we are operating, func is the LLVM function for the computation of the Taylor +// derivative in the block, ncalls the number of times it must be called, gens the generators for the +// function arguments, tape/par/time_ptr the pointers to the tape/parameter value(s)/time value(s), +// cur_order the order of the derivative, fp_vec_type the internal vector type used for computations, +// n_uvars the total number of u variables. +void taylor_cm_codegen_block_diff(llvm_state &s, llvm::Function *func, std::uint32_t ncalls, const auto &gens, + llvm::Value *tape_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr, + llvm::Value *cur_order, llvm::Type *fp_vec_type, std::uint32_t n_uvars) +{ + // LCOV_EXCL_START + assert(ncalls > 0u); + assert(!gens.empty()); + assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast(f); })); + // LCOV_EXCL_STOP - // Structure used to log, in trace mode, the breakdown of each segment. 
- // For each segment, this structure contains the number of invocations - // of each function in the segment. It will be unused if we are not tracing. - std::vector> segment_bd; + // Fetch the builder for the current state. + auto &bld = s.builder(); + + // We will be manually unrolling loops if ncalls is small enough. + // This seems to help with compilation times. + constexpr auto max_unroll_n = 5u; + + if (ncalls > max_unroll_n) { + // Loop over the number of calls. + llvm_loop_u32(s, bld.getInt32(0), bld.getInt32(ncalls), [&](llvm::Value *cur_call_idx) { + // Create the u variable index from the first generator. + auto u_idx = gens[0](cur_call_idx); + + // Initialise the vector of arguments with which func must be called. The following + // initial arguments are always present: + // - current Taylor order, + // - u index of the variable, + // - tape of derivatives, + // - pointer to the param values, + // - pointer to the time value(s). + std::vector args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr}; + + // Create the other arguments via the generators. + for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { + args.push_back(gens[i](cur_call_idx)); + } - // Are we tracing? - const auto is_tracing = get_logger()->should_log(spdlog::level::trace); + // Calculate the derivative and store the result. + taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args)); + }); + } else { + // The manually-unrolled version of the above. + for (std::uint32_t idx = 0; idx < ncalls; ++idx) { + auto *cur_call_idx = bld.getInt32(idx); + auto u_idx = gens[0](cur_call_idx); + std::vector args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr}; + + for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { + args.push_back(gens[i](cur_call_idx)); + } - // List of evaluation functions in a segment. - // - // This map contains a list of functions for the compact-mode evaluation of Taylor derivatives. 
- // Each function is mapped to a pair, containing: - // - // - the number of times the function is to be invoked, - // - a list of functors (generators) that generate the arguments for - // the invocation. - // - // NOTE: we use maps with name-based comparison for the functions. This ensures that the order in which these - // functions are invoked is always the same. If we used directly pointer - // comparisons instead, the order could vary across different executions and different platforms. The name - // mangling we do when creating the function names should ensure that there are no possible name collisions. - using seg_f_list_t - = std::map>>, - llvm_func_name_compare>; + taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args)); + } + } +} + +// List of evaluation functions in a segment. +// +// This map contains a list of functions for the compact-mode evaluation of Taylor derivatives. +// Each function is mapped to a pair, containing: +// +// - the number of times the function is to be invoked, +// - a list of functors (generators) that generate the arguments for +// the invocation. +// +// NOTE: we use maps with name-based comparison for the functions. This ensures that the order in which these +// functions are invoked is always the same. If we used directly pointer +// comparisons instead, the order could vary across different executions and different platforms. The name +// mangling we do when creating the function names should ensure that there are no possible name collisions. +using taylor_cm_seg_f_list_t + = std::map>>, + llvm_func_name_compare>; + +// Helper to codegen the computation of the Taylor derivatives for a segment. +// +// seg is the segment, start_u_idx the index of the first u variable in the segment, s the llvm state +// we are operating in, fp_t the internal scalar floating-point type, batch_size the batch size, n_uvars +// the total number of u variables, high_accuracy the high accuracy flag. 
+taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint32_t start_u_idx, llvm_state &s, + llvm::Type *fp_t, std::uint32_t batch_size, std::uint32_t n_uvars, + bool high_accuracy) +{ + // Fetch the internal vector type. + auto *fp_vec_type = make_vector_type(fp_t, batch_size); + + // Fetch the current builder. + auto &bld = s.builder(); + + // This structure maps a function to sets of arguments + // with which the function is to be called. For instance, if function + // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map + // will contain {f : [[a, b, c], [d, e, f]]}. + // After construction, we have verified that for each function + // in the map the sets of arguments have all the same size. + // NOTE: again, here and below we use name-based ordered maps for the functions. + // This ensures that the invocations of cm_make_arg_gen_*(), which create several + // global variables, always happen in a well-defined order. If we used an unordered map instead, + // the variables would be created in a "random" order, which would result in a + // unnecessary miss for the in-memory cache machinery when two logically-identical + // LLVM modules are considered different because of the difference in the order + // of declaration of global variables. + std::map>>, llvm_func_name_compare> + tmp_map; + + for (const auto &ex : seg) { + // Get the function for the computation of the derivative. + auto *func = taylor_c_diff_func(s, fp_t, ex.first, n_uvars, batch_size, high_accuracy); + + // Insert the function into tmp_map. + const auto [it, is_new_func] = tmp_map.try_emplace(func); + + assert(is_new_func || !it->second.empty()); // LCOV_EXCL_LINE + + // Convert the variables/constants in the current dc + // element into a set of indices/constants. 
+ const auto cdiff_args = udef_to_variants(ex.first, ex.second); + + // LCOV_EXCL_START + if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) { + throw std::invalid_argument( + fmt::format("Inconsistent arity detected in a Taylor derivative function in compact " + "mode: the same function is being called with both {} and {} arguments", + it->second.back().size() - 1u, cdiff_args.size())); + } + // LCOV_EXCL_STOP + + // Add the new set of arguments. + it->second.emplace_back(); + // Add the idx of the u variable. + it->second.back().emplace_back(start_u_idx); + // Add the actual function arguments. + it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end()); + + // Update start_u_idx. + ++start_u_idx; + } + + // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]} + // to {f : [[a, d], [b, e], [c, f]]}. + std::map, std::vector>>, + llvm_func_name_compare> + tmp_map_transpose; + for (const auto &[func, vv] : tmp_map) { + assert(!vv.empty()); // LCOV_EXCL_LINE + + // Add the function. + const auto [it, ins_status] = tmp_map_transpose.try_emplace(func); + assert(ins_status); // LCOV_EXCL_LINE + + const auto n_calls = vv.size(); + const auto n_args = vv[0].size(); + // NOTE: n_args must be at least 1 because the u idx + // is prepended to the actual function arguments in + // the tmp_map entries. + assert(n_args >= 1u); // LCOV_EXCL_LINE + + for (decltype(vv[0].size()) i = 0; i < n_args; ++i) { + // Build the vector of values corresponding + // to the current argument index. + std::vector> tmp_c_vec; + for (decltype(vv.size()) j = 0; j < n_calls; ++j) { + tmp_c_vec.push_back(vv[j][i]); + } + + // Turn tmp_c_vec (a vector of variants) into a variant + // of vectors, and insert the result. + it->second.push_back(vv_transpose(tmp_c_vec)); + } + } + + // Create the taylor_cm_seg_f_list_t for the current segment. 
+ taylor_cm_seg_f_list_t seg_map; + + for (const auto &[func, vv] : tmp_map_transpose) { + // NOTE: vv.size() is now the number of arguments. We know it cannot + // be zero because the functions to compute the Taylor derivatives + // in compact mode always have at least 1 argument (i.e., the index + // of the u variable whose derivative is being computed). + assert(!vv.empty()); // LCOV_EXCL_LINE + + // Add the function. + const auto [it, ins_status] = seg_map.try_emplace(func); + assert(ins_status); // LCOV_EXCL_LINE + + // Set the number of calls for this function. + it->second.first + = std::visit([](const auto &x) { return boost::numeric_cast(x.size()); }, vv[0]); + assert(it->second.first > 0u); // LCOV_EXCL_LINE + + // Create the g functions for each argument. + for (const auto &v : vv) { + it->second.second.push_back(std::visit( + [&s, fp_t](const auto &x) { + using type = uncvref_t; + + if constexpr (std::is_same_v>) { + return cm_make_arg_gen_vidx(s, x); + } else { + return cm_make_arg_gen_vc(s, fp_t, x); + } + }, + v)); + } + } + + // Fetch the arguments from the driver prototype. + auto *driver_f = bld.GetInsertBlock()->getParent(); + assert(driver_f != nullptr); + assert(driver_f->arg_size() == 4u); + auto *tape_ptr = driver_f->args().begin(); + auto *par_ptr = driver_f->args().begin() + 1; + auto *time_ptr = driver_f->args().begin() + 2; + auto *cur_order = driver_f->args().begin() + 3; + + // Compute the derivatives for this segment. + for (const auto &[func, fpair] : seg_map) { + const auto &[ncalls, gens] = fpair; + + taylor_cm_codegen_block_diff(s, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type, + n_uvars); + } + + return seg_map; +} + +// Helper to codegen the computation of the Taylor derivatives in compact mode via +// driver functions implemented across multiple LLVM states. 
main_state is the state in which the stepper is defined, +// main_fp_t the internal scalar floating-point type as defined in the main state, +// main_par/main_time/main_tape_ptr the parameters/time/tape pointers as defined in the +// main state, dc the Taylor decomposition, s_dc its segmented counterpart, n_eq the number +// of equations/state variables, order the Taylor order, batch_size the batch size, +// high_accuracy the high accuracy flag, parallel_mode the parallel mode flag, max_svf_idx +// the maximum index in the decomposition of the sv funcs (or zero if there are no sv funcs). +// +// The return value is a list of states in which the driver functions have been defined. +std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, + llvm::Value *main_par_ptr, llvm::Value *main_time_ptr, + llvm::Value *main_tape_ptr, const taylor_dc_t &dc, + const std::vector &s_dc, std::uint32_t n_eq, + std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, + bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx) +{ + // TODO implement. + (void)parallel_mode; // Init the list of states. // NOTE: we use lists here because it is convenient to have @@ -557,115 +821,23 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv states.push_back(main_state.make_similar()); auto *cur_state = &states.back(); - // Index of the state we are currently operating on. - boost::safe_numerics::safe cur_state_idx = 0; - - // Helper to create and return the prototype of a driver function in the state s. - auto make_driver_proto = [](llvm_state &s, unsigned cur_idx) { - auto &builder = s.builder(); - auto &md = s.module(); - auto &ctx = s.context(); - - // The arguments to the driver are: - // - a pointer to the tape, - // - pointers to par and time, - // - the current diff order. 
- auto *ptr_tp = llvm::PointerType::getUnqual(ctx); - std::vector fargs{ptr_tp, ptr_tp, ptr_tp, builder.getInt32Ty()}; - - // The driver does not return anything. - auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false); - assert(ft != nullptr); // LCOV_EXCL_LINE - - // Now create the driver. - const auto cur_name = fmt::format("heyoka.cm_jet.driver_{}", cur_idx); - auto *f = llvm_func_create(ft, llvm::Function::ExternalLinkage, cur_name, &md); - // NOTE: the driver cannot call itself recursively. - f->addFnAttr(llvm::Attribute::NoRecurse); - - // Add the arguments' attributes. - // NOTE: no aliasing is assumed between the pointer - // arguments. - auto *tape_arg = f->args().begin(); - tape_arg->setName("tape_ptr"); - tape_arg->addAttr(llvm::Attribute::NoCapture); - tape_arg->addAttr(llvm::Attribute::NoAlias); - - auto *par_ptr_arg = tape_arg + 1; - par_ptr_arg->setName("par_ptr"); - par_ptr_arg->addAttr(llvm::Attribute::NoCapture); - par_ptr_arg->addAttr(llvm::Attribute::NoAlias); - par_ptr_arg->addAttr(llvm::Attribute::ReadOnly); - - auto *time_ptr_arg = tape_arg + 2; - time_ptr_arg->setName("time_ptr"); - time_ptr_arg->addAttr(llvm::Attribute::NoCapture); - time_ptr_arg->addAttr(llvm::Attribute::NoAlias); - time_ptr_arg->addAttr(llvm::Attribute::ReadOnly); - - return f; - }; + // Generate the global arrays for the computation of the derivatives + // of the state variables in the main state. + const auto svd_gl = taylor_c_make_sv_diff_globals(main_state, main_fp_t, dc, n_uvars); - // TODO doc fix. - // Helper to compute the Taylor derivatives for a block. - // func is the LLVM function for the computation of the Taylor derivative in the block, - // ncalls the number of times it must be called, gens the generators for the - // function arguments and cur_order the order of the derivative. s is the llvm state - // in which we are computing the derivatives. 
- auto block_diff = [n_uvars](llvm_state &s, llvm::Function *func, std::uint32_t ncalls, const auto &gens, - llvm::Value *tape_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr, - llvm::Value *cur_order, llvm::Type *fp_vec_type) { - // LCOV_EXCL_START - assert(ncalls > 0u); - assert(!gens.empty()); - assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast(f); })); - // LCOV_EXCL_STOP + // Structure used to log, in trace mode, the breakdown of each segment. + // For each segment, this structure contains the number of invocations + // of each function in the segment. It will be unused if we are not tracing. + std::vector> segment_bd; - // Fetch the builder for the current state. - auto &bld = s.builder(); - - // We will be manually unrolling loops if ncalls is small enough. - // This seems to help with compilation times. - constexpr auto max_unroll_n = 5u; - - if (ncalls > max_unroll_n) { - // Loop over the number of calls. - llvm_loop_u32(s, bld.getInt32(0), bld.getInt32(ncalls), [&](llvm::Value *cur_call_idx) { - // Create the u variable index from the first generator. - auto u_idx = gens[0](cur_call_idx); - - // Initialise the vector of arguments with which func must be called. The following - // initial arguments are always present: - // - current Taylor order, - // - u index of the variable, - // - tape of derivatives, - // - pointer to the param values, - // - pointer to the time value(s). - std::vector args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr}; - - // Create the other arguments via the generators. - for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { - args.push_back(gens[i](cur_call_idx)); - } + // Are we tracing? + const auto is_tracing = get_logger()->should_log(spdlog::level::trace); - // Calculate the derivative and store the result. - taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args)); - }); - } else { - // The manually-unrolled version of the above. 
- for (std::uint32_t idx = 0; idx < ncalls; ++idx) { - auto *cur_call_idx = bld.getInt32(idx); - auto u_idx = gens[0](cur_call_idx); - std::vector args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr}; - - for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { - args.push_back(gens[i](cur_call_idx)); - } + // Do we need to compute the last-order derivatives for the sv_funcs? + const auto need_svf_lo = max_svf_idx >= n_eq; - taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args)); - } - } - }; + // Index of the state we are currently operating on. + boost::safe_numerics::safe cur_state_idx = 0; // NOTE: unlike in compiled functions, we cannot at the same time // declare and invoke the drivers from the main module as the invocation @@ -677,12 +849,17 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv // Declarations of the drivers in the main state. std::vector main_driver_decls; // Add the declaration for the first driver. - main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx)); + main_driver_decls.push_back(taylor_cm_make_driver_proto(main_state, cur_state_idx)); + + // The driver function for the evaluation of the segment + // containing max_svf_idx. Will remain null if we do not need + // to compute the last-order derivatives for the sv funcs. + llvm::Function *max_svf_driver = nullptr; // Add the driver declaration to the current state, // and start insertion into the driver. - cur_state->builder().SetInsertPoint( - llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx))); + cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create( + cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx))); // Variable to keep track of how many blocks have been codegenned // in the current state. 
@@ -693,175 +870,81 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv // needs more investigation. constexpr auto max_n_cg_blocks = 20u; - // Variable to keep track of the u variable - // on whose definition we are operating. - auto cur_u_idx = n_eq; + // Variable to keep track of the index of the first u variable + // in a segment. + auto start_u_idx = n_eq; - // Iterate over the segments in s_dc. - for (const auto &seg : s_dc) { - if (n_cg_blocks > max_n_cg_blocks) { - // We have codegenned enough blocks for this state. Create the return - // value for the current driver, and move to the next one. - cur_state->builder().CreateRetVoid(); - - // Create the new current state. - states.push_back(main_state.make_similar()); - cur_state = &states.back(); - - // Reset/update the counters. - n_cg_blocks = 0; - ++cur_state_idx; - - // Add the driver declaration to the main state. - main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx)); - - // Add the driver declaration to the current state, - // and start insertion into the driver. - cur_state->builder().SetInsertPoint( - llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx))); - } - - // Fetch the internal fp type and its vector counterpart for the current state. - auto *fp_t = llvm_clone_type(*cur_state, main_fp_t); - auto *fp_vec_type = make_vector_type(fp_t, batch_size); - - // Fetch the current builder. - auto &cur_builder = cur_state->builder(); - - // This structure maps a function to sets of arguments - // with which the function is to be called. For instance, if function - // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map - // will contain {f : [[a, b, c], [d, e, f]]}. - // After construction, we have verified that for each function - // in the map the sets of arguments have all the same size. - // NOTE: again, here and below we use name-based ordered maps for the functions. 
- // This ensures that the invocations of cm_make_arg_gen_*(), which create several - // global variables, always happen in a well-defined order. If we used an unordered map instead, - // the variables would be created in a "random" order, which would result in a - // unnecessary miss for the in-memory cache machinery when two logically-identical - // LLVM modules are considered different because of the difference in the order - // of declaration of global variables. - std::map>>, - llvm_func_name_compare> - tmp_map; - - for (const auto &ex : seg) { - // Get the function for the computation of the derivative. - auto *func = taylor_c_diff_func(*cur_state, fp_t, ex.first, n_uvars, batch_size, high_accuracy); - - // Insert the function into tmp_map. - const auto [it, is_new_func] = tmp_map.try_emplace(func); - - assert(is_new_func || !it->second.empty()); // LCOV_EXCL_LINE - - // Convert the variables/constants in the current dc - // element into a set of indices/constants. - const auto cdiff_args = udef_to_variants(ex.first, ex.second); - - // LCOV_EXCL_START - if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) { - throw std::invalid_argument( - fmt::format("Inconsistent arity detected in a Taylor derivative function in compact " - "mode: the same function is being called with both {} and {} arguments", - it->second.back().size() - 1u, cdiff_args.size())); - } - // LCOV_EXCL_STOP + // Helper to finalise the current driver function and create a new one. + auto start_new_driver = [&cur_state, &states, &main_state, &n_cg_blocks, &cur_state_idx, &main_driver_decls]() { + // Finalise the current driver. + cur_state->builder().CreateRetVoid(); - // Add the new set of arguments. - it->second.emplace_back(); - // Add the idx of the u variable. - it->second.back().emplace_back(cur_u_idx); - // Add the actual function arguments. - it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end()); + // Create the new current state. 
+ states.push_back(main_state.make_similar()); + cur_state = &states.back(); - ++cur_u_idx; - } + // Reset/update the counters. + n_cg_blocks = 0; + ++cur_state_idx; - // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]} - // to {f : [[a, d], [b, e], [c, f]]}. - std::map, std::vector>>, - llvm_func_name_compare> - tmp_map_transpose; - for (const auto &[func, vv] : tmp_map) { - assert(!vv.empty()); // LCOV_EXCL_LINE - - // Add the function. - const auto [it, ins_status] = tmp_map_transpose.try_emplace(func); - assert(ins_status); // LCOV_EXCL_LINE - - const auto n_calls = vv.size(); - const auto n_args = vv[0].size(); - // NOTE: n_args must be at least 1 because the u idx - // is prepended to the actual function arguments in - // the tmp_map entries. - assert(n_args >= 1u); // LCOV_EXCL_LINE - - for (decltype(vv[0].size()) i = 0; i < n_args; ++i) { - // Build the vector of values corresponding - // to the current argument index. - std::vector> tmp_c_vec; - for (decltype(vv.size()) j = 0; j < n_calls; ++j) { - tmp_c_vec.push_back(vv[j][i]); - } + // Add the driver declaration to the main state. + main_driver_decls.push_back(taylor_cm_make_driver_proto(main_state, cur_state_idx)); - // Turn tmp_c_vec (a vector of variants) into a variant - // of vectors, and insert the result. - it->second.push_back(vv_transpose(tmp_c_vec)); - } - } + // Add the driver declaration to the current state, + // and start insertion into the driver. + cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create( + cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx))); + }; - // Create the seg_f_list_t for the current segment. - seg_f_list_t seg_map; - - for (const auto &[func, vv] : tmp_map_transpose) { - // NOTE: vv.size() is now the number of arguments. 
We know it cannot - // be zero because the functions to compute the Taylor derivatives - // in compact mode always have at least 1 argument (i.e., the index - // of the u variable whose derivative is being computed). - assert(!vv.empty()); // LCOV_EXCL_LINE - - // Add the function. - const auto [it, ins_status] = seg_map.try_emplace(func); - assert(ins_status); // LCOV_EXCL_LINE - - // Set the number of calls for this function. - it->second.first - = std::visit([](const auto &x) { return boost::numeric_cast(x.size()); }, vv[0]); - assert(it->second.first > 0u); // LCOV_EXCL_LINE - - // Create the g functions for each argument. - for (const auto &v : vv) { - it->second.second.push_back(std::visit( - [cur_state, fp_t](const auto &x) { - using type = uncvref_t; - - if constexpr (std::is_same_v>) { - return cm_make_arg_gen_vidx(*cur_state, x); - } else { - return cm_make_arg_gen_vc(*cur_state, fp_t, x); - } - }, - v)); + // Iterate over the segments in s_dc and codegen the code for the + // computation of Taylor derivatives. + for (const auto &seg : s_dc) { + // Cache the number of expressions in the segment. + const auto seg_n_ex = static_cast(seg.size()); + + // Are we in the segment containing max_svf_idx? We are if: + // + // - we need to compute the last-order derivatives of the sv funcs, + // - max_svf_idx is somewhere within this segment. + // + // In such a case, we create a driver specifically for this segment, which we will + // invoke again at the end of this function to compute the last-order derivatives + // of the sv funcs. + const auto is_svf_seg = need_svf_lo && max_svf_idx >= start_u_idx && max_svf_idx < (start_u_idx + seg_n_ex); + + if (n_cg_blocks > max_n_cg_blocks || is_svf_seg) { + // Either we have codegenned enough blocks for this state, or we are + // in the max_svf_idx state. Finalise the current driver and start the new one. + start_new_driver(); + + // Assign max_svf_driver if needed. 
+ if (is_svf_seg) { + assert(max_svf_driver == nullptr); + max_svf_driver = main_driver_decls.back(); } } - // Fetch the arguments from the driver prototype. - auto *driver_f = cur_builder.GetInsertBlock()->getParent(); - auto *tape_ptr = driver_f->args().begin(); - auto *par_ptr = driver_f->args().begin() + 1; - auto *time_ptr = driver_f->args().begin() + 2; - auto *cur_order = driver_f->args().begin() + 3; - - // Compute the derivatives for this segment. - for (const auto &[func, fpair] : seg_map) { - const auto &[ncalls, gens] = fpair; + // Fetch the internal fp type for the current state. + auto *fp_t = llvm_clone_type(*cur_state, main_fp_t); - block_diff(*cur_state, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type); - } + // Codegen the computation of the derivatives for this segment. + const auto seg_map + = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars, high_accuracy); // Update the number of codegenned blocks. n_cg_blocks += seg_map.size(); + // Update start_u_idx. + start_u_idx += seg_n_ex; + + // If we codegenned the max_svf_idx driver, start immediately a new driver. + // We want the max_svf_idx driver to contain the codegen for a single segment + // and nothing more, otherwise we end up doing unnecessary work when computing + // the last-order derivatives of the sv funcs. + if (is_svf_seg) { + start_new_driver(); + } + // LCOV_EXCL_START // Update segment_bd if needed. if (is_tracing) { @@ -907,40 +990,43 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv taylor_c_compute_sv_diffs(main_state, main_fp_t, svd_gl, main_tape_ptr, main_par_ptr, n_uvars, main_bld.getInt32(order), batch_size); - // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs + // Finally, we compute the last-order derivatives for the sv_funcs, if needed. 
Because the sv funcs // correspond to u variables somewhere in the decomposition, we will have to compute the // last-order derivatives of the u variables until we are sure all sv_funcs derivatives // have been properly computed. - if (max_svf_idx >= n_eq) { - // Monitor the starting index of the current - // segment while iterating on the segments. - auto cur_start_u_idx = n_eq; - - for (decltype(s_dc.size()) seg_idx = 0; seg_idx < s_dc.size(); ++seg_idx) { - if (cur_start_u_idx > max_svf_idx) { - // We computed all the necessary derivatives, break out. - break; - } + if (need_svf_lo) { + assert(max_svf_driver != nullptr); - // Invoke the driver for the current segment. - main_bld.CreateCall(main_driver_decls[seg_idx], - {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(order)}); + // What we do here is to iterate over all the drivers, invoke them one by one, + // and break out when we have detected max_svf_driver. + for (auto *cur_driver_f : main_driver_decls) { + main_bld.CreateCall(cur_driver_f, {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(order)}); - // Update cur_start_u_idx. - cur_start_u_idx += static_cast(s_dc[seg_idx].size()); + if (cur_driver_f == max_svf_driver) { + break; + } } } + + // Return the states. + // NOTE: in C++23 we could use std::ranges::views::as_rvalue instead of + // the custom transform: + // + // https://en.cppreference.com/w/cpp/ranges/as_rvalue_view + auto sview = states | std::views::transform([](auto &s) -> auto && { return std::move(s); }); + return std::vector(std::ranges::begin(sview), std::ranges::end(sview)); } // Helper for the computation of a jet of derivatives in compact mode, -// used in taylor_compute_jet(). The return value are the size/alignment -// requirements for the tape of derivatives. All LLVM values and types -// passed to this function are defined in the main state. -std::array taylor_compute_jet_compact_mode( +// used in taylor_compute_jet(). 
The return values are the size/alignment +// requirements for the tape of derivatives and the list of states in which +// the drivers are implemented. All LLVM values and types passed to this function are defined in the main state. +std::pair, std::vector> taylor_compute_jet_compact_mode( // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, - llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector &sv_funcs_dc, std::uint32_t n_eq, - std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode) + llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *order0, llvm::Value *main_par_ptr, + llvm::Value *main_time_ptr, llvm::Value *main_tape_ptr, const taylor_dc_t &dc, + const std::vector &sv_funcs_dc, std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, + std::uint32_t batch_size, bool high_accuracy, bool parallel_mode) { auto &main_bld = main_state.builder(); auto &main_md = main_state.module(); @@ -990,9 +1076,9 @@ std::array taylor_compute_jet_compact_mode( // lifetime of tape_ptr begins here and ends at the end of the function, // so that LLVM can assume that any value stored in it cannot be possibly // used outside this function. - main_bld.CreateLifetimeStart(tape_ptr, main_bld.getInt64(tape_sz)); + main_bld.CreateLifetimeStart(main_tape_ptr, main_bld.getInt64(tape_sz)); - // Copy over the order-0 derivatives of the state variables. + // Copy the order-0 derivatives of the state variables into the tape. // NOTE: overflow checking is already done in the parent function. llvm_loop_u32(main_state, main_bld.getInt32(0), main_bld.getInt32(n_eq), [&](llvm::Value *cur_var_idx) { // Fetch the pointer from order0. @@ -1003,13 +1089,17 @@ std::array taylor_compute_jet_compact_mode( auto *vec = ext_load_vector_from_memory(main_state, main_fp_t, ptr, batch_size); // Store into tape_ptr. 
- taylor_c_store_diff(main_state, main_fp_vec_t, tape_ptr, n_uvars, main_bld.getInt32(0), cur_var_idx, vec); + taylor_c_store_diff(main_state, main_fp_vec_t, main_tape_ptr, n_uvars, main_bld.getInt32(0), cur_var_idx, vec); }); + // Codegen the computation of the Taylor derivatives across multiple states. + auto states = taylor_compute_jet_multi(main_state, main_fp_t, main_par_ptr, main_time_ptr, main_tape_ptr, dc, s_dc, + n_eq, n_uvars, order, batch_size, high_accuracy, parallel_mode, max_svf_idx); + get_logger()->trace("Taylor IR creation compact mode runtime: {}", sw); - // Return the array of derivatives of the u variables and its type. - return std::make_pair(diff_arr, static_cast(diff_array_type)); + // Return the tape size/alignment and the list of states containing the drivers. + return std::make_pair(std::array{tape_sz, tape_al}, std::move(states)); } // Given an input pointer 'in', load the first n * batch_size values in it as n vectors @@ -1054,10 +1144,11 @@ auto taylor_load_values(llvm_state &s, llvm::Type *fp_t, llvm::Value *in, std::u // order0, par_ptr and time_ptr are all external pointers. // // The return value is a variant containing either: -// - in compact mode, the size/alignment requirements for the tape of derivatives, -// - otherwise, the jet of derivatives of the state variables and sv_funcs +// - in compact mode, the size/alignment requirements for the tape of derivatives +// and the list of states in which the driver functions are implemented, or +// - the jet of derivatives of the state variables and sv_funcs // up to order 'order'. 
-std::variant, std::vector> +std::variant, std::vector>, std::vector> taylor_compute_jet(llvm_state &s, llvm::Type *fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr, llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector &sv_funcs_dc, std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp index ffaa6b5f2..5416a9278 100644 --- a/src/taylor_adaptive.cpp +++ b/src/taylor_adaptive.cpp @@ -43,6 +43,7 @@ #endif #include +#include #include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -174,13 +176,13 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, std::vector ntes, bool parallel_mode, [[maybe_unused]] std::optional prec) { - HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_pars); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_time); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_last_h); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tol); @@ -191,6 +193,8 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_vsys); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tm_data); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape_sa); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape); // NOTE: this must hold because tol == 0 is interpreted // as undefined in finalise_ctor(). @@ -394,7 +398,8 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, auto *fp_t = detail::internal_llvm_type_like(std::get<0>(m_llvm_state), m_tol); // The state(s) which will be returned by the construction of the stepper function. 
- std::variant> states; + // If we are not in compact mode, this vector will remain empty. + std::vector states; // Add the stepper function. if (with_events) { @@ -407,11 +412,10 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, ee.push_back(ev.get_expression()); } - std::tie(m_dc, states) - = detail::taylor_add_adaptive_step_with_events(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step_e", sys, 1, - compact_mode, ee, high_accuracy, parallel_mode, m_order); + std::tie(m_dc, m_tape_sa, states) = detail::taylor_add_adaptive_step_with_events( + std::get<0>(m_llvm_state), fp_t, "step_e", sys, 1, compact_mode, ee, high_accuracy, parallel_mode, m_order); } else { - std::tie(m_dc, states) + std::tie(m_dc, m_tape_sa, states) = detail::taylor_add_adaptive_step(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step", sys, 1, high_accuracy, compact_mode, parallel_mode, m_order); } @@ -440,27 +444,43 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, // Log runtimes in trace mode. spdlog::stopwatch sw; - // Add the function for the computation of - // the dense output. - detail::taylor_add_d_out_function(m_llvm, fp_t, m_dim, m_order, 1, high_accuracy); + // Add the function for the computation of the dense output. + // NOTE: in compact mode, the dense output function will be added to the main state. + detail::taylor_add_d_out_function(std::get<0>(m_llvm_state), fp_t, m_dim, m_order, 1, high_accuracy); detail::get_logger()->trace("Taylor dense output runtime: {}", sw); sw.reset(); - // Run the jit. - m_llvm.compile(); + // Run the jit compilation. + if (compact_mode) { + // Add the main state to the list of states. + states.push_back(std::move(std::get<0>(m_llvm_state))); - detail::get_logger()->trace("Taylor LLVM compilation runtime: {}", sw); + // Reverse the list of states so that we start with the + // compilation of the main state first, which may be bigger. + std::ranges::reverse(states); - // Fetch the stepper. 
- if (with_events) { - m_step_f = reinterpret_cast(m_llvm.jit_lookup("step_e")); + // Create the multi state and assign it. + m_llvm_state = llvm_multi_state(std::move(states)); + + // Compile. + std::get<1>(m_llvm_state).compile(); + + // Create the storage for the tape of derivatives. + const auto [sz, al] = m_tape_sa; + m_tape = detail::make_aligned_buffer(sz, al); } else { - m_step_f = reinterpret_cast(m_llvm.jit_lookup("step")); + std::get<0>(m_llvm_state).compile(); } + detail::get_logger()->trace("Taylor LLVM compilation runtime: {}", sw); + + // Fetch the stepper. + assign_stepper(with_events); + // Fetch the function to compute the dense output. - m_d_out_f = reinterpret_cast(m_llvm.jit_lookup("d_out_f")); + m_d_out_f = std::visit( + [](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); // Setup the vector for the Taylor coefficients. using su32_t = boost::safe_numerics::safe; @@ -489,27 +509,27 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, #endif // Init the event data structure if needed. - // NOTE: this can be done in parallel with the rest of the constructor, + // NOTE: in principle this can be done in parallel with the rest of the constructor, // once we have m_order/m_dim and we are done using tes/ntes. if (with_events) { - m_ed_data = std::make_unique(m_llvm.make_similar(), std::move(tes), std::move(ntes), m_order, m_dim, - m_state[0]); + m_ed_data = std::make_unique(m_tplt_state.make_similar(), std::move(tes), std::move(ntes), m_order, + m_dim, m_state[0]); } if (auto_ic_setup) { // Finish the automatic setup of the ics for a variational // integrator. 
- detail::setup_variational_ics_t0(m_llvm, m_state, m_pars, &m_time.hi, std::get<1>(vsys), 1, m_high_accuracy, - m_compact_mode); + detail::setup_variational_ics_t0(m_tplt_state, m_state, m_pars, &m_time.hi, std::get<1>(vsys), 1, + m_high_accuracy, m_compact_mode); } if (is_variational) { #if defined(HEYOKA_HAVE_REAL) if constexpr (std::is_same_v) { - m_tm_data.emplace(std::get<1>(vsys), static_cast(this->get_prec()), m_llvm, 1); + m_tm_data.emplace(std::get<1>(vsys), static_cast(this->get_prec()), m_tplt_state, 1); } else { #endif - m_tm_data.emplace(std::get<1>(vsys), 0, m_llvm, 1); + m_tm_data.emplace(std::get<1>(vsys), 0, m_tplt_state, 1); #if defined(HEYOKA_HAVE_REAL) } #endif @@ -548,11 +568,7 @@ taylor_adaptive::taylor_adaptive(const taylor_adaptive &other) : base_t(static_cast(other)), m_i_data(std::make_unique(*other.m_i_data)), m_ed_data(other.m_ed_data ? std::make_unique(*other.m_ed_data) : nullptr) { - if (m_ed_data) { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step_e")); - } else { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step")); - } + assign_stepper(static_cast(m_ed_data)); } template @@ -615,12 +631,8 @@ void taylor_adaptive::load_impl(Archive &ar, unsigned version) ar >> m_i_data; ar >> m_ed_data; - // Recover the function pointers. - if (m_ed_data) { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step_e")); - } else { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step")); - } + // Recover the stepper. + assign_stepper(static_cast(m_ed_data)); // LCOV_EXCL_START } catch (...) { // Reset to def-cted state in case of exceptions. 
@@ -706,14 +718,20 @@ std::tuple taylor_adaptive::step_impl(T max_delta_t, bool HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape); auto h = max_delta_t; - if (m_step_f.index() == 0u) { + if (m_step_f.index() == 0u || m_step_f.index() == 2u) { assert(!m_ed_data); // LCOV_EXCL_LINE // Invoke the vanilla stepper. - std::get<0>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr); + if (m_step_f.index() == 0u) { + std::get<0>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr); + } else { + std::get<2>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr, + m_tape.get()); + } // Update the time. m_time += h; @@ -737,7 +755,12 @@ std::tuple taylor_adaptive::step_impl(T max_delta_t, bool // Invoke the stepper for event handling. We will record the norm infinity of the state vector + // event equations at the beginning of the timestep for later use. auto max_abs_state = detail::num_zero_like(h); - std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state); + if (m_step_f.index() == 1u) { + std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state); + } else { + std::get<3>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state, + m_tape.get()); + } // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for // automatic cooldown deduction. 
If max_abs_state is not finite, set it to inf so that @@ -1043,7 +1066,7 @@ taylor_adaptive::propagate_until_impl(detail::dfloat t, std::size_t max_st HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order); - HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy); // Check the current time. @@ -1129,7 +1152,7 @@ taylor_adaptive::propagate_until_impl(detail::dfloat t, std::size_t max_st } // Construct the return value. - continuous_output ret(m_llvm.make_similar()); + continuous_output ret(m_tplt_state.make_similar()); // Fill in the data. ret.m_tcs = std::move(c_out_tcs); @@ -1661,9 +1684,9 @@ taylor_adaptive::propagate_grid_impl(std::vector grid, std::size_t max_ste } template -const llvm_state &taylor_adaptive::get_llvm_state() const +const std::variant &taylor_adaptive::get_llvm_state() const { - return m_i_data->m_llvm; + return m_i_data->m_llvm_state; } template @@ -1894,6 +1917,29 @@ void taylor_adaptive::check_variational(const char *fname) const } } +// Helper to fetch the stepper function from m_llvm_state. 
+template +void taylor_adaptive::assign_stepper(bool with_events) +{ + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state); + + if (with_events) { + if (m_compact_mode) { + m_step_f = reinterpret_cast(std::get<1>(m_llvm_state).jit_lookup("step_e")); + } else { + m_step_f = reinterpret_cast(std::get<0>(m_llvm_state).jit_lookup("step_e")); + } + } else { + if (m_compact_mode) { + m_step_f = reinterpret_cast(std::get<1>(m_llvm_state).jit_lookup("step")); + } else { + m_step_f = reinterpret_cast(std::get<0>(m_llvm_state).jit_lookup("step")); + } + } +} + template const std::vector &taylor_adaptive::get_vargs() const { From 0744c61387606ab83e9845638aadb122dfa75e11 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Mon, 26 Aug 2024 15:16:08 +0200 Subject: [PATCH 04/30] [skip ci] From 94ef42a76ba06d9fdcce945a5d916373c045cee8 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Mon, 26 Aug 2024 16:53:42 +0200 Subject: [PATCH 05/30] Implement parallel compilation for batch-mode integrators too, test adaptations. 
--- include/heyoka/detail/i_data.hpp | 22 ++++- include/heyoka/taylor.hpp | 3 +- src/detail/i_data.cpp | 54 ++++++++++-- src/taylor_adaptive_batch.cpp | 143 +++++++++++++++++++++---------- test/llvm_state.cpp | 2 +- test/logical.cpp | 18 ++-- test/model_nbody.cpp | 8 +- test/opt_checks.cpp | 25 +++--- test/rel.cpp | 8 +- test/select.cpp | 12 +-- test/taylor_adaptive.cpp | 4 +- test/taylor_adaptive_batch.cpp | 6 +- test/taylor_adaptive_mp.cpp | 2 +- test/taylor_atan2.cpp | 8 +- test/taylor_kepE.cpp | 8 +- test/taylor_kepF.cpp | 16 ++-- test/taylor_kepF_mp.cpp | 3 +- test/taylor_pow.cpp | 32 +++---- test/taylor_prod.cpp | 12 +-- test/taylor_relu.cpp | 20 ++--- test/taylor_relu_mp.cpp | 10 +-- test/taylor_square.cpp | 16 ++-- test/test_utils.hpp | 14 +++ 23 files changed, 283 insertions(+), 163 deletions(-) diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp index be49f7f49..f86c49034 100644 --- a/include/heyoka/detail/i_data.hpp +++ b/include/heyoka/detail/i_data.hpp @@ -145,8 +145,12 @@ struct taylor_adaptive_batch::i_data { std::vector m_state; // Times. std::vector m_time_hi, m_time_lo; - // The LLVM machinery. - llvm_state m_llvm; + // The LLVM (multi)state. + std::variant m_llvm_state; + // A template LLVM state we keep around to create states + // similar to m_llvm_state as needed. This is created with the + // same settings as m_llvm_state. + llvm_state m_tplt_state; // Dimension of the system. std::uint32_t m_dim{}; // Taylor decomposition. @@ -159,10 +163,18 @@ struct taylor_adaptive_batch::i_data { bool m_high_accuracy{}; // Compact mode. bool m_compact_mode{}; - // The steppers. + // The stepper types (non-compact mode). using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept; using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept; - std::variant m_step_f; + // The stepper types (compact mode). These have an additional argument - the tape pointer. 
+ using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept; + using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept; + // The stepper. + std::variant m_step_f; + // Size/alignment for the compact mode tape. + std::array m_tape_sa{}; + // Compact mode tape. + detail::aligned_buffer_t m_tape; // The vector of parameters. std::vector m_pars; // The vector for the Taylor coefficients. @@ -221,6 +233,8 @@ struct taylor_adaptive_batch::i_data { i_data &operator=(i_data &&) noexcept = delete; ~i_data(); + + void init_cm_tape(); }; HEYOKA_END_NAMESPACE diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp index 39498c1e1..b3de93017 100644 --- a/include/heyoka/taylor.hpp +++ b/include/heyoka/taylor.hpp @@ -909,6 +909,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch explicit taylor_adaptive_batch(private_ctor_t, llvm_state); HEYOKA_DLL_LOCAL void check_variational(const char *) const; + HEYOKA_DLL_LOCAL void assign_stepper(bool); // Input type for Taylor map computation. using tm_input_t = mdspan>; @@ -952,7 +953,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch ~taylor_adaptive_batch(); - [[nodiscard]] const llvm_state &get_llvm_state() const; + [[nodiscard]] const std::variant &get_llvm_state() const; [[nodiscard]] const taylor_dc_t &get_decomposition() const; diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp index cbb27b9ac..45e431c9a 100644 --- a/src/detail/i_data.cpp +++ b/src/detail/i_data.cpp @@ -217,6 +217,25 @@ HEYOKA_TAYLOR_ADAPTIVE_I_DATA_INST(mppp::real) #undef HEYOKA_TAYLOR_ADAPTIVE_I_DATA_INST +// Helper to initialise the compact-mode tape. Assumes an empty tape. 
+template +void taylor_adaptive_batch::i_data::init_cm_tape() +{ + assert(!m_tape); + + const auto [sz, al] = m_tape_sa; + + if (m_compact_mode) { + assert(sz != 0u); + assert(al != 0u); + + m_tape = detail::make_aligned_buffer(sz, al); + } else { + assert(sz == 0u); + assert(al == 0u); + } +} + template void taylor_adaptive_batch::i_data::save(boost::archive::binary_oarchive &ar, unsigned) const { @@ -224,13 +243,15 @@ void taylor_adaptive_batch::i_data::save(boost::archive::binary_oarchive &ar, ar << m_state; ar << m_time_hi; ar << m_time_lo; - ar << m_llvm; + ar << m_llvm_state; + ar << m_tplt_state; ar << m_dim; ar << m_dc; ar << m_order; ar << m_tol; ar << m_high_accuracy; ar << m_compact_mode; + ar << m_tape_sa; ar << m_pars; ar << m_tc; ar << m_last_h; @@ -262,13 +283,15 @@ void taylor_adaptive_batch::i_data::load(boost::archive::binary_iarchive &ar, ar >> m_state; ar >> m_time_hi; ar >> m_time_lo; - ar >> m_llvm; + ar >> m_llvm_state; + ar >> m_tplt_state; ar >> m_dim; ar >> m_dc; ar >> m_order; ar >> m_tol; ar >> m_high_accuracy; ar >> m_compact_mode; + ar >> m_tape_sa; ar >> m_pars; ar >> m_tc; ar >> m_last_h; @@ -293,20 +316,31 @@ void taylor_adaptive_batch::i_data::load(boost::archive::binary_iarchive &ar, ar >> m_tm_data; // Recover the function pointers. - m_d_out_f = reinterpret_cast(m_llvm.jit_lookup("d_out_f")); + m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); + + // Reconstruct the compact mode tape, if necessary. + m_tape.reset(); + init_cm_tape(); } +// NOTE: this ctor provides only partial initialisation of the data members. +// The rest of the initialisation is performed from the integrator ctor. +// NOTE: m_llvm_state is inited as a single llvm_state regardless of the use +// of compact mode. It will be converted into a multi state if needed at a +// later stage. 
template -taylor_adaptive_batch::i_data::i_data(llvm_state s) : m_llvm(std::move(s)) +taylor_adaptive_batch::i_data::i_data(llvm_state s) + : m_llvm_state(std::move(s)), m_tplt_state(std::get<0>(m_llvm_state).make_similar()) { } template taylor_adaptive_batch::i_data::i_data(const i_data &other) : m_batch_size(other.m_batch_size), m_state(other.m_state), m_time_hi(other.m_time_hi), m_time_lo(other.m_time_lo), - m_llvm(other.m_llvm), m_dim(other.m_dim), m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol), - m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode), m_pars(other.m_pars), - m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_pinf(other.m_pinf), m_minf(other.m_minf), + m_llvm_state(other.m_llvm_state), m_tplt_state(other.m_tplt_state), m_dim(other.m_dim), m_dc(other.m_dc), + m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy), + m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), m_pars(other.m_pars), m_tc(other.m_tc), + m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_pinf(other.m_pinf), m_minf(other.m_minf), m_delta_ts(other.m_delta_ts), m_step_res(other.m_step_res), m_prop_res(other.m_prop_res), m_ts_count(other.m_ts_count), m_min_abs_h(other.m_min_abs_h), m_max_abs_h(other.m_max_abs_h), m_cur_max_delta_ts(other.m_cur_max_delta_ts), m_pfor_ts(other.m_pfor_ts), m_t_dir(other.m_t_dir), @@ -314,7 +348,11 @@ taylor_adaptive_batch::i_data::i_data(const i_data &other) m_nf_detected(other.m_nf_detected), m_d_out_time(other.m_d_out_time), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data) { - m_d_out_f = reinterpret_cast(m_llvm.jit_lookup("d_out_f")); + // Recover the function pointers. + m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); + + // Init the compact mode tape, if necessary. 
+ init_cm_tape(); } template diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp index e97b4d0df..43e2f9788 100644 --- a/src/taylor_adaptive_batch.cpp +++ b/src/taylor_adaptive_batch.cpp @@ -41,6 +41,7 @@ #endif #include +#include #include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -113,9 +115,9 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tol); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order); - HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dc); - HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_last_h); @@ -138,6 +140,8 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_time); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_vsys); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tm_data); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape_sa); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape); // Init the data members. m_batch_size = batch_size; @@ -243,10 +247,14 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta m_order = detail::taylor_order_from_tol(m_tol); // Determine the external fp type. - auto *ext_fp_t = detail::to_external_llvm_type(m_llvm.context()); + auto *ext_fp_t = detail::to_external_llvm_type(std::get<0>(m_llvm_state).context()); // Determine the internal fp type. - auto *fp_t = detail::internal_llvm_type_like(m_llvm, m_tol); + auto *fp_t = detail::internal_llvm_type_like(std::get<0>(m_llvm_state), m_tol); + + // The state(s) which will be returned by the construction of the stepper function. + // If we are not in compact mode, this vector will remain empty. + std::vector states; // Add the stepper function. 
if (with_events) { @@ -259,11 +267,13 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta ee.push_back(ev.get_expression()); } - m_dc = detail::taylor_add_adaptive_step_with_events(m_llvm, ext_fp_t, fp_t, "step_e", sys, batch_size, - compact_mode, ee, high_accuracy, parallel_mode, m_order); + std::tie(m_dc, m_tape_sa, states) + = detail::taylor_add_adaptive_step_with_events(std::get<0>(m_llvm_state), fp_t, "step_e", sys, batch_size, + compact_mode, ee, high_accuracy, parallel_mode, m_order); } else { - m_dc = detail::taylor_add_adaptive_step(m_llvm, ext_fp_t, fp_t, "step", sys, batch_size, high_accuracy, - compact_mode, parallel_mode, m_order); + std::tie(m_dc, m_tape_sa, states) + = detail::taylor_add_adaptive_step(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step", sys, batch_size, + high_accuracy, compact_mode, parallel_mode, m_order); } // Fix m_pars' size, if necessary. @@ -282,27 +292,43 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta // Log runtimes in trace mode. spdlog::stopwatch sw; - // Add the function for the computation of - // the dense output. - detail::taylor_add_d_out_function(m_llvm, ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy); + // Add the function for the computation of the dense output. + // NOTE: in compact mode, the dense output function will be added to the main state. + detail::taylor_add_d_out_function(std::get<0>(m_llvm_state), ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy); detail::get_logger()->trace("Taylor batch dense output runtime: {}", sw); sw.reset(); - // Run the jit. - m_llvm.compile(); + // Run the jit compilation. + if (compact_mode) { + // Add the main state to the list of states. + states.push_back(std::move(std::get<0>(m_llvm_state))); - detail::get_logger()->trace("Taylor batch LLVM compilation runtime: {}", sw); + // Reverse the list of states so that we start with the + // compilation of the main state first, which may be bigger. 
+ std::ranges::reverse(states); - // Fetch the stepper. - if (with_events) { - m_step_f = reinterpret_cast(m_llvm.jit_lookup("step_e")); + // Create the multi state and assign it. + m_llvm_state = llvm_multi_state(std::move(states)); + + // Compile. + std::get<1>(m_llvm_state).compile(); + + // Create the storage for the tape of derivatives. + const auto [sz, al] = m_tape_sa; + m_tape = detail::make_aligned_buffer(sz, al); } else { - m_step_f = reinterpret_cast(m_llvm.jit_lookup("step")); + std::get<0>(m_llvm_state).compile(); } + detail::get_logger()->trace("Taylor batch LLVM compilation runtime: {}", sw); + + // Fetch the stepper. + assign_stepper(with_events); + // Fetch the function to compute the dense output. - m_d_out_f = reinterpret_cast(m_llvm.jit_lookup("d_out_f")); + m_d_out_f = std::visit( + [](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); // Setup the vector for the Taylor coefficients. // NOTE: the size of m_state.size() already takes @@ -351,22 +377,22 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta m_d_out_time.resize(m_batch_size); // Init the event data structure if needed. - // NOTE: this can be done in parallel with the rest of the constructor, + // NOTE: in principle this can be done in parallel with the rest of the constructor, // once we have m_order/m_dim/m_batch_size and we are done using tes/ntes. if (with_events) { - m_ed_data = std::make_unique(m_llvm.make_similar(), std::move(tes), std::move(ntes), m_order, m_dim, - m_batch_size); + m_ed_data = std::make_unique(m_tplt_state.make_similar(), std::move(tes), std::move(ntes), m_order, + m_dim, m_batch_size); } if (auto_ic_setup) { // Finish the automatic setup of the ics for a variational // integrator. 
- detail::setup_variational_ics_t0(m_llvm, m_state, m_pars, m_time_hi.data(), std::get<1>(vsys), m_batch_size, - m_high_accuracy, m_compact_mode); + detail::setup_variational_ics_t0(m_tplt_state, m_state, m_pars, m_time_hi.data(), std::get<1>(vsys), + m_batch_size, m_high_accuracy, m_compact_mode); } if (is_variational) { - m_tm_data.emplace(std::get<1>(vsys), 0, m_llvm, m_batch_size); + m_tm_data.emplace(std::get<1>(vsys), 0, m_tplt_state, m_batch_size); } // Move vsys in. @@ -384,11 +410,7 @@ taylor_adaptive_batch::taylor_adaptive_batch(const taylor_adaptive_batch &oth : m_i_data(std::make_unique(*other.m_i_data)), m_ed_data(other.m_ed_data ? std::make_unique(*other.m_ed_data) : nullptr) { - if (m_ed_data) { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step_e")); - } else { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step")); - } + assign_stepper(static_cast(m_ed_data)); } template @@ -447,12 +469,8 @@ void taylor_adaptive_batch::load_impl(Archive &ar, unsigned version) ar >> m_i_data; ar >> m_ed_data; - // Recover the function pointers. - if (m_ed_data) { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step_e")); - } else { - m_i_data->m_step_f = reinterpret_cast(m_i_data->m_llvm.jit_lookup("step")); - } + // Recover the stepper. + assign_stepper(static_cast(m_ed_data)); // LCOV_EXCL_START } catch (...) { // Reset to def-cted state in case of exceptions. 
@@ -602,6 +620,7 @@ void taylor_adaptive_batch::step_impl(const std::vector &max_delta_ts, boo HEYOKA_TAYLOR_REF_FROM_I_DATA(m_time_copy_lo); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_nf_detected); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape); using std::abs; using std::isfinite; @@ -633,11 +652,17 @@ void taylor_adaptive_batch::step_impl(const std::vector &max_delta_ts, boo return false; }; - if (m_step_f.index() == 0u) { + if (m_step_f.index() == 0u || m_step_f.index() == 2u) { assert(!m_ed_data); // LCOV_EXCL_LINE - std::get<0>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(), - wtc ? m_tc.data() : nullptr); + // Invoke the vanilla stepper. + if (m_step_f.index() == 0u) { + std::get<0>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(), + wtc ? m_tc.data() : nullptr); + } else { + std::get<2>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(), + wtc ? m_tc.data() : nullptr, m_tape.get()); + } // Update the times and the last timesteps, and write out the result. for (std::uint32_t i = 0; i < m_batch_size; ++i) { @@ -673,8 +698,13 @@ void taylor_adaptive_batch::step_impl(const std::vector &max_delta_ts, boo // Invoke the stepper for event handling. We will record the norm infinity of the state vector + // event equations at the beginning of the timestep for later use. 
- std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(), - edd.m_max_abs_state.data()); + if (m_step_f.index() == 1u) { + std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(), + m_delta_ts.data(), edd.m_max_abs_state.data()); + } else { + std::get<3>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(), + m_delta_ts.data(), edd.m_max_abs_state.data(), m_tape.get()); + } // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that @@ -1081,7 +1111,7 @@ taylor_adaptive_batch::propagate_until_impl(const puntil_arg_t &ts_, std::siz HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order); - HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_pinf); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_res); @@ -1215,7 +1245,7 @@ taylor_adaptive_batch::propagate_until_impl(const puntil_arg_t &ts_, std::siz } // Construct the return value. - continuous_output_batch ret(m_llvm.make_similar()); + continuous_output_batch ret(m_tplt_state.make_similar()); // Fill in the data. ret.m_batch_size = m_batch_size; @@ -1986,9 +2016,9 @@ taylor_adaptive_batch::propagate_grid_impl(const std::vector &grid, std::s } template -const llvm_state &taylor_adaptive_batch::get_llvm_state() const +const std::variant &taylor_adaptive_batch::get_llvm_state() const { - return m_i_data->m_llvm; + return m_i_data->m_llvm_state; } template @@ -2291,6 +2321,29 @@ void taylor_adaptive_batch::check_variational(const char *fname) const } } +// Helper to fetch the stepper function from m_llvm_state. 
+template +void taylor_adaptive_batch::assign_stepper(bool with_events) +{ + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f); + HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state); + + if (with_events) { + if (m_compact_mode) { + m_step_f = reinterpret_cast(std::get<1>(m_llvm_state).jit_lookup("step_e")); + } else { + m_step_f = reinterpret_cast(std::get<0>(m_llvm_state).jit_lookup("step_e")); + } + } else { + if (m_compact_mode) { + m_step_f = reinterpret_cast(std::get<1>(m_llvm_state).jit_lookup("step")); + } else { + m_step_f = reinterpret_cast(std::get<0>(m_llvm_state).jit_lookup("step")); + } + } +} + template const std::vector &taylor_adaptive_batch::get_vargs() const { diff --git a/test/llvm_state.cpp b/test/llvm_state.cpp index 57d61782f..821524df3 100644 --- a/test/llvm_state.cpp +++ b/test/llvm_state.cpp @@ -108,7 +108,7 @@ TEST_CASE("copy semantics") kw::fast_math = true, kw::mname = "sample state"}; - const auto &s = ta.get_llvm_state(); + const auto &s = std::get<0>(ta.get_llvm_state()); REQUIRE(s.module_name() == "sample state"); REQUIRE(s.get_opt_level() == 2u); diff --git a/test/logical.cpp b/test/logical.cpp index bf7a39ce9..a45329e8e 100644 --- a/test/logical.cpp +++ b/test/logical.cpp @@ -460,9 +460,8 @@ TEST_CASE("taylor_adaptive") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE( - boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and.var_var_num.")); - REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_and.var_var_num.")); + REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_or")); } ta1.propagate_until(5.); @@ -481,8 +480,8 @@ TEST_CASE("taylor_adaptive") kw::pars = {1.24}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or.var_var_par.")); - 
REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_or.var_var_par.")); + REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_and")); } ta1.propagate_until(5.); @@ -514,9 +513,8 @@ TEST_CASE("taylor_adaptive_batch") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE( - boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and.var_var_num.")); - REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_and.var_var_num.")); + REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_or")); } ta1.propagate_until(5.); @@ -541,8 +539,8 @@ TEST_CASE("taylor_adaptive_batch") kw::pars = {1.24, 1.25}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or.var_var_par.")); - REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_or.var_var_par.")); + REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_and")); } ta1.propagate_until(5.); diff --git a/test/model_nbody.cpp b/test/model_nbody.cpp index 7b89d0485..1911c0123 100644 --- a/test/model_nbody.cpp +++ b/test/model_nbody.cpp @@ -67,9 +67,11 @@ TEST_CASE("nbody") // Check that llvm.pow appears only maximum 3 times: its declaration plus 2 uses // for determining the timestep size. Vectorisation may further reduce this number. 
- std::vector> pow_matches; - boost::find_all(pow_matches, ta.get_llvm_state().get_ir(), "@llvm.pow"); - REQUIRE(pow_matches.size() <= 3u); + for (auto cur_ir : std::get<1>(ta.get_llvm_state()).get_ir()) { + std::vector> pow_matches; + boost::find_all(pow_matches, cur_ir, "@llvm.pow"); + REQUIRE(pow_matches.size() <= 3u); + } llvm_state s; std::vector vars; diff --git a/test/opt_checks.cpp b/test/opt_checks.cpp index df46a5049..1a60ef906 100644 --- a/test/opt_checks.cpp +++ b/test/opt_checks.cpp @@ -6,6 +6,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include #include #include @@ -30,20 +31,20 @@ TEST_CASE("function inlining") auto ta = taylor_adaptive{sys, std::vector(36u, 0.), kw::compact_mode = true}; - auto md_ir = ta.get_llvm_state().get_ir(); + for (auto md_ir : std::get<1>(ta.get_llvm_state()).get_ir()) { + using string_find_iterator = boost::find_iterator; - using string_find_iterator = boost::find_iterator; + auto count = 0u; + for (auto it = boost::make_find_iterator(md_ir, boost::first_finder("define ", boost::is_iequal())); + it != string_find_iterator(); ++it) { + ++count; + } - auto count = 0u; - for (auto it = boost::make_find_iterator(md_ir, boost::first_finder("define ", boost::is_iequal())); - it != string_find_iterator(); ++it) { - ++count; + // NOTE: in general we expect 3 functions definitions, but auto-vectorization + // could bump up this number. I think 6 is the maximum right now (3 possible + // vector width on x86 - 2, 4, 8). + REQUIRE(count <= 6u); } - - // NOTE: in general we expect 3 functions definitions, but auto-vectorization - // could bump up this number. I think 6 is the maximum right now (3 possible - // vector width on x86 - 2, 4, 8). 
- REQUIRE(count <= 6u); } // Vectorization of the pow() function when determining @@ -54,7 +55,7 @@ TEST_CASE("pow vect") #if defined(HEYOKA_WITH_SLEEF) - auto md_ir = ta.get_llvm_state().get_ir(); + auto md_ir = std::get<0>(ta.get_llvm_state()).get_ir(); const auto &tf = detail::get_target_features(); diff --git a/test/rel.cpp b/test/rel.cpp index 3b9123113..0b74b77eb 100644 --- a/test/rel.cpp +++ b/test/rel.cpp @@ -308,7 +308,7 @@ TEST_CASE("taylor_adaptive") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_gt.var_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_gt.var_num.")); } ta1.propagate_until(5.); @@ -326,7 +326,7 @@ TEST_CASE("taylor_adaptive") kw::pars = {1.24}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_lt.var_par.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_lt.var_par.")); } ta1.propagate_until(5.); @@ -358,7 +358,7 @@ TEST_CASE("taylor_adaptive_batch") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_gt.var_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_gt.var_num.")); } ta1.propagate_until(5.); @@ -382,7 +382,7 @@ TEST_CASE("taylor_adaptive_batch") kw::pars = {1.24, 1.25}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_lt.var_par.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_lt.var_par.")); } ta1.propagate_until(5.); diff --git a/test/select.cpp b/test/select.cpp index df2afa337..be9c2c068 100644 --- a/test/select.cpp +++ b/test/select.cpp @@ -269,7 +269,7 @@ TEST_CASE("taylor_adaptive") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_var_num.")); + REQUIRE(ir_contains(ta2, 
"heyoka.taylor_c_diff.select.var_var_num.")); } ta1.propagate_until(5.); @@ -287,7 +287,7 @@ TEST_CASE("taylor_adaptive") kw::pars = {1.}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_par_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_par_num.")); } ta1.propagate_until(5.); @@ -305,7 +305,7 @@ TEST_CASE("taylor_adaptive") kw::pars = {1.}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.par_par_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.par_par_num.")); } ta1.propagate_until(5.); @@ -337,7 +337,7 @@ TEST_CASE("taylor_adaptive_batch") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_var_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_var_num.")); } ta1.propagate_until(5.); @@ -362,7 +362,7 @@ TEST_CASE("taylor_adaptive_batch") kw::pars = {1., 1.}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_par_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_par_num.")); } ta1.propagate_until(5.); @@ -386,7 +386,7 @@ TEST_CASE("taylor_adaptive_batch") kw::pars = {1., 1.}}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.par_par_num.")); + REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.par_par_num.")); } ta1.propagate_until(5.); diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp index 7c2f9406c..9229f4c44 100644 --- a/test/taylor_adaptive.cpp +++ b/test/taylor_adaptive.cpp @@ -1688,7 +1688,7 @@ void s11n_test_impl() ia >> ta; } - REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir()); + REQUIRE(std::get<1>(ta.get_llvm_state()).get_ir() == 
std::get<1>(ta_copy.get_llvm_state()).get_ir()); REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition()); REQUIRE(ta.get_order() == ta_copy.get_order()); REQUIRE(ta.get_tol() == ta_copy.get_tol()); @@ -1753,7 +1753,7 @@ void s11n_test_impl() ia >> ta; } - REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir()); + REQUIRE(std::get<0>(ta.get_llvm_state()).get_ir() == std::get<0>(ta_copy.get_llvm_state()).get_ir()); REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition()); REQUIRE(ta.get_order() == ta_copy.get_order()); REQUIRE(ta.get_dim() == ta_copy.get_dim()); diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp index b1bd279b9..acb4ab50a 100644 --- a/test/taylor_adaptive_batch.cpp +++ b/test/taylor_adaptive_batch.cpp @@ -1070,7 +1070,7 @@ void s11n_test_impl() ia >> ta; } - REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir()); + REQUIRE(std::get<1>(ta.get_llvm_state()).get_ir() == std::get<1>(ta_copy.get_llvm_state()).get_ir()); REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition()); REQUIRE(ta.get_order() == ta_copy.get_order()); REQUIRE(ta.get_tol() == ta_copy.get_tol()); @@ -1143,7 +1143,7 @@ void s11n_test_impl() ia >> ta; } - REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir()); + REQUIRE(std::get<0>(ta.get_llvm_state()).get_ir() == std::get<0>(ta_copy.get_llvm_state()).get_ir()); REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition()); REQUIRE(ta.get_order() == ta_copy.get_order()); REQUIRE(ta.get_tol() == ta_copy.get_tol()); @@ -2130,7 +2130,7 @@ TEST_CASE("pow rho sleef") auto ta = taylor_adaptive_batch{ {prime(x) = rhs_x, prime(v) = rhs_v}, std::vector(8u, 0.), 4u, kw::tol = 1e-6}; - const auto ir = ta.get_llvm_state().get_ir(); + const auto ir = std::get<0>(ta.get_llvm_state()).get_ir(); // NOTE: run the check only if avx2 is available. 
if (!boost::algorithm::contains(ir, "+avx2")) { diff --git a/test/taylor_adaptive_mp.cpp b/test/taylor_adaptive_mp.cpp index 5d8c5c78d..aadf9990a 100644 --- a/test/taylor_adaptive_mp.cpp +++ b/test/taylor_adaptive_mp.cpp @@ -1113,7 +1113,7 @@ TEST_CASE("s11n") ia >> ta; } - REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir()); + REQUIRE(std::get<1>(ta.get_llvm_state()).get_ir() == std::get<1>(ta_copy.get_llvm_state()).get_ir()); REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition()); REQUIRE(ta.get_order() == ta_copy.get_order()); REQUIRE(ta.get_tol() == ta_copy.get_tol()); diff --git a/test/taylor_atan2.cpp b/test/taylor_atan2.cpp index 5eaf92aa2..e7fa2b102 100644 --- a/test/taylor_atan2.cpp +++ b/test/taylor_atan2.cpp @@ -116,7 +116,7 @@ TEST_CASE("taylor atan2") kw::pars = {b}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.num_par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.num_par")); } ta.step(true); @@ -321,7 +321,7 @@ TEST_CASE("taylor atan2") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.var_num")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.var_num")); } ta.step(true); @@ -545,7 +545,7 @@ TEST_CASE("taylor atan2") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.num_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.num_var")); } ta.step(true); @@ -770,7 +770,7 @@ TEST_CASE("taylor atan2") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.var_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.var_var")); } ta.step(true); diff --git a/test/taylor_kepE.cpp b/test/taylor_kepE.cpp index 
6b7ec9ea7..9f9b15c4f 100644 --- a/test/taylor_kepE.cpp +++ b/test/taylor_kepE.cpp @@ -104,7 +104,7 @@ TEST_CASE("taylor kepE") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.num_num")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.num_num")); } ta.step(true); @@ -306,7 +306,7 @@ TEST_CASE("taylor kepE") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.var_num")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.var_num")); } ta.step(true); @@ -552,7 +552,7 @@ TEST_CASE("taylor kepE") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.num_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.num_var")); } ta.step(true); @@ -773,7 +773,7 @@ TEST_CASE("taylor kepE") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.var_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.var_var")); } ta.step(true); diff --git a/test/taylor_kepF.cpp b/test/taylor_kepF.cpp index a951ce2be..dad9899d4 100644 --- a/test/taylor_kepF.cpp +++ b/test/taylor_kepF.cpp @@ -93,7 +93,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.1), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.num_par_num")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.num_par_num")); } ta.step(true); @@ -133,7 +133,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.1), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.num_par_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.num_par_var")); } ta.step(true); 
@@ -185,7 +185,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.2), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.num_var_par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.num_var_par")); } ta.step(true); @@ -246,7 +246,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.2), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_num_par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_num_par")); } ta.step(true); @@ -307,7 +307,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.2), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.par_var_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.par_var_var")); } ta.step(true); @@ -368,7 +368,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.2), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_par_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_par_var")); } ta.step(true); @@ -429,7 +429,7 @@ TEST_CASE("taylor kepF") kw::pars = {fp_t(.2), fp_t(.2)}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_var_par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_var_par")); } ta.step(true); @@ -496,7 +496,7 @@ TEST_CASE("taylor kepF") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_var_var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_var_var")); } ta.step(true); diff --git a/test/taylor_kepF_mp.cpp b/test/taylor_kepF_mp.cpp index e775377fe..57e7b37ba 100644 --- a/test/taylor_kepF_mp.cpp +++ b/test/taylor_kepF_mp.cpp @@ -62,8 +62,7 @@ 
TEST_CASE("kepF") kw::pars = {fp_t(.1, prec)}}; if (opt_level == 0u && cm) { - REQUIRE( - boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.kepF.num_par_var")); + REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.kepF.num_par_var")); } ta.step(true); diff --git a/test/taylor_pow.cpp b/test/taylor_pow.cpp index 3b2de6300..be2e6f00b 100644 --- a/test/taylor_pow.cpp +++ b/test/taylor_pow.cpp @@ -59,35 +59,35 @@ TEST_CASE("taylor pow approx") { auto ta = taylor_adaptive{{prime(x) = pow(x, -1.5) + pow(x, 1 / 3.)}, {2.}, kw::tol = .1, kw::opt_level = 0}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.pow")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt")); + REQUIRE(ir_contains(ta, "@llvm.pow")); + REQUIRE(ir_contains(ta, "@llvm.sqrt")); } { auto ta = taylor_adaptive{std::vector{std::pair{x, pow(par[0], -1.5)}}, {2.}, kw::tol = .1, kw::opt_level = 0}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt")); + REQUIRE(ir_contains(ta, "@llvm.sqrt")); } { auto ta = taylor_adaptive{std::vector{std::pair{x, pow(-1.5_dbl, par[0])}}, {2.}, kw::tol = .1, kw::opt_level = 0}; - REQUIRE(!boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt")); + REQUIRE(!ir_contains(ta, "@llvm.sqrt")); } { auto ta = taylor_adaptive{ std::vector{std::pair{x, pow(x, -1.5) + pow(x, 1 / 3.)}}, {2.}, kw::tol = .1, kw::opt_level = 0}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.pow")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt")); + REQUIRE(ir_contains(ta, "@llvm.pow")); + REQUIRE(ir_contains(ta, "@llvm.sqrt")); } { auto ta = taylor_adaptive{std::vector{std::pair{x, pow(par[0], -1.5)}}, {2.}, kw::tol = .1, kw::opt_level = 0}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt")); + REQUIRE(ir_contains(ta, "@llvm.sqrt")); } { @@ -97,28 +97,28 @@ TEST_CASE("taylor pow approx") kw::opt_level = 0, kw::compact_mode = true}; - 
REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow.")); } { auto ta = taylor_adaptive{ std::vector{std::pair{x, pow(x, 2_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_square.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow_square.")); } { auto ta = taylor_adaptive{ std::vector{std::pair{x, pow(x, .5_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_sqrt.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow_sqrt.")); } { auto ta = taylor_adaptive{ std::vector{std::pair{x, pow(x, 1.5_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_pos_small_half_3.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow_pos_small_half_3.")); } { @@ -128,21 +128,21 @@ TEST_CASE("taylor pow approx") kw::opt_level = 0, kw::compact_mode = true}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_neg_small_half_3.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow_neg_small_half_3.")); } { auto ta = taylor_adaptive{ std::vector{std::pair{x, pow(x, 4_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_pos_small_int_4.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow_pos_small_int_4.")); } { auto ta = taylor_adaptive{ std::vector{std::pair{x, pow(x, -4_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true}; - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_neg_small_int_4.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.pow_neg_small_int_4.")); } } @@ -168,7 +168,7 @@ TEST_CASE("taylor pow") kw::pars = {fp_t{1} / fp_t{3}}}; if (opt_level == 0u && compact_mode) { - 
REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.pow.num_par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.pow.num_par")); } ta.step(true); @@ -422,7 +422,7 @@ TEST_CASE("taylor pow") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.pow.var_num")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.pow.var_num")); } ta.step(true); diff --git a/test/taylor_prod.cpp b/test/taylor_prod.cpp index 271edfac5..0d301aa39 100644 --- a/test/taylor_prod.cpp +++ b/test/taylor_prod.cpp @@ -158,7 +158,7 @@ TEST_CASE("taylor mul") kw::pars = {fp_t{0}, fp_t{0}, fp_t{3}, fp_t{3}}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg.")); } ta.step(true); @@ -205,7 +205,7 @@ TEST_CASE("taylor mul") kw::pars = {fp_t{-2}}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg.")); } ta.step(true); @@ -603,8 +603,8 @@ TEST_CASE("taylor mul") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg.")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.prod.")); } ta.step(true); @@ -675,8 +675,8 @@ TEST_CASE("taylor mul") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg.")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg.")); + REQUIRE(ir_contains(ta, "taylor_c_diff.prod.")); } ta.step(true); diff --git a/test/taylor_relu.cpp 
b/test/taylor_relu.cpp index 43ac00302..5da516713 100644 --- a/test/taylor_relu.cpp +++ b/test/taylor_relu.cpp @@ -72,8 +72,8 @@ TEST_CASE("taylor relu relup") kw::pars = {fp_t{-1}, fp_t{2}, fp_t{4}, fp_t{-3}}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu.par")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup.par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu.par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup.par")); } ta.step(true); @@ -118,8 +118,8 @@ TEST_CASE("taylor relu relup") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu.var")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup.var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu.var")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup.var")); } ta.step(true); @@ -182,9 +182,9 @@ TEST_CASE("taylor relu relup leaky") kw::pars = {fp_t{-1}, fp_t{2}, fp_t{4}, fp_t{-3}}}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu_0x")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup_0x")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), ".par")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu_0x")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup_0x")); + REQUIRE(ir_contains(ta, ".par")); } ta.step(true); @@ -229,9 +229,9 @@ TEST_CASE("taylor relu relup leaky") kw::opt_level = opt_level}; if (opt_level == 0u && compact_mode) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu_0x")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup_0x")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), ".var")); + REQUIRE(ir_contains(ta, 
"@heyoka.taylor_c_diff.relu_0x")); + REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup_0x")); + REQUIRE(ir_contains(ta, ".var")); } ta.step(true); diff --git a/test/taylor_relu_mp.cpp b/test/taylor_relu_mp.cpp index be5a6872b..f07638a8e 100644 --- a/test/taylor_relu_mp.cpp +++ b/test/taylor_relu_mp.cpp @@ -54,8 +54,8 @@ TEST_CASE("relu") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relu.var")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relup.var")); + REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relu.var")); + REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relup.var")); } ta.step(true); @@ -95,9 +95,9 @@ TEST_CASE("relu leaky") kw::opt_level = opt_level}; if (opt_level == 0u && cm) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relu_0x")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relup_0x")); - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), ".var")); + REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relu_0x")); + REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relup_0x")); + REQUIRE(ir_contains(ta, ".var")); } ta.step(true); diff --git a/test/taylor_square.cpp b/test/taylor_square.cpp index f39b6b822..871142bf3 100644 --- a/test/taylor_square.cpp +++ b/test/taylor_square.cpp @@ -90,7 +90,7 @@ TEST_CASE("taylor square") kw::pars = {fp_t{2}}}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_pos_small_int_2.par_num")); + REQUIRE(ir_contains(ta, "pow_pos_small_int_2.par_num")); } ta.step(true); @@ -141,7 +141,7 @@ TEST_CASE("taylor square") kw::pars = {fp_t{0}, fp_t{0}, fp_t{2}, fp_t{2}}}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_pos_small_int_2.par_num")); + REQUIRE(ir_contains(ta, "pow_pos_small_int_2.par_num")); } ta.step(true); @@ -272,7 +272,7 @@ 
TEST_CASE("taylor square") kw::pars = {fp_t{2}, fp_t{2}, fp_t{2}}}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_pos_small_int_2.par_num")); + REQUIRE(ir_contains(ta, "pow_pos_small_int_2.par_num")); } ta.step(true); @@ -326,7 +326,7 @@ TEST_CASE("taylor square") kw::opt_level = opt_level}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num")); + REQUIRE(ir_contains(ta, "pow_square.var_num")); } ta.step(true); @@ -349,7 +349,7 @@ TEST_CASE("taylor square") kw::opt_level = opt_level}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num")); + REQUIRE(ir_contains(ta, "pow_square.var_num")); } ta.step(true); @@ -378,7 +378,7 @@ TEST_CASE("taylor square") kw::opt_level = opt_level}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num")); + REQUIRE(ir_contains(ta, "pow_square.var_num")); } ta.step(true); @@ -403,7 +403,7 @@ TEST_CASE("taylor square") kw::opt_level = opt_level}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num")); + REQUIRE(ir_contains(ta, "pow_square.var_num")); } ta.step(true); @@ -439,7 +439,7 @@ TEST_CASE("taylor square") kw::opt_level = opt_level}; if (compact_mode && opt_level == 0u) { - REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num")); + REQUIRE(ir_contains(ta, "pow_square.var_num")); } ta.step(true); diff --git a/test/test_utils.hpp b/test/test_utils.hpp index 52df16fb6..c4a10b132 100644 --- a/test/test_utils.hpp +++ b/test/test_utils.hpp @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -21,8 +22,11 @@ #include #include #include +#include #include +#include + #include #include #include @@ -285,6 +289,16 @@ template void compare_batch_scalar(const std::vector> &, unsigned, bool, bool, std::mt19937 
&, float, float, T = T(1000.)); +bool ir_contains(const auto &ta, const char *str) +{ + if (ta.get_compact_mode()) { + return std::ranges::any_of(std::get<1>(ta.get_llvm_state()).get_ir(), + [&](const auto &ir) { return boost::contains(ir, str); }); + } else { + return boost::contains(std::get<0>(ta.get_llvm_state()).get_ir(), str); + } +} + } // namespace heyoka_test #endif From 4d6f0133721f1f0e13dc6e5047cd83a84021b1bc Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Tue, 27 Aug 2024 09:28:27 +0200 Subject: [PATCH 06/30] Update the known issues page. --- doc/known_issues.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/known_issues.rst b/doc/known_issues.rst index a822d395b..599f17127 100644 --- a/doc/known_issues.rst +++ b/doc/known_issues.rst @@ -22,6 +22,12 @@ Unsolved Solved ====== +* Due to an `upstream bug `__, + the option for selecting the code model used for JIT compilation + (added in heyoka 6.0.0) is ignored by LLVM and the default code model + is always used. This issue affects all LLVM versions up to and including LLVM 18. + A patch for LLVM 18 that rectifies the issue is available + `here `__. * Certain LLVM versions fail to correctly free memory when objects used to implement just-in-time compilation are destroyed. In practice this may result in exhausting the available RAM if many integrators and/or compiled functions From 0900a07c4ff5710bc613425e6094c4acd035aed0 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Tue, 27 Aug 2024 18:35:45 +0200 Subject: [PATCH 07/30] Initial work on re-enabling parallel mode for the integrators. 
--- src/detail/llvm_helpers.cpp | 4 + src/taylor_01.cpp | 92 +++++++-------- src/taylor_02.cpp | 218 ++++++++++++++++++++++++++++++++---- 3 files changed, 241 insertions(+), 73 deletions(-) diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp index c5a4afc2c..bf81c3e93 100644 --- a/src/detail/llvm_helpers.cpp +++ b/src/detail/llvm_helpers.cpp @@ -1358,6 +1358,8 @@ llvm::CallInst *llvm_invoke_external(llvm_state &s, const std::string &name, llv // Add the function attributes. callee_f->setAttributes(attrs); } else { + // LCOV_EXCL_START + // The function declaration exists already. Check that it is only a // declaration and not a definition. if (!callee_f->isDeclaration()) { @@ -1374,6 +1376,8 @@ llvm::CallInst *llvm_invoke_external(llvm_state &s, const std::string &name, llv } // NOTE: in the future we should consider adding more checks here // (e.g., argument types, return type, attributes, etc.). + + // LCOV_EXCL_STOP } // Create the function call. diff --git a/src/taylor_01.cpp b/src/taylor_01.cpp index 3ebec981d..3d1498396 100644 --- a/src/taylor_01.cpp +++ b/src/taylor_01.cpp @@ -1106,64 +1106,58 @@ void taylor_add_d_out_function(llvm_state &s, llvm::Type *fp_scal_t, std::uint32 builder.SetInsertPoint(orig_bb); } +namespace +{ + +// NOTE: this is the function which computes the +// Taylor derivatives for a subrange in a block. +// A block consists of ncalls invocations of the same +// Taylor derivative function with different arguments. +// [begin, end) is a subrange of [0, ncalls). tape_ptr +// is a pointer to the tape of derivatives, par_ptr and +// time_ptr are pointers to the arrays of parameter value(s) +// and time value(s). cur_order is the current Taylor order. 
+using block_subrange_f = void (*)(std::uint32_t begin, std::uint32_t end, void *tape_ptr, const void *par_ptr, + const void *time_ptr, std::uint32_t cur_order) noexcept; + +} // namespace + } // namespace detail HEYOKA_END_NAMESPACE -// NOTE: this is the worker function that is invoked to compute -// in parallel all the derivatives of a block in parallel mode. -extern "C" HEYOKA_DLL_PUBLIC void heyoka_cm_par_looper(std::uint32_t ncalls, - void (*fptr)(std::uint32_t, std::uint32_t) noexcept) noexcept +// This function computes the Taylor derivatives for a segment in parallel mode. +// +// f_arr is the array of functions for the computations of the derivatives in the block +// subranges, ncalls_ptr is an array containing the number of times each function in +// f_arr must be called. Both f_arr and ncalls_ptr are arrays of size nfuncs. +// tape/par/time_ptr are pointers to the tape/parameter/time values. cur_order is the Taylor +// order at which the computation of the derivatives must be performed. 
+extern "C" HEYOKA_DLL_PUBLIC void heyoka_taylor_cm_par_segment(const heyoka::detail::block_subrange_f *f_arr, + const std::uint32_t *ncalls_ptr, std::uint32_t nfuncs, + void *tape_ptr, const void *par_ptr, + const void *time_ptr, std::uint32_t cur_order) noexcept { try { - oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, ncalls), - [fptr](const auto &range) { fptr(range.begin(), range.end()); }); + oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, nfuncs), + [ncalls_ptr, f_arr, tape_ptr, par_ptr, time_ptr, cur_order](const auto &func_range) { + for (auto f_idx = func_range.begin(); f_idx != func_range.end(); ++f_idx) { + const auto cur_ncalls = ncalls_ptr[f_idx]; + auto *cur_f = f_arr[f_idx]; + + oneapi::tbb::parallel_for( + oneapi::tbb::blocked_range(0, cur_ncalls), + [cur_f, tape_ptr, par_ptr, time_ptr, cur_order](const auto &call_range) { + cur_f(call_range.begin(), call_range.end(), tape_ptr, par_ptr, + time_ptr, cur_order); + }); + } + }); // LCOV_EXCL_START } catch (const std::exception &ex) { - heyoka::detail::get_logger()->critical("Exception caught in the parallel mode looper: {}", ex.what()); + heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker: {}", ex.what()); } catch (...) { - heyoka::detail::get_logger()->critical("Exception caught in the parallel mode looper"); + heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker"); } // LCOV_EXCL_STOP } - -HEYOKA_BEGIN_NAMESPACE - -namespace detail -{ - -namespace -{ - -// NOTE: use typedef to minimise issues -// when mucking around with the preprocessor. -using par_f_ptr = void (*)() noexcept; - -} // namespace - -} // namespace detail - -HEYOKA_END_NAMESPACE - -// NOTE: this is the parallel invoker that gets called from LLVM -// to run multiple parallel workers within a segment at the same time, i.e., -// to process multiple blocks within a segment concurrently. 
-// We need to generate multiple instantiatiation of this function -// up to the limit HEYOKA_CM_PAR_MAX_INVOKE_N defined in config.hpp. - -#define HEYOKA_CM_PAR_INVOKE(_0, N, _1) \ - extern "C" HEYOKA_DLL_PUBLIC void heyoka_cm_par_invoke_##N( \ - BOOST_PP_ENUM_PARAMS(N, heyoka::detail::par_f_ptr f)) noexcept \ - { \ - try { \ - BOOST_PP_IF(BOOST_PP_SUB(N, 1), oneapi::tbb::parallel_invoke(BOOST_PP_ENUM_PARAMS(N, f)), f0()); \ - } catch (const std::exception &ex) { \ - heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker: {}", ex.what()); \ - } catch (...) { \ - heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker"); \ - } \ - } - -BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_ADD(HEYOKA_CM_PAR_MAX_INVOKE_N, 1), HEYOKA_CM_PAR_INVOKE, _0) - -#undef HEYOKA_CM_PAR_INVOKE diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 0a3c03a95..0adc738a0 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -634,6 +634,193 @@ using taylor_cm_seg_f_list_t = std::map>>, llvm_func_name_compare>; +// Helper to codegen the computation of the Taylor derivatives for a segment +// from a taylor_cm_seg_f_list_t in sequential mode. +// +// s is the llvm state in which we are operating, fp_vec_type the internal vector type we are using +// for computations, seg_map is the taylor_cm_seg_f_list_t containing the list of functions for the computation +// of Taylor derivatives within a segment, n_uvars the total number of u variables in the decomposition. +void taylor_cm_codegen_segment_diff_sequential(llvm_state &s, llvm::Type *fp_vec_type, + const taylor_cm_seg_f_list_t &seg_map, std::uint32_t n_uvars) +{ + // Fetch the current builder. + auto &bld = s.builder(); + + // Fetch the arguments from the driver prototype. 
+ auto *driver_f = bld.GetInsertBlock()->getParent(); + assert(driver_f != nullptr); + assert(driver_f->arg_size() == 4u); + auto *tape_ptr = driver_f->args().begin(); + auto *par_ptr = driver_f->args().begin() + 1; + auto *time_ptr = driver_f->args().begin() + 2; + auto *cur_order = driver_f->args().begin() + 3; + + // Compute the derivatives for this segment. + for (const auto &[func, fpair] : seg_map) { + const auto &[ncalls, gens] = fpair; + + taylor_cm_codegen_block_diff(s, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type, + n_uvars); + } +} + +// Helper to codegen the computation of the Taylor derivatives for a segment +// from a taylor_cm_seg_f_list_t in parallel mode. +// +// s is the llvm state in which we are operating, fp_vec_type the internal vector type we are using +// for computations, seg_map is the taylor_cm_seg_f_list_t containing the list of functions for the computation +// of Taylor derivatives within a segment, n_uvars the total number of u variables in the decomposition. +void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_type, + const taylor_cm_seg_f_list_t &seg_map, std::uint32_t n_uvars) +{ + // NOTE: in parallel mode, we introduce worker functions that operate similarly to + // taylor_cm_codegen_block_diff(), except that they do not process an entire block + // but only a subrange of a block. These worker functions are then invoked in parallel + // by heyoka_taylor_cm_par_segment(). In order to pass the worker functions to + // heyoka_taylor_cm_par_segment(), we need to store pointers to them in global arrays, + // together with the information on how many times each function must be called. + + auto &bld = s.builder(); + auto &ctx = s.context(); + auto &md = s.module(); + + // Fetch the current insertion block, so that we can restore it later. + auto *orig_bb = bld.GetInsertBlock(); + + // Fetch several types for the current context. 
+ auto *ptr_tp = llvm::PointerType::getUnqual(ctx); + auto *i32_tp = bld.getInt32Ty(); + auto *void_tp = bld.getVoidTy(); + + // Init the vectors with the constant initializers for the workers/ncalls arrays. + std::vector workers_arr, ncalls_arr; + + // Generate the workers for each block. + for (const auto &[func, fpair] : seg_map) { + const auto &[ncalls, gens] = fpair; + + // Create the prototype for the current worker. The arguments are: + // + // - int32 begin/end call indices, + // - tape pointer (read & write), + // - par pointer (read-only), + // - time pointer (read-only), + // - int32 current Taylor order. + // + // The pointer arguments cannot overlap. + std::vector worker_args{i32_tp, i32_tp, ptr_tp, ptr_tp, ptr_tp, i32_tp}; + + // The worker does not return anything. + auto *worker_proto = llvm::FunctionType::get(void_tp, worker_args, false); + assert(worker_proto != nullptr); // LCOV_EXCL_LINE + + // Create the worker. + auto *worker = llvm::Function::Create(worker_proto, llvm::Function::InternalLinkage, "", &md); + + // NOTE: the worker cannot call itself recursively. + worker->addFnAttr(llvm::Attribute::NoRecurse); + + // Add the arguments' attributes. 
+ auto *begin_arg = worker->args().begin(); + begin_arg->setName("begin"); + + auto *end_arg = worker->args().begin() + 1; + end_arg->setName("end"); + + auto *tape_ptr_arg = worker->args().begin() + 2; + tape_ptr_arg->setName("tape_ptr"); + tape_ptr_arg->addAttr(llvm::Attribute::NoCapture); + tape_ptr_arg->addAttr(llvm::Attribute::NoAlias); + + auto *par_ptr_arg = worker->args().begin() + 3; + par_ptr_arg->setName("par_ptr"); + par_ptr_arg->addAttr(llvm::Attribute::NoCapture); + par_ptr_arg->addAttr(llvm::Attribute::NoAlias); + par_ptr_arg->addAttr(llvm::Attribute::ReadOnly); + + auto *time_ptr_arg = worker->args().begin() + 4; + time_ptr_arg->setName("time_ptr"); + time_ptr_arg->addAttr(llvm::Attribute::NoCapture); + time_ptr_arg->addAttr(llvm::Attribute::NoAlias); + time_ptr_arg->addAttr(llvm::Attribute::ReadOnly); + + auto *order_arg = worker->args().begin() + 5; + order_arg->setName("order"); + + // Create a new basic block to start insertion into. + auto *bb = llvm::BasicBlock::Create(ctx, "entry", worker); + assert(bb != nullptr); // LCOV_EXCL_LINE + bld.SetInsertPoint(bb); + + // Loop over the begin/end range. + llvm_loop_u32(s, begin_arg, end_arg, [&](llvm::Value *cur_call_idx) { + // Create the u variable index from the first generator. + auto u_idx = gens[0](cur_call_idx); + + // Initialise the vector of arguments with which func must be called. The following + // initial arguments are always present: + // - current Taylor order, + // - u index of the variable, + // - tape of derivatives, + // - pointer to the param values, + // - pointer to the time value(s). + std::vector args{order_arg, u_idx, tape_ptr_arg, par_ptr_arg, time_ptr_arg}; + + // Create the other arguments via the generators. + for (decltype(gens.size()) i = 1; i < gens.size(); ++i) { + args.push_back(gens[i](cur_call_idx)); + } + + // Calculate the derivative and store the result. 
+ taylor_c_store_diff(s, fp_vec_type, tape_ptr_arg, n_uvars, order_arg, u_idx, bld.CreateCall(func, args)); + }); + + // Return. + bld.CreateRetVoid(); + + // Add a pointer to the current worker to workers_arr. + workers_arr.push_back(worker); + + // Add ncalls to ncalls_arr. + ncalls_arr.push_back(bld.getInt32(boost::numeric_cast(ncalls))); + } + + // Restore the original insertion block in the driver. + bld.SetInsertPoint(orig_bb); + + // Generate the global variables for workers_arr and ncalls_arr, and fetch pointers + // to their first elements. + auto *workers_arr_tp = llvm::ArrayType::get(ptr_tp, boost::numeric_cast(workers_arr.size())); + auto *workers_arr_carr = llvm::ConstantArray::get(workers_arr_tp, workers_arr); + auto *workers_arr_gv = new llvm::GlobalVariable(md, workers_arr_carr->getType(), true, + llvm::GlobalVariable::InternalLinkage, workers_arr_carr); + auto *workers_ptr + = bld.CreateInBoundsGEP(workers_arr_carr->getType(), workers_arr_gv, {bld.getInt32(0), bld.getInt32(0)}); + + auto *ncalls_arr_tp = llvm::ArrayType::get(i32_tp, boost::numeric_cast(ncalls_arr.size())); + auto *ncalls_arr_carr = llvm::ConstantArray::get(ncalls_arr_tp, ncalls_arr); + auto *ncalls_arr_gv = new llvm::GlobalVariable(md, ncalls_arr_carr->getType(), true, + llvm::GlobalVariable::InternalLinkage, ncalls_arr_carr); + auto *ncalls_ptr + = bld.CreateInBoundsGEP(ncalls_arr_carr->getType(), ncalls_arr_gv, {bld.getInt32(0), bld.getInt32(0)}); + + // Fetch the arguments for heyoka_taylor_cm_par_segment() from the driver prototype. + auto *driver_f = bld.GetInsertBlock()->getParent(); + assert(driver_f != nullptr); + assert(driver_f->arg_size() == 4u); + auto *tape_ptr = driver_f->args().begin(); + auto *par_ptr = driver_f->args().begin() + 1; + auto *time_ptr = driver_f->args().begin() + 2; + auto *cur_order = driver_f->args().begin() + 3; + + // Invoke heyoka_taylor_cm_par_segment(). 
+ llvm_invoke_external(s, "heyoka_taylor_cm_par_segment", void_tp, + {workers_ptr, ncalls_ptr, bld.getInt32(boost::numeric_cast(seg_map.size())), + tape_ptr, par_ptr, time_ptr, cur_order}, + llvm::AttributeList::get(ctx, llvm::AttributeList::FunctionIndex, + {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn})); +} + // Helper to codegen the computation of the Taylor derivatives for a segment. // // seg is the segment, start_u_idx the index of the first u variable in the segment, s the llvm state @@ -641,14 +828,11 @@ using taylor_cm_seg_f_list_t // the total number of u variables, high_accuracy the high accuracy flag. taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint32_t start_u_idx, llvm_state &s, llvm::Type *fp_t, std::uint32_t batch_size, std::uint32_t n_uvars, - bool high_accuracy) + bool high_accuracy, bool parallel_mode) { // Fetch the internal vector type. auto *fp_vec_type = make_vector_type(fp_t, batch_size); - // Fetch the current builder. - auto &bld = s.builder(); - // This structure maps a function to sets of arguments // with which the function is to be called. For instance, if function // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map @@ -766,21 +950,10 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint } } - // Fetch the arguments from the driver prototype. - auto *driver_f = bld.GetInsertBlock()->getParent(); - assert(driver_f != nullptr); - assert(driver_f->arg_size() == 4u); - auto *tape_ptr = driver_f->args().begin(); - auto *par_ptr = driver_f->args().begin() + 1; - auto *time_ptr = driver_f->args().begin() + 2; - auto *cur_order = driver_f->args().begin() + 3; - - // Compute the derivatives for this segment. 
- for (const auto &[func, fpair] : seg_map) { - const auto &[ncalls, gens] = fpair; - - taylor_cm_codegen_block_diff(s, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type, - n_uvars); + if (parallel_mode) { + taylor_cm_codegen_segment_diff_parallel(s, fp_vec_type, seg_map, n_uvars); + } else { + taylor_cm_codegen_segment_diff_sequential(s, fp_vec_type, seg_map, n_uvars); } return seg_map; @@ -803,9 +976,6 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx) { - // TODO implement. - (void)parallel_mode; - // Init the list of states. // NOTE: we use lists here because it is convenient to have // pointer/reference stability when iteratively constructing @@ -928,8 +1098,8 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T auto *fp_t = llvm_clone_type(*cur_state, main_fp_t); // Codegen the computation of the derivatives for this segment. - const auto seg_map - = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars, high_accuracy); + const auto seg_map = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars, + high_accuracy, parallel_mode); // Update the number of codegenned blocks. n_cg_blocks += seg_map.size(); From b9e1a161fc60247214494af0463d2e6fd2f932db Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 09:13:37 +0200 Subject: [PATCH 08/30] Small cleanups, internal renames and docs. 
--- src/taylor_01.cpp | 51 ++++++++++++++++++++++++----------------------- src/taylor_02.cpp | 14 +++++++------ 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/taylor_01.cpp b/src/taylor_01.cpp index 3d1498396..679a4a33c 100644 --- a/src/taylor_01.cpp +++ b/src/taylor_01.cpp @@ -25,11 +25,6 @@ #include #include #include -#include -#include -#include -#include -#include #include #include @@ -1109,16 +1104,20 @@ void taylor_add_d_out_function(llvm_state &s, llvm::Type *fp_scal_t, std::uint32 namespace { -// NOTE: this is the function which computes the +// NOTE: this is the worker function type which computes the // Taylor derivatives for a subrange in a block. // A block consists of ncalls invocations of the same // Taylor derivative function with different arguments. +// Workers are created on the LLVM side when parallel mode is +// active. +// // [begin, end) is a subrange of [0, ncalls). tape_ptr // is a pointer to the tape of derivatives, par_ptr and // time_ptr are pointers to the arrays of parameter value(s) -// and time value(s). cur_order is the current Taylor order. -using block_subrange_f = void (*)(std::uint32_t begin, std::uint32_t end, void *tape_ptr, const void *par_ptr, - const void *time_ptr, std::uint32_t cur_order) noexcept; +// and time value(s). order is the desired Taylor order for +// the computation of the derivatives. +using block_worker_f = void (*)(std::uint32_t begin, std::uint32_t end, void *tape_ptr, const void *par_ptr, + const void *time_ptr, std::uint32_t order) noexcept; } // namespace @@ -1126,38 +1125,40 @@ using block_subrange_f = void (*)(std::uint32_t begin, std::uint32_t end, void * HEYOKA_END_NAMESPACE -// This function computes the Taylor derivatives for a segment in parallel mode. +// This function computes the Taylor derivatives for a segment in parallel mode. It is invoked +// from LLVM after the creation of the worker functions that compute the Taylor derivatives +// for a subrange in a block. 
// -// f_arr is the array of functions for the computations of the derivatives in the block -// subranges, ncalls_ptr is an array containing the number of times each function in -// f_arr must be called. Both f_arr and ncalls_ptr are arrays of size nfuncs. -// tape/par/time_ptr are pointers to the tape/parameter/time values. cur_order is the Taylor -// order at which the computation of the derivatives must be performed. -extern "C" HEYOKA_DLL_PUBLIC void heyoka_taylor_cm_par_segment(const heyoka::detail::block_subrange_f *f_arr, - const std::uint32_t *ncalls_ptr, std::uint32_t nfuncs, +// worker_arr is the array of worker functions for the computations of the derivatives in the block +// subranges, ncalls_arr is an array containing the number of times each function in +// worker_arr must be called. Both worker_arr and ncalls_arr are arrays of size nfuncs. +// tape/par/time_ptr are pointers to the tape/parameter/time values. order is the desired Taylor order for +// the computation of the derivatives. 
+extern "C" HEYOKA_DLL_PUBLIC void heyoka_taylor_cm_par_segment(const heyoka::detail::block_worker_f *worker_arr, + const std::uint32_t *ncalls_arr, std::uint32_t nfuncs, void *tape_ptr, const void *par_ptr, - const void *time_ptr, std::uint32_t cur_order) noexcept + const void *time_ptr, std::uint32_t order) noexcept { try { oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, nfuncs), - [ncalls_ptr, f_arr, tape_ptr, par_ptr, time_ptr, cur_order](const auto &func_range) { + [ncalls_arr, worker_arr, tape_ptr, par_ptr, time_ptr, order](const auto &func_range) { for (auto f_idx = func_range.begin(); f_idx != func_range.end(); ++f_idx) { - const auto cur_ncalls = ncalls_ptr[f_idx]; - auto *cur_f = f_arr[f_idx]; + const auto cur_ncalls = ncalls_arr[f_idx]; + auto *cur_f = worker_arr[f_idx]; oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(0, cur_ncalls), - [cur_f, tape_ptr, par_ptr, time_ptr, cur_order](const auto &call_range) { + [cur_f, tape_ptr, par_ptr, time_ptr, order](const auto &call_range) { cur_f(call_range.begin(), call_range.end(), tape_ptr, par_ptr, - time_ptr, cur_order); + time_ptr, order); }); } }); // LCOV_EXCL_START } catch (const std::exception &ex) { - heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker: {}", ex.what()); + heyoka::detail::get_logger()->critical("Exception caught in heyoka_taylor_cm_par_segment(): {}", ex.what()); } catch (...) 
{ - heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker"); + heyoka::detail::get_logger()->critical("Exception caught in heyoka_taylor_cm_par_segment()"); } // LCOV_EXCL_STOP } diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 0adc738a0..4c311d726 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -655,7 +655,7 @@ void taylor_cm_codegen_segment_diff_sequential(llvm_state &s, llvm::Type *fp_vec auto *time_ptr = driver_f->args().begin() + 2; auto *cur_order = driver_f->args().begin() + 3; - // Compute the derivatives for this segment. + // Generate the code for the computation of the derivatives for this segment. for (const auto &[func, fpair] : seg_map) { const auto &[ncalls, gens] = fpair; @@ -680,6 +680,7 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t // heyoka_taylor_cm_par_segment(), we need to store pointers to them in global arrays, // together with the information on how many times each function must be called. + // Fetch builder/context/module. auto &bld = s.builder(); auto &ctx = s.context(); auto &md = s.module(); @@ -687,7 +688,7 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t // Fetch the current insertion block, so that we can restore it later. auto *orig_bb = bld.GetInsertBlock(); - // Fetch several types for the current context. + // Fetch several types from the current context. auto *ptr_tp = llvm::PointerType::getUnqual(ctx); auto *i32_tp = bld.getInt32Ty(); auto *void_tp = bld.getVoidTy(); @@ -701,11 +702,11 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t // Create the prototype for the current worker. The arguments are: // - // - int32 begin/end call indices, + // - int32 begin/end call range, // - tape pointer (read & write), // - par pointer (read-only), // - time pointer (read-only), - // - int32 current Taylor order. + // - int32 Taylor order. // // The pointer arguments cannot overlap. 
std::vector worker_args{i32_tp, i32_tp, ptr_tp, ptr_tp, ptr_tp, i32_tp}; @@ -811,12 +812,12 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t auto *tape_ptr = driver_f->args().begin(); auto *par_ptr = driver_f->args().begin() + 1; auto *time_ptr = driver_f->args().begin() + 2; - auto *cur_order = driver_f->args().begin() + 3; + auto *order = driver_f->args().begin() + 3; // Invoke heyoka_taylor_cm_par_segment(). llvm_invoke_external(s, "heyoka_taylor_cm_par_segment", void_tp, {workers_ptr, ncalls_ptr, bld.getInt32(boost::numeric_cast(seg_map.size())), - tape_ptr, par_ptr, time_ptr, cur_order}, + tape_ptr, par_ptr, time_ptr, order}, llvm::AttributeList::get(ctx, llvm::AttributeList::FunctionIndex, {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn})); } @@ -950,6 +951,7 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint } } + // Generate the code for the computation of the Taylor derivatives. if (parallel_mode) { taylor_cm_codegen_segment_diff_parallel(s, fp_vec_type, seg_map, n_uvars); } else { From 148b2fbecfc5d44a09d7deb731b1acf6c80d4433 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 10:07:05 +0200 Subject: [PATCH 09/30] Add a flag to the llvm_multi_state constructor to enable parallel JIT. --- include/heyoka/llvm_state.hpp | 16 +++++++- src/llvm_state.cpp | 74 +++++++++++++++++++++++------------ test/llvm_multi_state.cpp | 13 ++++-- 3 files changed, 73 insertions(+), 30 deletions(-) diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp index 23ebb8536..086e0c631 100644 --- a/include/heyoka/llvm_state.hpp +++ b/include/heyoka/llvm_state.hpp @@ -347,6 +347,19 @@ struct llvm_mc_value { std::optional llvm_state_mem_cache_lookup(const std::vector &, unsigned); void llvm_state_mem_cache_try_insert(std::vector, unsigned, llvm_mc_value); +// The default setting for the parjit flag for llvm_multi_state. 
+// There is evidence of an LLVM thread scheduling bug when parallel compilation +// is active, that rarely results in multiply-defined symbols for external C +// functions, which leads to compilation failure. So far, we have been able to +// trigger this issue only on Linux aarch64. +inline constexpr bool default_parjit = +#if defined(HEYOKA_ARCH_ARM) && defined(__linux__) + false +#else + true +#endif + ; + } // namespace detail class HEYOKA_DLL_PUBLIC llvm_multi_state @@ -371,7 +384,7 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state public: llvm_multi_state(); - explicit llvm_multi_state(std::vector); + explicit llvm_multi_state(std::vector, bool = detail::default_parjit); template requires std::ranges::input_range && std::same_as>> @@ -393,6 +406,7 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state [[nodiscard]] unsigned get_opt_level() const noexcept; [[nodiscard]] bool get_slp_vectorize() const noexcept; [[nodiscard]] code_model get_code_model() const noexcept; + [[nodiscard]] bool get_parjit() const noexcept; [[nodiscard]] std::vector get_ir() const; [[nodiscard]] std::vector get_bc() const; diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp index 8cf7139f3..da7908e14 100644 --- a/src/llvm_state.cpp +++ b/src/llvm_state.cpp @@ -1551,6 +1551,8 @@ struct multi_jit { // NOTE: this is the total number of modules, including // the master module. const unsigned m_n_modules = 0; + // Flag to signal that we are enabling parallel compilation. + const bool m_parjit; // NOTE: enumerate the LLVM members here in the same order // as llvm_state, as this is important to ensure proper // destruction order. 
@@ -1570,7 +1572,7 @@ struct multi_jit { std::vector m_ir_snapshots; std::vector m_bc_snapshots; - explicit multi_jit(unsigned, unsigned, code_model, bool, bool); + explicit multi_jit(unsigned, unsigned, code_model, bool, bool, bool); multi_jit(const multi_jit &) = delete; multi_jit(multi_jit &&) noexcept = delete; llvm_multi_state &operator=(const multi_jit &) = delete; @@ -1613,8 +1615,9 @@ constexpr auto master_module_name = "heyoka.master"; // NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit. // NOTE: make sure to coordinate changes in this constructor with llvm_state::jit. -multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize) - : m_n_modules(n_modules) +multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize, + bool parjit) + : m_n_modules(n_modules), m_parjit(parjit) { assert(n_modules >= 2u); @@ -1637,31 +1640,37 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, lljit_builder.setJITTargetMachineBuilder(jtmb); #if 0 - // Create a task dispatcher. - auto tdisp = std::make_unique(); - // Create an ExecutorProcessControl. - auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp)); - // LCOV_EXCL_START - if (!epc) { - auto err = epc.takeError(); + if (m_parjit) { + // Create a task dispatcher. + auto tdisp = std::make_unique(); - std::string err_report; - llvm::raw_string_ostream ostr(err_report); + // Create an ExecutorProcessControl. + auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp)); + // LCOV_EXCL_START + if (!epc) { + auto err = epc.takeError(); - ostr << err; + std::string err_report; + llvm::raw_string_ostream ostr(err_report); - throw std::invalid_argument( - fmt::format("Could not create a SelfExecutorProcessControl. 
The full error message is:\n{}", ostr.str())); + ostr << err; + + throw std::invalid_argument(fmt::format( + "Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str())); + } + // LCOV_EXCL_STOP + + // Set it in the lljit builder. + lljit_builder.setExecutorProcessControl(std::move(*epc)); } - // LCOV_EXCL_STOP - // Set it in the lljit builder. - lljit_builder.setExecutorProcessControl(std::move(*epc)); #else - // Set the number of compilation threads. - lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency()); + if (m_parjit) { + // Set the number of compilation threads. + lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency()); + } #endif @@ -1815,6 +1824,9 @@ struct llvm_multi_state::impl { // Store the states. ar << m_states; + // Store the parjit flag. + ar << m_jit->m_parjit; + // Store the object files and the snapshots. These may be empty. ar << m_jit->m_object_files; ar << m_jit->m_ir_snapshots; @@ -1837,10 +1849,14 @@ struct llvm_multi_state::impl { assert(!m_states.empty()); + // Load the parjit flag. + bool parjit{}; + ar >> parjit; + // Reset the jit with a new one. - m_jit = std::make_unique(boost::safe_numerics::safe(m_states.size()) + 1, - m_states[0].get_opt_level(), m_states[0].get_code_model(), - m_states[0].force_avx512(), m_states[0].get_slp_vectorize()); + m_jit = std::make_unique( + boost::safe_numerics::safe(m_states.size()) + 1, m_states[0].get_opt_level(), + m_states[0].get_code_model(), m_states[0].force_avx512(), m_states[0].get_slp_vectorize(), parjit); // Load the object files and the snapshots. ar >> m_jit->m_object_files; @@ -1871,7 +1887,7 @@ struct llvm_multi_state::impl { llvm_multi_state::llvm_multi_state() = default; -llvm_multi_state::llvm_multi_state(std::vector states_) +llvm_multi_state::llvm_multi_state(std::vector states_, bool parjit) { // Fetch a const ref, as we want to make extra sure we do not modify // states_ until we move it to construct the impl. 
@@ -1940,7 +1956,7 @@ llvm_multi_state::llvm_multi_state(std::vector states_) // Create the multi_jit. auto jit = std::make_unique(boost::safe_numerics::safe(states.size()) + 1, opt_level, - c_model, force_avx512, slp_vectorize); + c_model, force_avx512, slp_vectorize, parjit); // Build and assign the implementation. impl imp{.m_states = std::move(states_), .m_jit = std::move(jit)}; @@ -1956,7 +1972,7 @@ llvm_multi_state::llvm_multi_state(const llvm_multi_state &other) impl imp{.m_states = other.m_impl->m_states, .m_jit = std::make_unique(other.m_impl->m_jit->m_n_modules, other.get_opt_level(), other.get_code_model(), other.force_avx512(), - other.get_slp_vectorize())}; + other.get_slp_vectorize(), other.get_parjit())}; m_impl = std::make_unique(std::move(imp)); if (other.is_compiled()) { @@ -2131,6 +2147,11 @@ code_model llvm_multi_state::get_code_model() const noexcept return m_impl->m_states[0].get_code_model(); } +bool llvm_multi_state::get_parjit() const noexcept +{ + return m_impl->m_jit->m_parjit; +} + bool llvm_multi_state::is_compiled() const noexcept { return !m_impl->m_jit->m_module; @@ -2346,6 +2367,7 @@ std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s) oss << "SLP vectorization : " << s.get_slp_vectorize() << '\n'; oss << "Code model : " << s.get_code_model() << '\n'; oss << "Optimisation level: " << s.get_opt_level() << '\n'; + oss << "Parallel JIT : " << s.get_parjit() << '\n'; oss << "Data layout : " << s.m_impl->m_states[0].m_jitter->m_lljit->getDataLayout().getStringRepresentation() << '\n'; oss << "Target triple : " << s.m_impl->m_states[0].m_jitter->get_target_triple().str() << '\n'; diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp index bda7b476c..f8589b046 100644 --- a/test/llvm_multi_state.cpp +++ b/test/llvm_multi_state.cpp @@ -100,6 +100,7 @@ TEST_CASE("basic") REQUIRE(ms.get_code_model() == code_model::large); REQUIRE(ms.get_n_modules() == 5u); REQUIRE(!ms.is_compiled()); + REQUIRE(ms.get_parjit() == 
detail::default_parjit); ms.compile(); @@ -121,7 +122,7 @@ TEST_CASE("basic") llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true, kw::code_model = code_model::large}; - llvm_multi_state ms{{s, s, s, s}}; + llvm_multi_state ms{{s, s, s, s}, false}; auto ms2 = std::move(ms); @@ -132,6 +133,7 @@ TEST_CASE("basic") REQUIRE(ms2.get_code_model() == code_model::large); REQUIRE(ms2.get_n_modules() == 5u); REQUIRE(!ms2.is_compiled()); + REQUIRE(!ms2.get_parjit()); ms2.compile(); @@ -163,7 +165,7 @@ TEST_CASE("copy semantics") add_cfunc(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true); add_cfunc(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true); - llvm_multi_state ms{{s1, s2}}; + llvm_multi_state ms{{s1, s2}, false}; auto ms_copy = ms; @@ -175,6 +177,7 @@ TEST_CASE("copy semantics") REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE(!ms_copy.get_parjit()); REQUIRE_THROWS_MATCHES( ms_copy.get_object_code(), std::invalid_argument, Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); @@ -227,6 +230,7 @@ TEST_CASE("copy semantics") REQUIRE(ms_copy2.get_opt_level() == ms.get_opt_level()); REQUIRE(ms_copy2.get_slp_vectorize() == ms.get_slp_vectorize()); REQUIRE(ms_copy2.get_code_model() == ms.get_code_model()); + REQUIRE(!ms_copy2.get_parjit()); REQUIRE_NOTHROW(ms_copy2.jit_lookup("f1")); REQUIRE_NOTHROW(ms_copy2.jit_lookup("f2")); @@ -259,6 +263,7 @@ TEST_CASE("copy semantics") REQUIRE(ms_copy3.get_opt_level() == ms.get_opt_level()); REQUIRE(ms_copy3.get_slp_vectorize() == ms.get_slp_vectorize()); REQUIRE(ms_copy3.get_code_model() == ms.get_code_model()); + REQUIRE(!ms_copy3.get_parjit()); REQUIRE_NOTHROW(ms_copy3.jit_lookup("f1")); REQUIRE_NOTHROW(ms_copy3.jit_lookup("f2")); @@ -298,7 +303,7 @@ TEST_CASE("s11n") add_cfunc(s2, "f2", 
{x / y}, {x, y}, kw::compact_mode = true); // Uncompiled. - llvm_multi_state ms{{s1, s2}}; + llvm_multi_state ms{{s1, s2}, false}; std::stringstream ss; @@ -322,6 +327,7 @@ TEST_CASE("s11n") REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE(!ms_copy.get_parjit()); REQUIRE_THROWS_MATCHES( ms_copy.get_object_code(), std::invalid_argument, Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled")); @@ -353,6 +359,7 @@ TEST_CASE("s11n") REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level()); REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize()); REQUIRE(ms_copy.get_code_model() == ms.get_code_model()); + REQUIRE(!ms_copy.get_parjit()); REQUIRE_NOTHROW(ms_copy.jit_lookup("f1")); REQUIRE_NOTHROW(ms_copy.jit_lookup("f2")); From 5668e21f63ae48dec0e0faaabd430892f7b09435 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 10:09:32 +0200 Subject: [PATCH 10/30] Bump up the class s11n version numbers for the adaptive integrators. --- include/heyoka/taylor.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp index b3de93017..be7d7f77c 100644 --- a/include/heyoka/taylor.hpp +++ b/include/heyoka/taylor.hpp @@ -1193,7 +1193,8 @@ namespace detail // which resulted also in changes in the event detection data structure. // - 4: switched to pimpl implementation for i_data. // - 5: removed m_state_vars/m_rhs, variational ODE data. -inline constexpr int taylor_adaptive_s11n_version = 5; +// - 6: added parallel JIT compilation for compact mode. +inline constexpr int taylor_adaptive_s11n_version = 6; // Boost s11n class version history for taylor_adaptive_batch: // - 1: added the m_state_vars and m_rhs members. 
@@ -1201,7 +1202,8 @@ inline constexpr int taylor_adaptive_s11n_version = 5; // which resulted also in changes in the event detection data structure. // - 3: switched to pimpl implementation for i_data. // - 4: removed m_state_vars/m_rhs, variational ODE data. -inline constexpr int taylor_adaptive_batch_s11n_version = 4; +// - 5: added parallel JIT compilation for compact mode. +inline constexpr int taylor_adaptive_batch_s11n_version = 5; } // namespace detail From 58071994525d3ddaed8faea9b6d2e7b4db721836 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 11:14:13 +0200 Subject: [PATCH 11/30] Internal doc bits, coverage fixes. --- src/detail/i_data.cpp | 28 ++++++++++++++++++++-------- src/taylor_adaptive_batch.cpp | 2 ++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp index 45e431c9a..3eaf66e31 100644 --- a/src/detail/i_data.cpp +++ b/src/detail/i_data.cpp @@ -103,13 +103,13 @@ void taylor_adaptive::i_data::init_cm_tape() const auto [sz, al] = m_tape_sa; if (m_compact_mode) { - assert(sz != 0u); - assert(al != 0u); + assert(sz != 0u); // LCOV_EXCL_LINE + assert(al != 0u); // LCOV_EXCL_LINE m_tape = detail::make_aligned_buffer(sz, al); } else { - assert(sz == 0u); - assert(al == 0u); + assert(sz == 0u); // LCOV_EXCL_LINE + assert(al == 0u); // LCOV_EXCL_LINE } } @@ -157,6 +157,9 @@ void taylor_adaptive::i_data::load(boost::archive::binary_iarchive &ar, unsig ar >> m_tm_data; // Recover the function pointers. + // NOTE: here we are recovering only the dense output function pointer because recovering + // the correct stepper requires information which is available only from the integrator + // class (hence, we do it from there). m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); // Reconstruct the compact mode tape, if necessary. 
@@ -184,6 +187,9 @@ taylor_adaptive::i_data::i_data(const i_data &other) m_tm_data(other.m_tm_data) { // Recover the function pointers. + // NOTE: here we are recovering only the dense output function pointer because recovering + // the correct stepper requires information which is available only from the integrator + // class (hence, we do it from there). m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); // Init the compact mode tape, if necessary. @@ -226,13 +232,13 @@ void taylor_adaptive_batch::i_data::init_cm_tape() const auto [sz, al] = m_tape_sa; if (m_compact_mode) { - assert(sz != 0u); - assert(al != 0u); + assert(sz != 0u); // LCOV_EXCL_LINE + assert(al != 0u); // LCOV_EXCL_LINE m_tape = detail::make_aligned_buffer(sz, al); } else { - assert(sz == 0u); - assert(al == 0u); + assert(sz == 0u); // LCOV_EXCL_LINE + assert(al == 0u); // LCOV_EXCL_LINE } } @@ -316,6 +322,9 @@ void taylor_adaptive_batch::i_data::load(boost::archive::binary_iarchive &ar, ar >> m_tm_data; // Recover the function pointers. + // NOTE: here we are recovering only the dense output function pointer because recovering + // the correct stepper requires information which is available only from the integrator + // class (hence, we do it from there). m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); // Reconstruct the compact mode tape, if necessary. @@ -349,6 +358,9 @@ taylor_adaptive_batch::i_data::i_data(const i_data &other) m_tm_data(other.m_tm_data) { // Recover the function pointers. + // NOTE: here we are recovering only the dense output function pointer because recovering + // the correct stepper requires information which is available only from the integrator + // class (hence, we do it from there). m_d_out_f = std::visit([](auto &s) { return reinterpret_cast(s.jit_lookup("d_out_f")); }, m_llvm_state); // Init the compact mode tape, if necessary. 
diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp index 43e2f9788..1c9c6a042 100644 --- a/src/taylor_adaptive_batch.cpp +++ b/src/taylor_adaptive_batch.cpp @@ -2322,6 +2322,8 @@ void taylor_adaptive_batch::check_variational(const char *fname) const } // Helper to fetch the stepper function from m_llvm_state. +// NOTE: this is exactly identical to the scalar integrator code. +// Should we write a separate common helper for this at one point? template void taylor_adaptive_batch::assign_stepper(bool with_events) { From ddccd2069b925ffae0ae270e55c0b22b4ea3cc2a Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 12:14:28 +0200 Subject: [PATCH 12/30] A couple of test additions. --- test/taylor_adaptive.cpp | 20 ++++++++++++++++++++ test/taylor_adaptive_batch.cpp | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp index 9229f4c44..150a2296f 100644 --- a/test/taylor_adaptive.cpp +++ b/test/taylor_adaptive.cpp @@ -1701,6 +1701,8 @@ void s11n_test_impl() REQUIRE(ta.get_tc() == ta_copy.get_tc()); REQUIRE(ta.get_last_h() == ta_copy.get_last_h()); REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); REQUIRE(value_type_index(ta.get_t_events()[0].get_callback()) == value_type_index(ta_copy.get_t_events()[0].get_callback())); @@ -1763,6 +1765,8 @@ void s11n_test_impl() REQUIRE(ta.get_tc() == ta_copy.get_tc()); REQUIRE(ta.get_last_h() == ta_copy.get_last_h()); REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); + REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_ir() == std::get<0>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_bc() == std::get<0>(ta.get_llvm_state()).get_bc()); // Take a step in ta and in ta_copy. 
ta.step(true); @@ -1826,6 +1830,14 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_tol() == ta.get_tol()); REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy()); REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + + ta.step(); + ta_copy.step(); + + REQUIRE(ta.get_state() == ta_copy.get_state()); + REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); ta_copy = taylor_adaptive{}; ta_copy = ta; @@ -1835,6 +1847,14 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_tol() == ta.get_tol()); REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy()); REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + + ta.step(); + ta_copy.step(); + + REQUIRE(ta.get_state() == ta_copy.get_state()); + REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); } #if defined(HEYOKA_ARCH_PPC) diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp index acb4ab50a..047191b87 100644 --- a/test/taylor_adaptive_batch.cpp +++ b/test/taylor_adaptive_batch.cpp @@ -1083,6 +1083,8 @@ void s11n_test_impl() REQUIRE(ta.get_tc() == ta_copy.get_tc()); REQUIRE(ta.get_last_h() == ta_copy.get_last_h()); REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); REQUIRE(ta.get_step_res() == ta_copy.get_step_res()); REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res()); @@ -1154,6 +1156,8 @@ void s11n_test_impl() REQUIRE(ta.get_tc() == 
ta_copy.get_tc()); REQUIRE(ta.get_last_h() == ta_copy.get_last_h()); REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); + REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_ir() == std::get<0>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_bc() == std::get<0>(ta.get_llvm_state()).get_bc()); REQUIRE(value_type_index(ta.get_t_events()[0].get_callback()) == value_type_index(ta_copy.get_t_events()[0].get_callback())); @@ -1717,6 +1721,14 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_tol() == ta.get_tol()); REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy()); REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + + ta.step(); + ta_copy.step(); + + REQUIRE(ta.get_state() == ta_copy.get_state()); + REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); ta_copy = taylor_adaptive_batch{}; ta_copy = ta; @@ -1726,6 +1738,14 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_tol() == ta.get_tol()); REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy()); REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + + ta.step(); + ta_copy.step(); + + REQUIRE(ta.get_state() == ta_copy.get_state()); + REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); } // Test case for the propagate_*() functions not considering From eec949abb7587268255f484c8481d0ba405e2f64 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 12:37:36 +0200 Subject: [PATCH 13/30] More test additions. 
--- test/taylor_adaptive.cpp | 16 ++++++++++++++++ test/taylor_adaptive_batch.cpp | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp index 150a2296f..ee77593f5 100644 --- a/test/taylor_adaptive.cpp +++ b/test/taylor_adaptive.cpp @@ -1727,6 +1727,12 @@ void s11n_test_impl() ta_copy.update_d_output(-.1, true); REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); + + // Also run a propagation with continuous output to test that + // the m_tplt_state member is correctly copied. + auto prop_res = ta.propagate_for(10., kw::c_output = true); + auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true); + REQUIRE((*std::get<4>(prop_res))(4.1) == (*std::get<4>(prop_copy_res))(4.1)); } // Test without events. @@ -1839,6 +1845,12 @@ TEST_CASE("copy semantics") REQUIRE(ta.get_state() == ta_copy.get_state()); REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); + // Also run a propagation with continuous output to test that + // the m_tplt_state member is correctly copied. 
+ auto prop_res = ta.propagate_for(10., kw::c_output = true); + auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true); + REQUIRE((*std::get<4>(prop_res))(4.1) == (*std::get<4>(prop_copy_res))(4.1)); + ta_copy = taylor_adaptive{}; ta_copy = ta; @@ -1855,6 +1867,10 @@ TEST_CASE("copy semantics") REQUIRE(ta.get_state() == ta_copy.get_state()); REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); + + prop_res = ta.propagate_for(10., kw::c_output = true); + prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true); + REQUIRE((*std::get<4>(prop_res))(14.1) == (*std::get<4>(prop_copy_res))(14.1)); } #if defined(HEYOKA_ARCH_PPC) diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp index 047191b87..a05788c6b 100644 --- a/test/taylor_adaptive_batch.cpp +++ b/test/taylor_adaptive_batch.cpp @@ -1105,6 +1105,12 @@ void s11n_test_impl() ta_copy.update_d_output({-.1, -.11}, true); REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); + + // Also run a propagation with continuous output to test that + // the m_tplt_state member is correctly copied. + auto prop_res = ta.propagate_for(10., kw::c_output = true); + auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true); + REQUIRE((*std::get<0>(prop_res))(4.1) == (*std::get<0>(prop_copy_res))(4.1)); } // A test with events. @@ -1730,6 +1736,12 @@ TEST_CASE("copy semantics") REQUIRE(ta.get_state() == ta_copy.get_state()); REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); + // Also run a propagation with continuous output to test that + // the m_tplt_state member is correctly copied. 
+ auto prop_res = ta.propagate_for(10., kw::c_output = true); + auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true); + REQUIRE((*std::get<0>(prop_res))(4.1) == (*std::get<0>(prop_copy_res))(4.1)); + ta_copy = taylor_adaptive_batch{}; ta_copy = ta; @@ -1746,6 +1758,10 @@ TEST_CASE("copy semantics") REQUIRE(ta.get_state() == ta_copy.get_state()); REQUIRE(ta.get_dtime() == ta_copy.get_dtime()); + + prop_res = ta.propagate_for(10., kw::c_output = true); + prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true); + REQUIRE((*std::get<0>(prop_res))(14.1) == (*std::get<0>(prop_copy_res))(14.1)); } // Test case for the propagate_*() functions not considering From 1c0f1493ab2e0301657dbb9798488d68626ec01b Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 14:58:06 +0200 Subject: [PATCH 14/30] Make sure to test compact mode propagation too in the batch integrator. --- test/taylor_adaptive_batch.cpp | 820 +++++++++++++++++---------------- 1 file changed, 419 insertions(+), 401 deletions(-) diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp index a05788c6b..ec6cb755c 100644 --- a/test/taylor_adaptive_batch.cpp +++ b/test/taylor_adaptive_batch.cpp @@ -162,270 +162,282 @@ TEST_CASE("propagate grid") { using Catch::Matchers::Message; - auto [x, v] = make_vars("x", "v"); - - auto ta = taylor_adaptive_batch{ - {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, 4u}; - - REQUIRE_THROWS_MATCHES( - ta.propagate_grid({}), std::invalid_argument, - Message( - "Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if the time grid is empty")); - - REQUIRE_THROWS_MATCHES( - ta.propagate_grid({1.}), std::invalid_argument, - Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: " - "the grid has a size of 1, which is not a multiple of the batch size (4)")); - REQUIRE_THROWS_MATCHES( - 
ta.propagate_grid({1., 2.}), std::invalid_argument, - Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: " - "the grid has a size of 2, which is not a multiple of the batch size (4)")); - REQUIRE_THROWS_MATCHES( - ta.propagate_grid({1., 2., 3., 4., 5.}), std::invalid_argument, - Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: " - "the grid has a size of 5, which is not a multiple of the batch size (4)")); - REQUIRE_THROWS_MATCHES( - ta.propagate_grid({0., 0., 1., 4.}), std::invalid_argument, - Message("When invoking propagate_grid(), the first element of the time grid " - "must match the current time coordinate - however, the first element of the time grid at " - "batch index 2 has a " - "value of 1, while the current time coordinate is 0")); - - ta.set_time({0., 0., std::numeric_limits::infinity(), 0.}); - - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0.}), std::invalid_argument, - Message("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if " - "the current time is not finite")); - - ta.set_time({0., 0., 0., 0.}); + for (auto cm : {true, false}) { + auto [x, v] = make_vars("x", "v"); - REQUIRE_THROWS_MATCHES( - ta.propagate_grid({0., 0., std::numeric_limits::infinity(), 0.}), std::invalid_argument, - Message( - "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES( - ta.propagate_grid({0., 0., 0., 0., 0., std::numeric_limits::infinity(), 0., 0.}), std::invalid_argument, - Message( - "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., -1., 1.}), std::invalid_argument, - Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES( - 
ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., std::numeric_limits::infinity()}), - std::invalid_argument, - Message( - "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 0., 0., 2.}), std::invalid_argument, - Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 2.}), std::invalid_argument, - Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 0., 1., 1., 2., 2., 2., 2.}), std::invalid_argument, - Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 0., 2., 2., 2., 2.}), std::invalid_argument, - Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 2., 1., 2.}), std::invalid_argument, - Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " - "Taylor integrator in batch mode")); - - // Set an infinity in the state. 
- ta.get_state_data()[0] = std::numeric_limits::infinity(); - - auto [cb, ret] = ta.propagate_grid({.0, .0, .0, .0}); - REQUIRE(!cb); - REQUIRE(ret.size() == 8u); - REQUIRE(std::get<0>(ta.get_propagate_res()[0]) == taylor_outcome::err_nf_state); - REQUIRE(std::get<0>(ta.get_propagate_res()[1]) == taylor_outcome::time_limit); - REQUIRE(std::get<0>(ta.get_propagate_res()[2]) == taylor_outcome::time_limit); - REQUIRE(std::get<0>(ta.get_propagate_res()[3]) == taylor_outcome::time_limit); + auto ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -9.8 * sin(x)}, + {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, + 4u, + kw::compact_mode = cm}; - // Reset the integrator. - ta = taylor_adaptive_batch{ - {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, 4u}; + REQUIRE_THROWS_MATCHES(ta.propagate_grid({}), std::invalid_argument, + Message("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode " + "if the time grid is empty")); - // Propagate to the initial time. 
- std::tie(cb, ret) = ta.propagate_grid({0., 0., 0., 0.}); - REQUIRE(!cb); - REQUIRE(ret.size() == 8u); - REQUIRE(ret == std::vector{0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}); - for (auto i = 0u; i < 4u; ++i) { - auto [oc, min_h, max_h, nsteps] = ta.get_propagate_res()[i]; + REQUIRE_THROWS_MATCHES( + ta.propagate_grid({1.}), std::invalid_argument, + Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: " + "the grid has a size of 1, which is not a multiple of the batch size (4)")); + REQUIRE_THROWS_MATCHES( + ta.propagate_grid({1., 2.}), std::invalid_argument, + Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: " + "the grid has a size of 2, which is not a multiple of the batch size (4)")); + REQUIRE_THROWS_MATCHES( + ta.propagate_grid({1., 2., 3., 4., 5.}), std::invalid_argument, + Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: " + "the grid has a size of 5, which is not a multiple of the batch size (4)")); + REQUIRE_THROWS_MATCHES( + ta.propagate_grid({0., 0., 1., 4.}), std::invalid_argument, + Message("When invoking propagate_grid(), the first element of the time grid " + "must match the current time coordinate - however, the first element of the time grid at " + "batch index 2 has a " + "value of 1, while the current time coordinate is 0")); - REQUIRE(oc == taylor_outcome::time_limit); - REQUIRE(min_h == std::numeric_limits::infinity()); - REQUIRE(max_h == 0); - REQUIRE(nsteps == 0u); - } + ta.set_time({0., 0., std::numeric_limits::infinity(), 0.}); - // Switch to the harmonic oscillator. 
- ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; + REQUIRE_THROWS_MATCHES( + ta.propagate_grid({0., 0., 0., 0.}), std::invalid_argument, + Message("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if " + "the current time is not finite")); + + ta.set_time({0., 0., 0., 0.}); + + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., std::numeric_limits::infinity(), 0.}), + std::invalid_argument, + Message("A non-finite time value was passed to propagate_grid() in an adaptive Taylor " + "integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 0., std::numeric_limits::infinity(), 0., 0.}), + std::invalid_argument, + Message("A non-finite time value was passed to propagate_grid() in an adaptive Taylor " + "integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., -1., 1.}), std::invalid_argument, + Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES( + ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., std::numeric_limits::infinity()}), + std::invalid_argument, + Message("A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch " + "mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 0., 0., 2.}), + std::invalid_argument, + Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 2.}), + std::invalid_argument, + Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 0., 1., 1., 2., 2., 2., 2.}), + std::invalid_argument, + Message("A non-monotonic time grid was 
passed to propagate_grid() in an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 0., 2., 2., 2., 2.}), + std::invalid_argument, + Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 2., 1., 2.}), + std::invalid_argument, + Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive " + "Taylor integrator in batch mode")); + + // Set an infinity in the state. + ta.get_state_data()[0] = std::numeric_limits::infinity(); + + auto [cb, ret] = ta.propagate_grid({.0, .0, .0, .0}); + REQUIRE(!cb); + REQUIRE(ret.size() == 8u); + REQUIRE(std::get<0>(ta.get_propagate_res()[0]) == taylor_outcome::err_nf_state); + REQUIRE(std::get<0>(ta.get_propagate_res()[1]) == taylor_outcome::time_limit); + REQUIRE(std::get<0>(ta.get_propagate_res()[2]) == taylor_outcome::time_limit); + REQUIRE(std::get<0>(ta.get_propagate_res()[3]) == taylor_outcome::time_limit); + + // Reset the integrator. + ta = taylor_adaptive_batch{ + {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, 4u}; + + // Propagate to the initial time. + std::tie(cb, ret) = ta.propagate_grid({0., 0., 0., 0.}); + REQUIRE(!cb); + REQUIRE(ret.size() == 8u); + REQUIRE(ret == std::vector{0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}); + for (auto i = 0u; i < 4u; ++i) { + auto [oc, min_h, max_h, nsteps] = ta.get_propagate_res()[i]; + + REQUIRE(oc == taylor_outcome::time_limit); + REQUIRE(min_h == std::numeric_limits::infinity()); + REQUIRE(max_h == 0); + REQUIRE(nsteps == 0u); + } - // Integrate forward over a dense grid from ~0 to ~10. 
- std::vector grid; - for (auto i = 0u; i < 1000u; ++i) { - for (auto j = 0; j < 4; ++j) { - grid.push_back(i / 100.); - if (i != 0u) { - grid.back() += j / 10.; + // Switch to the harmonic oscillator. + ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; + + // Integrate forward over a dense grid from ~0 to ~10. + std::vector grid; + for (auto i = 0u; i < 1000u; ++i) { + for (auto j = 0; j < 4; ++j) { + grid.push_back(i / 100.); + if (i != 0u) { + grid.back() += j / 10.; + } } } - } - std::tie(cb, ret) = ta.propagate_grid(grid); + std::tie(cb, ret) = ta.propagate_grid(grid); - REQUIRE(!cb); - REQUIRE(ret.size() == 8000ull); + REQUIRE(!cb); + REQUIRE(ret.size() == 8000ull); - for (auto i = 0u; i < 4u; ++i) { - REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); - REQUIRE(ta.get_time()[i] == grid[3996u + i]); - } + for (auto i = 0u; i < 4u; ++i) { + REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); + REQUIRE(ta.get_time()[i] == grid[3996u + i]); + } - for (auto i = 0u; i < 1000u; ++i) { - for (auto j = 0u; j < 4u; ++j) { - REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.)); - REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.)); + for (auto i = 0u; i < 1000u; ++i) { + for (auto j = 0u; j < 4u; ++j) { + REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.)); + REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.)); + } } - } - // Do the same backwards. - ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; - grid.clear(); - for (auto i = 0u; i < 1000u; ++i) { - for (auto j = 0; j < 4; ++j) { - grid.push_back(i / -100.); - if (i != 0u) { - grid.back() += j / -10.; + // Do the same backwards. 
+ ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; + grid.clear(); + for (auto i = 0u; i < 1000u; ++i) { + for (auto j = 0; j < 4; ++j) { + grid.push_back(i / -100.); + if (i != 0u) { + grid.back() += j / -10.; + } } } - } - std::tie(cb, ret) = ta.propagate_grid(grid); + std::tie(cb, ret) = ta.propagate_grid(grid); - REQUIRE(!cb); - REQUIRE(ret.size() == 8000ull); + REQUIRE(!cb); + REQUIRE(ret.size() == 8000ull); - for (auto i = 0u; i < 4u; ++i) { - REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); - REQUIRE(ta.get_time()[i] == grid[3996u + i]); - } + for (auto i = 0u; i < 4u; ++i) { + REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); + REQUIRE(ta.get_time()[i] == grid[3996u + i]); + } - for (auto i = 0u; i < 1000u; ++i) { - for (auto j = 0u; j < 4u; ++j) { - REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.)); - REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.)); + for (auto i = 0u; i < 1000u; ++i) { + for (auto j = 0u; j < 4u; ++j) { + REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.)); + REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.)); + } } - } - // Random testing. - ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; - std::fill(grid.begin(), grid.begin() + 4, 0.); - std::uniform_real_distribution rdist(0., .1); - for (auto i = 1u; i < 1000u; ++i) { - for (auto j = 0u; j < 4u; ++j) { - grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng); + // Random testing. 
+ ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; + std::fill(grid.begin(), grid.begin() + 4, 0.); + std::uniform_real_distribution rdist(0., .1); + for (auto i = 1u; i < 1000u; ++i) { + for (auto j = 0u; j < 4u; ++j) { + grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng); + } } - } - std::tie(cb, ret) = ta.propagate_grid(grid); + std::tie(cb, ret) = ta.propagate_grid(grid); - REQUIRE(!cb); - REQUIRE(ret.size() == 8000ull); + REQUIRE(!cb); + REQUIRE(ret.size() == 8000ull); - for (auto i = 0u; i < 4u; ++i) { - REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); - REQUIRE(ta.get_time()[i] == grid[3996u + i]); - } + for (auto i = 0u; i < 4u; ++i) { + REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); + REQUIRE(ta.get_time()[i] == grid[3996u + i]); + } - for (auto i = 0u; i < 1000u; ++i) { - for (auto j = 0u; j < 4u; ++j) { - REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 400000.)); - REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 400000.)); + for (auto i = 0u; i < 1000u; ++i) { + for (auto j = 0u; j < 4u; ++j) { + REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 400000.)); + REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 400000.)); + } } - } - // Do it backwards too. - ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; - std::fill(grid.begin(), grid.begin() + 4, 0.); - rdist = std::uniform_real_distribution(-.1, 0.); - for (auto i = 1u; i < 1000u; ++i) { - for (auto j = 0u; j < 4u; ++j) { - grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng); + // Do it backwards too. 
+ ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u}; + std::fill(grid.begin(), grid.begin() + 4, 0.); + rdist = std::uniform_real_distribution(-.1, 0.); + for (auto i = 1u; i < 1000u; ++i) { + for (auto j = 0u; j < 4u; ++j) { + grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng); + } } - } - std::tie(cb, ret) = ta.propagate_grid(grid); + std::tie(cb, ret) = ta.propagate_grid(grid); - REQUIRE(!cb); - REQUIRE(ret.size() == 8000ull); + REQUIRE(!cb); + REQUIRE(ret.size() == 8000ull); - for (auto i = 0u; i < 4u; ++i) { - REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); - REQUIRE(ta.get_time()[i] == grid[3996u + i]); - } + for (auto i = 0u; i < 4u; ++i) { + REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit); + REQUIRE(ta.get_time()[i] == grid[3996u + i]); + } - for (auto i = 0u; i < 1000u; ++i) { - for (auto j = 0u; j < 4u; ++j) { - REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 800000.)); - REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 800000.)); + for (auto i = 0u; i < 1000u; ++i) { + for (auto j = 0u; j < 4u; ++j) { + REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 800000.)); + REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 800000.)); + } } - } - // Test the callback is moved. - ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4}; - step_callback_batch f_cb_grid(cb_functor_grid{}); - value_ptr(f_cb_grid)->n_copies_after = value_ptr(f_cb_grid)->n_copies; - auto [out_cb, _] = ta.propagate_grid({0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, - kw::callback = std::move(f_cb_grid)); - // Invoke again the callback to ensure no copies have been made. 
- out_cb(ta); - REQUIRE(value_isa(out_cb)); - - // Do the same test with the range overload, moving in the callbacks initially stored - // in a range. This will check that the logic that converts the input range into - // a step callback does proper forwarding. - std::vector cf_vec = {cb_functor_grid{}, cb_functor_grid{}}; - cf_vec[0].n_copies_after = cf_vec[0].n_copies; - cf_vec[1].n_copies_after = cf_vec[1].n_copies; - std::tie(out_cb, _) = ta.propagate_grid( - {100., 100., 100., 100., 101., 101., 101., 101., 102., 102., 102., 102.}, - kw::callback - = cf_vec | std::views::transform([](cb_functor_grid &c) -> cb_functor_grid && { return std::move(c); })); - out_cb(ta); - REQUIRE(value_isa>(out_cb)); - REQUIRE(value_isa(value_ref>(out_cb)[0])); - - // Callback attempts to change the time coordinate. - ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4}; - REQUIRE_THROWS_MATCHES( - ta.propagate_grid( - {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, kw::callback = - [](auto &tint) { - tint.set_time(-100.); + // Test the callback is moved. + ta = taylor_adaptive_batch{ + {prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4}; + step_callback_batch f_cb_grid(cb_functor_grid{}); + value_ptr(f_cb_grid)->n_copies_after = value_ptr(f_cb_grid)->n_copies; + auto [out_cb, _] = ta.propagate_grid({0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, + kw::callback = std::move(f_cb_grid)); + // Invoke again the callback to ensure no copies have been made. + out_cb(ta); + REQUIRE(value_isa(out_cb)); - return true; - }), - std::runtime_error, - Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the " - "time coordinate of the integrator - this is not supported")); + // Do the same test with the range overload, moving in the callbacks initially stored + // in a range. 
This will check that the logic that converts the input range into + // a step callback does proper forwarding. + std::vector cf_vec = {cb_functor_grid{}, cb_functor_grid{}}; + cf_vec[0].n_copies_after = cf_vec[0].n_copies; + cf_vec[1].n_copies_after = cf_vec[1].n_copies; + std::tie(out_cb, _) = ta.propagate_grid( + {100., 100., 100., 100., 101., 101., 101., 101., 102., 102., 102., 102.}, + kw::callback + = cf_vec | std::views::transform([](cb_functor_grid &c) -> cb_functor_grid && { return std::move(c); })); + out_cb(ta); + REQUIRE(value_isa>(out_cb)); + REQUIRE(value_isa(value_ref>(out_cb)[0])); - // Try also with a single time coord. - ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4}; - REQUIRE_THROWS_MATCHES( - ta.propagate_grid( - {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, - kw::callback = - [](auto &tint) { - tint.set_time({tint.get_time()[0], -100., tint.get_time()[2], tint.get_time()[3]}); - - return true; - }), - std::runtime_error, - Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the " - "time coordinate of the integrator - this is not supported")); + // Callback attempts to change the time coordinate. + ta = taylor_adaptive_batch{ + {prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4}; + REQUIRE_THROWS_MATCHES( + ta.propagate_grid( + {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, kw::callback = + [](auto &tint) { + tint.set_time(-100.); + + return true; + }), + std::runtime_error, + Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the " + "time coordinate of the integrator - this is not supported")); + + // Try also with a single time coord. 
+ ta = taylor_adaptive_batch{ + {prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4}; + REQUIRE_THROWS_MATCHES( + ta.propagate_grid( + {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, + kw::callback = + [](auto &tint) { + tint.set_time({tint.get_time()[0], -100., tint.get_time()[2], tint.get_time()[3]}); + + return true; + }), + std::runtime_error, + Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the " + "time coordinate of the integrator - this is not supported")); + } } // A test to make sure the propagate functions deal correctly @@ -518,196 +530,202 @@ TEST_CASE("propagate for_until") auto [x, v] = make_vars("x", "v"); - auto ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.06, 0.025, 0.026}, 2u}; - auto ta_copy = ta; + for (auto cm : {true, false}) { + auto ta = taylor_adaptive_batch{ + {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.06, 0.025, 0.026}, 2u, kw::compact_mode = cm}; + auto ta_copy = ta; - // Error modes. - REQUIRE_THROWS_MATCHES(ta.propagate_until({0., std::numeric_limits::infinity()}), std::invalid_argument, - Message("A non-finite time was passed to the propagate_until() function of an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES( - ta.propagate_until({10., 11.}, kw::max_delta_t = std::vector{1}), std::invalid_argument, - Message("Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, " + // Error modes. 
+ REQUIRE_THROWS_MATCHES(ta.propagate_until({0., std::numeric_limits::infinity()}), std::invalid_argument, + Message("A non-finite time was passed to the propagate_until() function of an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES( + ta.propagate_until({10., 11.}, kw::max_delta_t = std::vector{1}), std::invalid_argument, + Message( + "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, " "but the number of specified timesteps is 1")); - REQUIRE_THROWS_MATCHES( - ta.propagate_until({10., 11.}, kw::max_delta_t = {1., 2., 3.}), std::invalid_argument, - Message("Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, " + REQUIRE_THROWS_MATCHES( + ta.propagate_until({10., 11.}, kw::max_delta_t = {1., 2., 3.}), std::invalid_argument, + Message( + "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, " "but the number of specified timesteps is 3")); - REQUIRE_THROWS_MATCHES( - ta.propagate_until({10., 11.}, kw::max_delta_t = {1., std::numeric_limits::quiet_NaN()}), - std::invalid_argument, - Message("A nan max_delta_t was passed to the propagate_until() function of an adaptive " - "Taylor integrator in batch mode")); - REQUIRE_THROWS_MATCHES(ta.propagate_until({10., 11.}, kw::max_delta_t = {1., -1.}), std::invalid_argument, - Message("A non-positive max_delta_t was passed to the propagate_until() function of an " - "adaptive Taylor integrator in batch mode")); - - ta.set_time({0., std::numeric_limits::lowest()}); - - REQUIRE_THROWS_MATCHES( - ta.propagate_until({10., std::numeric_limits::max()}, kw::max_delta_t = std::vector{}), - std::invalid_argument, - Message("The final time passed to the propagate_until() function of an adaptive Taylor " - "integrator in batch mode results in an overflow condition")); - - ta.set_time({0., 0.}); - - // Propagate forward in time limiting the timestep size and 
passing in a callback. - auto counter0 = 0ul, counter1 = counter0; - - auto cb = [&counter0, &counter1](taylor_adaptive_batch &t) { - if (t.get_last_h()[0] != 0) { - ++counter0; - } - if (t.get_last_h()[1] != 0) { - ++counter1; - } - - return true; - }; - - ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb); - ta_copy.propagate_until({10., 11.}); - - REQUIRE(ta.get_time() == std::vector{10., 11.}); - REQUIRE(counter0 == 100000ul); - REQUIRE(counter1 == 220000ul); - REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); - - REQUIRE(ta_copy.get_time() == std::vector{10., 11.}); - REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); - - REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); - REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); - REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); - REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); - - // Scalar input time. - auto ta_copy2 = ta, ta_copy3 = ta; - ta_copy2.propagate_until(20.); - ta_copy3.propagate_until({20., 20.}); - REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); - - // Try also with max_delta_t. - ta_copy2.propagate_until(30., kw::max_delta_t = std::vector{1e-4, 5e-5}); - ta_copy3.propagate_until({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5}); - REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); - - // Do propagate_for() too. - ta.propagate_for({10., 11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb); - ta_copy.propagate_for({10., 11.}); - - // Scalar input time. - ta_copy2.propagate_for(20.); - ta_copy3.propagate_for({20., 20.}); - REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); - - // Try also with max_delta_t. 
- ta_copy2.propagate_for(30., kw::max_delta_t = std::vector{1e-4, 5e-5}); - ta_copy3.propagate_for({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5}); - REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); - - REQUIRE(ta.get_time() == std::vector{20., 22.}); - REQUIRE(counter0 == 200000ul); - REQUIRE(counter1 == 440000ul); - REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); - - REQUIRE(ta_copy.get_time() == std::vector{20., 22.}); - REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); - - REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); - REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); - REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); - REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + REQUIRE_THROWS_MATCHES( + ta.propagate_until({10., 11.}, kw::max_delta_t = {1., std::numeric_limits::quiet_NaN()}), + std::invalid_argument, + Message("A nan max_delta_t was passed to the propagate_until() function of an adaptive " + "Taylor integrator in batch mode")); + REQUIRE_THROWS_MATCHES(ta.propagate_until({10., 11.}, kw::max_delta_t = {1., -1.}), std::invalid_argument, + Message("A non-positive max_delta_t was passed to the propagate_until() function of an " + "adaptive Taylor integrator in batch mode")); - // Do backwards in time too. 
- ta.propagate_for({-10., -11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb); - ta_copy.propagate_for({-10., -11.}); + ta.set_time({0., std::numeric_limits::lowest()}); - REQUIRE(ta.get_time() == std::vector{10., 11.}); - REQUIRE(counter0 == 300000ul); - REQUIRE(counter1 == 660000ul); - REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + REQUIRE_THROWS_MATCHES( + ta.propagate_until({10., std::numeric_limits::max()}, kw::max_delta_t = std::vector{}), + std::invalid_argument, + Message("The final time passed to the propagate_until() function of an adaptive Taylor " + "integrator in batch mode results in an overflow condition")); - REQUIRE(ta_copy.get_time() == std::vector{10., 11.}); - REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + ta.set_time({0., 0.}); - REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); - REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); - REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); - REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + // Propagate forward in time limiting the timestep size and passing in a callback. 
+ auto counter0 = 0ul, counter1 = counter0; - ta.propagate_until({0., 0.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb); - ta_copy.propagate_until({0., 0.}); + auto cb = [&counter0, &counter1](taylor_adaptive_batch &t) { + if (t.get_last_h()[0] != 0) { + ++counter0; + } + if (t.get_last_h()[1] != 0) { + ++counter1; + } - REQUIRE(ta.get_time() == std::vector{0., 0.}); - REQUIRE(counter0 == 400000ul); - REQUIRE(counter1 == 880000ul); - REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + return true; + }; - REQUIRE(ta_copy.get_time() == std::vector{0., 0.}); - REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), - [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb); + ta_copy.propagate_until({10., 11.}); + + REQUIRE(ta.get_time() == std::vector{10., 11.}); + REQUIRE(counter0 == 100000ul); + REQUIRE(counter1 == 220000ul); + REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta_copy.get_time() == std::vector{10., 11.}); + REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); + REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); + REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); + REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + + // Scalar input time. 
+ auto ta_copy2 = ta, ta_copy3 = ta; + ta_copy2.propagate_until(20.); + ta_copy3.propagate_until({20., 20.}); + REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); + + // Try also with max_delta_t. + ta_copy2.propagate_until(30., kw::max_delta_t = std::vector{1e-4, 5e-5}); + ta_copy3.propagate_until({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5}); + REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); + + // Do propagate_for() too. + ta.propagate_for({10., 11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb); + ta_copy.propagate_for({10., 11.}); + + // Scalar input time. + ta_copy2.propagate_for(20.); + ta_copy3.propagate_for({20., 20.}); + REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); + + // Try also with max_delta_t. + ta_copy2.propagate_for(30., kw::max_delta_t = std::vector{1e-4, 5e-5}); + ta_copy3.propagate_for({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5}); + REQUIRE(ta_copy2.get_state() == ta_copy3.get_state()); + + REQUIRE(ta.get_time() == std::vector{20., 22.}); + REQUIRE(counter0 == 200000ul); + REQUIRE(counter1 == 440000ul); + REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta_copy.get_time() == std::vector{20., 22.}); + REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); + REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); + REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); + REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + + // Do backwards in time too. 
+ ta.propagate_for({-10., -11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb); + ta_copy.propagate_for({-10., -11.}); + + REQUIRE(ta.get_time() == std::vector{10., 11.}); + REQUIRE(counter0 == 300000ul); + REQUIRE(counter1 == 660000ul); + REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta_copy.get_time() == std::vector{10., 11.}); + REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); + REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); + REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); + REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + + ta.propagate_until({0., 0.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb); + ta_copy.propagate_until({0., 0.}); + + REQUIRE(ta.get_time() == std::vector{0., 0.}); + REQUIRE(counter0 == 400000ul); + REQUIRE(counter1 == 880000ul); + REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta_copy.get_time() == std::vector{0., 0.}); + REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(), + [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; })); + + REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); + REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); + REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); + REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + + // Try with scalar max_delta_t. 
+ ta_copy = ta; + ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 1e-4}); + ta_copy.propagate_until({10., 11.}, kw::max_delta_t = 1e-4); + REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res()); - REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.)); - REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.)); - REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.)); - REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.)); + ta.propagate_for({10., 11.}, kw::max_delta_t = {1e-4, 1e-4}); + ta_copy.propagate_for({10., 11.}, kw::max_delta_t = 1e-4); + REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res()); - // Try with scalar max_delta_t. - ta_copy = ta; - ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 1e-4}); - ta_copy.propagate_until({10., 11.}, kw::max_delta_t = 1e-4); - REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res()); - - ta.propagate_for({10., 11.}, kw::max_delta_t = {1e-4, 1e-4}); - ta_copy.propagate_for({10., 11.}, kw::max_delta_t = 1e-4); - REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res()); - - // Test the callback is moved. - step_callback_batch f_cb_until(cb_functor_until{}); - value_ptr(f_cb_until)->n_copies_after = value_ptr(f_cb_until)->n_copies; - auto [_, out_cb] = ta.propagate_until(20., kw::callback = std::move(f_cb_until)); - // Invoke again the callback to ensure no copies have been made. - out_cb(ta); - - step_callback_batch f_cb_for(cb_functor_for{}); - value_ptr(f_cb_for)->n_copies_after = value_ptr(f_cb_for)->n_copies; - std::tie(_, out_cb) = ta.propagate_for(10., kw::callback = std::move(f_cb_for)); - out_cb(ta); - REQUIRE(value_isa(out_cb)); - - // Do the same test with the range overload, moving in the callbacks initially stored - // in a range. This will check that the logic that converts the input range into - // a step callback does proper forwarding. 
- { - std::vector cf_vec = {cb_functor_for{}, cb_functor_for{}}; - cf_vec[0].n_copies_after = cf_vec[0].n_copies; - cf_vec[1].n_copies_after = cf_vec[1].n_copies; - std::tie(_, out_cb) = ta.propagate_for( - 10., kw::callback - = cf_vec | std::views::transform([](cb_functor_for &c) -> cb_functor_for && { return std::move(c); })); + // Test the callback is moved. + step_callback_batch f_cb_until(cb_functor_until{}); + value_ptr(f_cb_until)->n_copies_after = value_ptr(f_cb_until)->n_copies; + auto [_, out_cb] = ta.propagate_until(20., kw::callback = std::move(f_cb_until)); + // Invoke again the callback to ensure no copies have been made. out_cb(ta); - REQUIRE(value_isa>(out_cb)); - } - { - std::vector cf_vec = {cb_functor_until{}, cb_functor_until{}}; - cf_vec[0].n_copies_after = cf_vec[0].n_copies; - cf_vec[1].n_copies_after = cf_vec[1].n_copies; - std::tie(_, out_cb) = ta.propagate_until( - 50., kw::callback = cf_vec | std::views::transform([](cb_functor_until &c) -> cb_functor_until && { - return std::move(c); - })); + step_callback_batch f_cb_for(cb_functor_for{}); + value_ptr(f_cb_for)->n_copies_after = value_ptr(f_cb_for)->n_copies; + std::tie(_, out_cb) = ta.propagate_for(10., kw::callback = std::move(f_cb_for)); out_cb(ta); - REQUIRE(value_isa>(out_cb)); + REQUIRE(value_isa(out_cb)); + + // Do the same test with the range overload, moving in the callbacks initially stored + // in a range. This will check that the logic that converts the input range into + // a step callback does proper forwarding. 
+ { + std::vector cf_vec = {cb_functor_for{}, cb_functor_for{}}; + cf_vec[0].n_copies_after = cf_vec[0].n_copies; + cf_vec[1].n_copies_after = cf_vec[1].n_copies; + std::tie(_, out_cb) = ta.propagate_for( + 10., kw::callback = cf_vec | std::views::transform([](cb_functor_for &c) -> cb_functor_for && { + return std::move(c); + })); + out_cb(ta); + REQUIRE(value_isa>(out_cb)); + } + + { + std::vector cf_vec = {cb_functor_until{}, cb_functor_until{}}; + cf_vec[0].n_copies_after = cf_vec[0].n_copies; + cf_vec[1].n_copies_after = cf_vec[1].n_copies; + std::tie(_, out_cb) = ta.propagate_until( + 50., kw::callback = cf_vec | std::views::transform([](cb_functor_until &c) -> cb_functor_until && { + return std::move(c); + })); + out_cb(ta); + REQUIRE(value_isa>(out_cb)); + } } } From dc64f66ec06697211161dd747175f9cb0eee9029 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 15:08:02 +0200 Subject: [PATCH 15/30] Fix CMake warning when building the benchmarks. --- benchmark/CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8f3af7bc2..c01f48243 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,6 +1,16 @@ +# NOTE: we look for Boost in CONFIG mode first, as that has become the official supported way +# of locating Boost in recent Boost/CMake versions. If we fail, we try again in +# MODULE mode as last resort. # NOTE: don't find a specific version as we already checked # outside that the Boost version is appropriate. 
-find_package(Boost REQUIRED COMPONENTS program_options) +find_package(Boost QUIET COMPONENTS program_options CONFIG) +if(NOT ${Boost_FOUND}) + message(STATUS "Boost not found in CONFIG mode, retrying in MODULE mode.") + find_package(Boost QUIET MODULE COMPONENTS program_options) +endif() +if(NOT ${Boost_FOUND}) + message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.") +endif() if(NOT TARGET Boost::program_options) message(STATUS "The 'Boost::program_options' imported target is missing, creating it.") add_library(Boost::program_options UNKNOWN IMPORTED) From 9a2827f72c58cd216b1eb67ed6ce2fcf8f95f345 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Wed, 28 Aug 2024 15:17:41 +0200 Subject: [PATCH 16/30] Fix n body creation benchmark. --- benchmark/n_body_creation.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmark/n_body_creation.cpp b/benchmark/n_body_creation.cpp index 5da4e508c..9b166f0b0 100644 --- a/benchmark/n_body_creation.cpp +++ b/benchmark/n_body_creation.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -56,7 +57,13 @@ int main(int argc, char *argv[]) std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start) .count()); - std::cout << ta.get_llvm_state().get_ir() << '\n'; + if (compact_mode) { + for (auto ir : std::get<1>(ta.get_llvm_state()).get_ir()) { + std::cout << ir << '\n'; + } + } else { + std::cout << std::get<0>(ta.get_llvm_state()).get_ir() << '\n'; + } auto counter = 0u; for (const auto &ex : ta.get_decomposition()) { From 70a93bd0e1ad6f02901789fb7d29606b61e0fbce Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 09:29:20 +0200 Subject: [PATCH 17/30] Doc tweaks. 
--- doc/install.rst | 2 +- doc/tut_extended_precision.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/install.rst b/doc/install.rst index 7beed7bf4..cad4aeac9 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -70,7 +70,7 @@ heyoka (and all its dependencies) have been compiled with a compiler supporting 128-bit precision ^^^^^^^^^^^^^^^^^ -On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit ARM), +On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit Linux ARM), quadruple-precision integrations are always supported via ``long double``. Otherwise, on platforms such as x86-64, quadruple-precision computations are supported if: diff --git a/doc/tut_extended_precision.rst b/doc/tut_extended_precision.rst index 00e0cb77c..a01d9acef 100644 --- a/doc/tut_extended_precision.rst +++ b/doc/tut_extended_precision.rst @@ -11,7 +11,7 @@ not only in single and double precision, but also in extended precision. Specifi How these extended precision floating-point types can be accessed and used from C++ varies depending on the platform. The 80-bit extended-precision format is available as the C++ ``long double`` type on most platforms based on Intel x86 processors. Quadruple-precision -computations are supported either via the ``long double`` type (e.g., on 64-bit ARM processors) or via the the :cpp:class:`mppp::real128` type +computations are supported either via the ``long double`` type (e.g., on 64-bit Linux ARM) or via the :cpp:class:`mppp::real128` type (provided that the platform supports the nonstandard ``__float128`` floating-point type and that heyoka was compiled with support for the mp++ library - see the :ref:`installation instructions `). 
From 49509ce22d94624c6f4342c50aaa3bccc0123d2a Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 10:05:53 +0200 Subject: [PATCH 18/30] Add the option of specifying whether or not to enable parallel JIT compilation to the integrators and to cfunc. --- include/heyoka/expression.hpp | 20 +++++++++-- include/heyoka/kw.hpp | 3 +- include/heyoka/llvm_state.hpp | 3 +- include/heyoka/taylor.hpp | 27 +++++++++++---- src/cfunc_class.cpp | 10 +++--- src/expression_cfunc.cpp | 11 +++--- src/taylor_adaptive.cpp | 4 +-- src/taylor_adaptive_batch.cpp | 4 +-- test/cfunc.cpp | 15 ++++----- test/make_multi_cfunc.cpp | 61 +++++++++++++++++++--------------- test/taylor_adaptive.cpp | 10 ++++-- test/taylor_adaptive_batch.cpp | 16 ++++++--- 12 files changed, 118 insertions(+), 66 deletions(-) diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp index a5c099c12..72b01683b 100644 --- a/include/heyoka/expression.hpp +++ b/include/heyoka/expression.hpp @@ -699,7 +699,7 @@ auto cfunc_common_opts(const KwArgs &...kw_args) template std::tuple, std::vector>> make_multi_cfunc(llvm_state, const std::string &, const std::vector &, const std::vector &, - std::uint32_t, bool, bool, long long); + std::uint32_t, bool, bool, long long, bool); } // namespace detail @@ -818,13 +818,27 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS cfunc } }(); + // Parallel JIT compilation. + auto parjit = [&p]() -> bool { + if constexpr (p.has(kw::parjit)) { + if constexpr (std::integral>) { + return static_cast(p(kw::parjit)); + } else { + static_assert(detail::always_false_v, "Invalid type for the 'parjit' keyword argument."); + } + } else { + return detail::default_parjit; + } + }(); + // Build the template llvm_state from the keyword arguments. 
llvm_state s(kw_args...); - return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec); + return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec, + parjit); } explicit cfunc(std::vector, std::vector, - std::tuple, llvm_state, bool>); + std::tuple, llvm_state, bool, bool>); HEYOKA_DLL_LOCAL void check_valid(const char *) const; diff --git a/include/heyoka/kw.hpp b/include/heyoka/kw.hpp index 8c8276e21..b7c8c163c 100644 --- a/include/heyoka/kw.hpp +++ b/include/heyoka/kw.hpp @@ -21,7 +21,7 @@ HEYOKA_BEGIN_NAMESPACE namespace kw { -// llvm_state. +// llvm_state/llvm_multi_state. IGOR_MAKE_NAMED_ARGUMENT(mname); IGOR_MAKE_NAMED_ARGUMENT(opt_level); IGOR_MAKE_NAMED_ARGUMENT(fast_math); @@ -34,6 +34,7 @@ IGOR_MAKE_NAMED_ARGUMENT(fast_math); IGOR_MAKE_NAMED_ARGUMENT(force_avx512); IGOR_MAKE_NAMED_ARGUMENT(slp_vectorize); IGOR_MAKE_NAMED_ARGUMENT(code_model); +IGOR_MAKE_NAMED_ARGUMENT(parjit); // cfunc API. 
IGOR_MAKE_NAMED_ARGUMENT(batch_size); diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp index 086e0c631..759f624e4 100644 --- a/include/heyoka/llvm_state.hpp +++ b/include/heyoka/llvm_state.hpp @@ -388,7 +388,8 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state template requires std::ranges::input_range && std::same_as>> - explicit llvm_multi_state(R &&rng) : llvm_multi_state(std::vector(std::ranges::begin(rng), std::ranges::end(rng))) + explicit llvm_multi_state(R &&rng, bool parjit = detail::default_parjit) + : llvm_multi_state(std::vector(std::ranges::begin(rng), std::ranges::end(rng)), parjit) { } llvm_multi_state(const llvm_multi_state &); diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp index be7d7f77c..fd7f65003 100644 --- a/include/heyoka/taylor.hpp +++ b/include/heyoka/taylor.hpp @@ -235,7 +235,20 @@ auto taylor_adaptive_common_ops(const KwArgs &...kw_args) } }(); - return std::tuple{high_accuracy, std::move(tol), compact_mode, std::move(pars), parallel_mode}; + // Parallel JIT compilation. + auto parjit = [&p]() -> bool { + if constexpr (p.has(kw::parjit)) { + if constexpr (std::integral>) { + return static_cast(p(kw::parjit)); + } else { + static_assert(always_false_v, "Invalid type for the 'parjit' keyword argument."); + } + } else { + return default_parjit; + } + }(); + + return std::tuple{high_accuracy, std::move(tol), compact_mode, std::move(pars), parallel_mode, parjit}; } // Small helper to construct a default value for the max_delta_t @@ -443,7 +456,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada // Private implementation-detail constructor machinery. 
using sys_t = std::variant>, var_ode_sys>; void finalise_ctor_impl(sys_t, std::vector, std::optional, std::optional, bool, bool, std::vector, - std::vector, std::vector, bool, std::optional); + std::vector, std::vector, bool, std::optional, bool); template void finalise_ctor(sys_t sys, std::vector state, const KwArgs &...kw_args) { @@ -463,7 +476,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada } }(); - auto [high_accuracy, tol, compact_mode, pars, parallel_mode] + auto [high_accuracy, tol, compact_mode, pars, parallel_mode, parjit] = detail::taylor_adaptive_common_ops(kw_args...); // Extract the terminal events, if any. @@ -499,7 +512,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada finalise_ctor_impl(std::move(sys), std::move(state), std::move(tm), std::move(tol), high_accuracy, compact_mode, std::move(pars), std::move(tes), std::move(ntes), parallel_mode, - std::move(prec)); + std::move(prec), parjit); } } @@ -853,7 +866,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch // Private implementation-detail constructor machinery. using sys_t = std::variant>, var_ode_sys>; void finalise_ctor_impl(sys_t, std::vector, std::uint32_t, std::vector, std::optional, bool, bool, - std::vector, std::vector, std::vector, bool); + std::vector, std::vector, std::vector, bool, bool); template void finalise_ctor(sys_t sys, std::vector state, std::uint32_t batch_size, const KwArgs &...kw_args) { @@ -875,7 +888,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch } }(); - auto [high_accuracy, tol, compact_mode, pars, parallel_mode] + auto [high_accuracy, tol, compact_mode, pars, parallel_mode, parjit] = detail::taylor_adaptive_common_ops(kw_args...); // Extract the terminal events, if any. 
@@ -898,7 +911,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch finalise_ctor_impl(std::move(sys), std::move(state), batch_size, std::move(tm), std::move(tol), high_accuracy, compact_mode, std::move(pars), std::move(tes), std::move(ntes), - parallel_mode); + parallel_mode, parjit); } } diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index 93882b47c..9bcc50b2d 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -182,7 +182,7 @@ struct cfunc::impl { // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) explicit impl(std::vector fn, std::vector vars, llvm_state s, std::optional batch_size, bool high_accuracy, bool compact_mode, bool parallel_mode, - long long prec, bool check_prec) + long long prec, bool check_prec, bool parjit) : m_fn(std::move(fn)), m_vars(std::move(vars)), m_states(std::array{s, s, s}), m_prec(prec), m_check_prec(check_prec), m_high_accuracy(high_accuracy), m_compact_mode(compact_mode), m_parallel_mode(parallel_mode) @@ -207,7 +207,7 @@ struct cfunc::impl { if (compact_mode) { // Build the multi cfunc, and assign the internal members. std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc( - std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec); + std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec, parjit); // Compile. std::get<1>(m_states).compile(); @@ -308,15 +308,15 @@ template // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) cfunc::cfunc(std::vector fn, std::vector vars, // NOLINTNEXTLINE(performance-unnecessary-value-param) - std::tuple, llvm_state, bool> tup) + std::tuple, llvm_state, bool, bool> tup) { // Unpack the tuple. - auto &[high_accuracy, compact_mode, parallel_mode, prec, batch_size, s, check_prec] = tup; + auto &[high_accuracy, compact_mode, parallel_mode, prec, batch_size, s, check_prec, parjit] = tup; // Construct the impl. 
// NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) m_impl = std::make_unique(std::move(fn), std::move(vars), std::move(s), batch_size, high_accuracy, - compact_mode, parallel_mode, prec, check_prec); + compact_mode, parallel_mode, prec, check_prec, parjit); } template diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 5eeb93538..940586166 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -2035,7 +2035,7 @@ std::array add_multi_cfunc_impl(llvm::Type *fp_t, std::list, std::vector>> make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::string &name, const std::vector &fn, const std::vector &vars, std::uint32_t batch_size, - bool high_accuracy, bool parallel_mode) + bool high_accuracy, bool parallel_mode, bool parjit) { if (batch_size == 0u) [[unlikely]] { throw std::invalid_argument("The batch size of a compiled function cannot be zero"); @@ -2264,7 +2264,8 @@ make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::strin // // https://en.cppreference.com/w/cpp/ranges/as_rvalue_view return std::make_tuple( - llvm_multi_state(states_lists[0] | std::views::transform([](auto &s) -> auto && { return std::move(s); })), + llvm_multi_state(states_lists[0] | std::views::transform([](auto &s) -> auto && { return std::move(s); }), + parjit), std::move(dc), std::move(tape_size_align)); } @@ -2293,7 +2294,7 @@ template std::tuple, std::vector>> make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector &fn, const std::vector &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode, - long long prec) + long long prec, bool parjit) { #if defined(HEYOKA_ARCH_PPC) if constexpr (std::is_same_v) { @@ -2320,7 +2321,7 @@ make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector(tplt, prec); - return make_multi_cfunc_impl(fp_t, tplt, name, fn, vars, batch_size, high_accuracy, parallel_mode); + return make_multi_cfunc_impl(fp_t, tplt, name, fn, 
vars, batch_size, high_accuracy, parallel_mode, parjit); } // Explicit instantiations. @@ -2328,7 +2329,7 @@ make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector, std::vector>> \ make_multi_cfunc(llvm_state, const std::string &, const std::vector &, \ - const std::vector &, std::uint32_t, bool, bool, long long); + const std::vector &, std::uint32_t, bool, bool, long long, bool); HEYOKA_MAKE_MULTI_CFUNC_INST(float) HEYOKA_MAKE_MULTI_CFUNC_INST(double) diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp index 5416a9278..d6513d550 100644 --- a/src/taylor_adaptive.cpp +++ b/src/taylor_adaptive.cpp @@ -174,7 +174,7 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, std::optional time, std::optional tol, bool high_accuracy, bool compact_mode, std::vector pars, std::vector tes, std::vector ntes, bool parallel_mode, - [[maybe_unused]] std::optional prec) + [[maybe_unused]] std::optional prec, bool parjit) { HEYOKA_TAYLOR_REF_FROM_I_DATA(m_state); HEYOKA_TAYLOR_REF_FROM_I_DATA(m_pars); @@ -461,7 +461,7 @@ void taylor_adaptive::finalise_ctor_impl(sys_t vsys, std::vector state, std::ranges::reverse(states); // Create the multi state and assign it. - m_llvm_state = llvm_multi_state(std::move(states)); + m_llvm_state = llvm_multi_state(std::move(states), parjit); // Compile. std::get<1>(m_llvm_state).compile(); diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp index 1c9c6a042..b29660e53 100644 --- a/src/taylor_adaptive_batch.cpp +++ b/src/taylor_adaptive_batch.cpp @@ -76,7 +76,7 @@ template void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector state, std::uint32_t batch_size, std::vector time, std::optional tol, bool high_accuracy, bool compact_mode, std::vector pars, std::vector tes, - std::vector ntes, bool parallel_mode) + std::vector ntes, bool parallel_mode, bool parjit) { // NOTE: this must hold because tol == 0 is interpreted // as undefined in finalise_ctor(). 
@@ -309,7 +309,7 @@ void taylor_adaptive_batch::finalise_ctor_impl(sys_t vsys, std::vector sta std::ranges::reverse(states); // Create the multi state and assign it. - m_llvm_state = llvm_multi_state(std::move(states)); + m_llvm_state = llvm_multi_state(std::move(states), parjit); // Compile. std::get<1>(m_llvm_state).compile(); diff --git a/test/cfunc.cpp b/test/cfunc.cpp index 4bd0ff645..8db7096a8 100644 --- a/test/cfunc.cpp +++ b/test/cfunc.cpp @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -135,13 +136,9 @@ TEST_CASE("basic") REQUIRE(cf0.get_fn() == std::vector{x + y, x - y}); REQUIRE(cf0.get_vars() == std::vector{y, x}); REQUIRE(!cf0.get_dc().empty()); - if (cf0.get_compact_mode()) { - REQUIRE(std::get<1>(cf0.get_llvm_states()).get_opt_level() == 3u); - } else { - REQUIRE(std::get<0>(cf0.get_llvm_states())[0].get_opt_level() == 3u); - REQUIRE(std::get<0>(cf0.get_llvm_states())[1].get_opt_level() == 3u); - REQUIRE(std::get<0>(cf0.get_llvm_states())[2].get_opt_level() == 3u); - } + REQUIRE(std::get<0>(cf0.get_llvm_states())[0].get_opt_level() == 3u); + REQUIRE(std::get<0>(cf0.get_llvm_states())[1].get_opt_level() == 3u); + REQUIRE(std::get<0>(cf0.get_llvm_states())[2].get_opt_level() == 3u); REQUIRE(cf0.get_high_accuracy() == false); REQUIRE(cf0.get_compact_mode() == false); REQUIRE(cf0.get_parallel_mode() == true); @@ -165,13 +162,15 @@ TEST_CASE("basic") kw::batch_size = custom_batch_size, kw::opt_level = opt_level, kw::high_accuracy = high_accuracy, - kw::compact_mode = compact_mode}; + kw::compact_mode = compact_mode, + kw::parjit = detail::default_parjit}; REQUIRE(cf0.get_fn() == std::vector{x + y, x - y}); REQUIRE(cf0.get_vars() == std::vector{y, x}); REQUIRE(!cf0.get_dc().empty()); if (cf0.get_compact_mode()) { REQUIRE(std::get<1>(cf0.get_llvm_states()).get_opt_level() == opt_level); + REQUIRE(std::get<1>(cf0.get_llvm_states()).get_parjit() == detail::default_parjit); } else { 
REQUIRE(std::get<0>(cf0.get_llvm_states())[0].get_opt_level() == opt_level); REQUIRE(std::get<0>(cf0.get_llvm_states())[1].get_opt_level() == opt_level); diff --git a/test/make_multi_cfunc.cpp b/test/make_multi_cfunc.cpp index 46c8cfc1d..53b0ff0d9 100644 --- a/test/make_multi_cfunc.cpp +++ b/test/make_multi_cfunc.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -86,7 +87,8 @@ TEST_CASE("basic") llvm_state tplt{kw::opt_level = opt_level}; auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", {x + y + heyoka::time, x - y - par[0]}, - {x, y}, 1, false, false, 0); + {x, y}, 1, false, false, 0, detail::default_parjit); + REQUIRE(ms.get_parjit() == detail::default_parjit); REQUIRE(sa.size() == 1u); @@ -135,7 +137,7 @@ TEST_CASE("basic") llvm_state tplt{kw::opt_level = opt_level}; auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", {x + y + heyoka::time, x - y - par[0]}, - {x, y}, 2, false, false, 0); + {x, y}, 2, false, false, 0, detail::default_parjit); REQUIRE(sa.size() == 2u); @@ -213,8 +215,9 @@ TEST_CASE("sgp4") llvm_state tplt; - auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", model::sgp4(), - std::vector(inputs.begin(), inputs.end()), 1, false, false, 0); + auto [ms, dc, sa] + = detail::make_multi_cfunc(tplt, "test", model::sgp4(), std::vector(inputs.begin(), inputs.end()), 1, + false, false, 0, detail::default_parjit); REQUIRE(sa.size() == 1u); @@ -281,7 +284,8 @@ TEST_CASE("nbody") std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; }); std::ranges::sort(vars, std::less{}); - auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, batch_size, false, false, 0); + auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, batch_size, false, false, 0, + detail::default_parjit); ms.compile(); @@ -402,7 +406,8 @@ TEST_CASE("nbody mp") std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; }); 
std::ranges::sort(vars, std::less{}); - auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, 1, false, false, prec); + auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, 1, false, false, prec, + detail::default_parjit); ms.compile(); @@ -507,7 +512,8 @@ TEST_CASE("nbody par") std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; }); std::ranges::sort(vars, std::less{}); - auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, batch_size, false, false, 0); + auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, batch_size, false, false, 0, + detail::default_parjit); ms.compile(); @@ -701,7 +707,8 @@ TEST_CASE("nbody par mp") std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; }); std::ranges::sort(vars, std::less{}); - auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, 1, false, false, prec); + auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", exs, vars, 1, false, false, prec, + detail::default_parjit); ms.compile(); @@ -858,8 +865,8 @@ TEST_CASE("numparams") std::generate(pars.begin(), pars.end(), gen); - auto [ms, dc, sa] - = detail::make_multi_cfunc(tplt, "test", {1_dbl, par[0]}, {}, batch_size, false, false, 0); + auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", {1_dbl, par[0]}, {}, batch_size, false, + false, 0, detail::default_parjit); REQUIRE(((batch_size == 1u && sa.size() == 1u) || (batch_size > 1u && sa.size() == 2u))); @@ -936,7 +943,7 @@ TEST_CASE("numparams mp") std::generate(outs.begin(), outs.end(), gen); auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, "test", {1_dbl, par[0], par[1], -2_dbl}, {}, 1, - false, false, prec); + false, false, prec, detail::default_parjit); ms.compile(); @@ -1006,8 +1013,9 @@ TEST_CASE("bogus stride") std::generate(ins.begin(), ins.end(), gen); std::generate(pars.begin(), pars.end(), gen); - auto [ms, dc, sa] = detail::make_multi_cfunc(tplt, 
"test", {x + 2_dbl * y + par[0] * z, par[1] - x * y}, - {x, y, z}, batch_size, false, false, 0); + auto [ms, dc, sa] + = detail::make_multi_cfunc(tplt, "test", {x + 2_dbl * y + par[0] * z, par[1] - x * y}, {x, y, z}, + batch_size, false, false, 0, detail::default_parjit); ms.compile(); @@ -1047,13 +1055,13 @@ TEST_CASE("failure modes") { using Catch::Matchers::Message; - REQUIRE_THROWS_MATCHES( - detail::make_multi_cfunc(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 0, false, false, 0), - std::invalid_argument, Message("The batch size of a compiled function cannot be zero")); + REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 0, false, false, + 0, detail::default_parjit), + std::invalid_argument, Message("The batch size of a compiled function cannot be zero")); - REQUIRE_THROWS_MATCHES( - detail::make_multi_cfunc(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, true, 0), - std::invalid_argument, Message("Parallel mode has not been implemented yet")); + REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, true, + 0, detail::default_parjit), + std::invalid_argument, Message("Parallel mode has not been implemented yet")); #if defined(HEYOKA_ARCH_PPC) @@ -1065,16 +1073,17 @@ TEST_CASE("failure modes") #if defined(HEYOKA_HAVE_REAL) - REQUIRE_THROWS_MATCHES( - detail::make_multi_cfunc(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, false, 0), - std::invalid_argument, - Message(fmt::format("An invalid precision value of 0 was passed to make_multi_cfunc() (the " - "value must be in the [{}, {}] range)", - mppp::real_prec_min(), mppp::real_prec_max()))); + REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, + false, 0, detail::default_parjit), + std::invalid_argument, + Message(fmt::format("An invalid precision value of 0 was passed to make_multi_cfunc() (the " + "value must be in the [{}, {}] range)", + 
mppp::real_prec_min(), mppp::real_prec_max()))); #endif - REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc(llvm_state{}, "", {1_dbl, par[0]}, {}, 1, false, false, 0), + REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc(llvm_state{}, "", {1_dbl, par[0]}, {}, 1, false, false, 0, + detail::default_parjit), std::invalid_argument, Message("A non-empty function name is required when invoking make_multi_cfunc()")); } diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp index ee77593f5..9aa195674 100644 --- a/test/taylor_adaptive.cpp +++ b/test/taylor_adaptive.cpp @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -1657,7 +1658,8 @@ void s11n_test_impl() kw::nt_events = {nt_event(v - par[0], s11n_nt_cb{})}, kw::pars = std::vector{-1e-4}, kw::high_accuracy = true, - kw::compact_mode = true}; + kw::compact_mode = true, + kw::parjit = detail::default_parjit}; REQUIRE(ta.get_tol() == std::numeric_limits::epsilon()); REQUIRE(ta.get_high_accuracy()); @@ -1703,6 +1705,7 @@ void s11n_test_impl() REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit()); REQUIRE(value_type_index(ta.get_t_events()[0].get_callback()) == value_type_index(ta_copy.get_t_events()[0].get_callback())); @@ -1827,7 +1830,8 @@ TEST_CASE("copy semantics") kw::pars = std::vector{-1e-4}, kw::high_accuracy = true, kw::compact_mode = true, - kw::tol = 1e-11}; + kw::tol = 1e-11, + kw::parjit = detail::default_parjit}; auto ta_copy = ta; @@ -1838,6 +1842,7 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); 
REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit()); ta.step(); ta_copy.step(); @@ -1861,6 +1866,7 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit()); ta.step(); ta_copy.step(); diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp index ec6cb755c..979e6823e 100644 --- a/test/taylor_adaptive_batch.cpp +++ b/test/taylor_adaptive_batch.cpp @@ -1059,9 +1059,13 @@ void s11n_test_impl() // Test without events. { - auto ta = taylor_adaptive_batch{ - {prime(x) = v, prime(v) = -9.8 * sin(x + par[0])}, {0., 0.01, 0.5, 0.51}, 2u, - kw::pars = std::vector{-1e-4, -1.1e-4}, kw::high_accuracy = true, kw::compact_mode = true}; + auto ta = taylor_adaptive_batch{{prime(x) = v, prime(v) = -9.8 * sin(x + par[0])}, + {0., 0.01, 0.5, 0.51}, + 2u, + kw::pars = std::vector{-1e-4, -1.1e-4}, + kw::high_accuracy = true, + kw::compact_mode = true, + kw::parjit = detail::default_parjit}; REQUIRE(ta.get_tol() == std::numeric_limits::epsilon()); REQUIRE(ta.get_high_accuracy()); @@ -1103,6 +1107,7 @@ void s11n_test_impl() REQUIRE(ta.get_d_output() == ta_copy.get_d_output()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit()); REQUIRE(ta.get_step_res() == ta_copy.get_step_res()); 
REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res()); @@ -1736,7 +1741,8 @@ TEST_CASE("copy semantics") kw::pars = std::vector{-1e-4, -1e-4}, kw::high_accuracy = true, kw::compact_mode = true, - kw::tol = 1e-11}; + kw::tol = 1e-11, + kw::parjit = detail::default_parjit}; auto ta_copy = ta; @@ -1747,6 +1753,7 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit()); ta.step(); ta_copy.step(); @@ -1770,6 +1777,7 @@ TEST_CASE("copy semantics") REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir()); REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc()); + REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit()); ta.step(); ta_copy.step(); From 209646f1f5fc5500252ca522beffbe8da5c70c0a Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 10:07:48 +0200 Subject: [PATCH 19/30] Add benchmark for the compilation time of SGP4 dynamics. 
--- benchmark/CMakeLists.txt | 1 + benchmark/sgp4_dynamics.cpp | 91 +++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 benchmark/sgp4_dynamics.cpp diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index c01f48243..ae9ab8a3f 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -83,6 +83,7 @@ ADD_HEYOKA_BENCHMARK(sims_flanagan_jac) ADD_HEYOKA_BENCHMARK(cfunc_mt) ADD_HEYOKA_BENCHMARK(diff_tensors) ADD_HEYOKA_BENCHMARK(var_construction) +ADD_HEYOKA_BENCHMARK(sgp4_dynamics) if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR) ADD_HEYOKA_BENCHMARK(pendulum_mp) diff --git a/benchmark/sgp4_dynamics.cpp b/benchmark/sgp4_dynamics.cpp new file mode 100644 index 000000000..2bb8bd782 --- /dev/null +++ b/benchmark/sgp4_dynamics.cpp @@ -0,0 +1,91 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace heyoka; + +std::vector> construct_sgp4_ode() +{ + // Fetch sgp4's formulae. + auto sgp4_func = heyoka::model::sgp4(); + + // The variable representing tsince in the sgp4 formulae. + const auto tsince = heyoka::expression("tsince"); + + // In sgp4_func, replace the TLE data with params, and tsince + // with tsince + par[7]. + sgp4_func = heyoka::subs(sgp4_func, {{"n0", heyoka::par[0]}, + {"e0", heyoka::par[1]}, + {"i0", heyoka::par[2]}, + {"node0", heyoka::par[3]}, + {"omega0", heyoka::par[4]}, + {"m0", heyoka::par[5]}, + {"bstar", heyoka::par[6]}, + {"tsince", tsince + heyoka::par[7]}}); + + // Compute the rhs of the sgp4 ODE, substituting tsince with the time placeholder. 
+ const auto dt = heyoka::diff_tensors(sgp4_func, {tsince}); + auto sgp4_rhs = heyoka::subs(dt.get_jacobian(), {{tsince, heyoka::time}}); + + // Create the state variables for the ODE. + auto [x, y, z, vx, vy, vz, e, r] = heyoka::make_vars("x", "y", "z", "vx", "vy", "vz", "e", "r"); + + // Add the differential equation for r. + // NOTE: do **not** use vx/vy/vz here. Apparently, in the SGP4 algorithm, if one takes the + // time derivatives of x/y/z one does not get *exactly* the same values as the vx/vy/vz returned + // by SGP4. In order for the differential equation for r to be correct, we need the true time + // derivatives of x/y/z, and we cannot use what SGP4 says are the velocities. + sgp4_rhs.push_back(heyoka::sum({x * sgp4_rhs[0], y * sgp4_rhs[1], z * sgp4_rhs[2]}) / r); + + // Return the ODE sys. + using heyoka::prime; + return {prime(x) = sgp4_rhs[0], prime(y) = sgp4_rhs[1], prime(z) = sgp4_rhs[2], prime(vx) = sgp4_rhs[3], + prime(vy) = sgp4_rhs[4], prime(vz) = sgp4_rhs[5], prime(e) = sgp4_rhs[6], prime(r) = sgp4_rhs[7]}; +} + +int main(int argc, char *argv[]) +{ + set_logger_level_trace(); + + namespace po = boost::program_options; + + bool parjit = false; + + po::options_description desc("Options"); + + desc.add_options()("help", "produce help message")("parjit", "parallel JIT compilation"); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return 0; + } + + if (vm.count("parjit")) { + parjit = true; + } + + taylor_adaptive ta{construct_sgp4_ode(), std::vector(8u), kw::high_accuracy = true, + kw::compact_mode = true, kw::parjit = parjit}; +} From c26be3b9ad90a0e656094c363e27e2933859ebbe Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 10:14:01 +0200 Subject: [PATCH 20/30] Minor tweak to config.hpp.in. 
--- config.hpp.in | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config.hpp.in b/config.hpp.in index 19f41f3ca..e5fcff6a1 100644 --- a/config.hpp.in +++ b/config.hpp.in @@ -53,13 +53,17 @@ // NOTE: handy Boost library for this since 1.73: // https://www.boost.org/doc/libs/1_73_0/libs/predef/doc/index.html +// +// NOTE: it makes sense to handle only the GCC/MSVC macros here +// (on the assumption that clang is identical to GCC in this respect). +// No point in using macros provided by compilers we do not test on. #if defined(_ARCH_PPC) || defined(_M_PPC) #define HEYOKA_ARCH_PPC #endif -#if defined(__arm__) || defined(_M_ARM) || defined(__arm) || defined(__aarch64__) +#if defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT) || defined(__aarch64__) || defined(_M_ARM64) #define HEYOKA_ARCH_ARM From cbdfc6a6ca96d43823732c73b5a221f53e54e351 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 10:18:58 +0200 Subject: [PATCH 21/30] Remove now-unused definition. 
--- include/heyoka/llvm_state.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp index 759f624e4..4ee929cbf 100644 --- a/include/heyoka/llvm_state.hpp +++ b/include/heyoka/llvm_state.hpp @@ -351,9 +351,9 @@ void llvm_state_mem_cache_try_insert(std::vector, unsigned, llvm_mc // There is evidence of an LLVM thread scheduling bug when parallel compilation // is active, that rarely results in multiply-defined symbols for external C // functions, which leads to compilation failure. So far, we have been able to -// trigger this issue only on Linux aarch64. +// trigger this issue only on 64-bit arm. inline constexpr bool default_parjit = -#if defined(HEYOKA_ARCH_ARM) && defined(__linux__) +#if defined(HEYOKA_ARCH_ARM) false #else true From 3ec07e771c908e7a0d0039177d0d2c603c46b09c Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 14:04:47 +0200 Subject: [PATCH 23/30] Small tweak to avoid potential unused variable warnings. --- src/taylor_adaptive.cpp | 2 +- src/taylor_adaptive_batch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp index d6513d550..0707e69a9 100644 --- a/src/taylor_adaptive.cpp +++ b/src/taylor_adaptive.cpp @@ -65,7 +65,7 @@ // NOTE: this is a helper macro to reduce typing when accessing the // data members of i_data. // NOLINTNEXTLINE(bugprone-macro-parentheses) -#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) auto &name = m_i_data->name +#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) [[maybe_unused]] auto &name = m_i_data->name HEYOKA_BEGIN_NAMESPACE diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp index b29660e53..6bff47ca1 100644 --- a/src/taylor_adaptive_batch.cpp +++ b/src/taylor_adaptive_batch.cpp @@ -62,7 +62,7 @@ // NOTE: this is a helper macro to reduce typing when accessing the // data members of i_data. 
// NOLINTNEXTLINE(bugprone-macro-parentheses) -#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) auto &name = m_i_data->name +#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) [[maybe_unused]] auto &name = m_i_data->name HEYOKA_BEGIN_NAMESPACE From cbdd39ff6c7785441596cd1ffded28323dbad4da Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 14:23:48 +0200 Subject: [PATCH 24/30] Factor out the helper to compute the cost of a floating-point scalar operation. --- CMakeLists.txt | 1 + include/heyoka/detail/type_traits.hpp | 3 + src/cfunc_class.cpp | 48 +----------- src/detail/type_traits.cpp | 108 ++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 47 deletions(-) create mode 100644 src/detail/type_traits.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8211fb370..6cfdd07bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,6 +193,7 @@ set(HEYOKA_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/type_traits.cpp" # NOTE: this will be an empty file in case we are not # building with support for real. "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp" diff --git a/include/heyoka/detail/type_traits.hpp b/include/heyoka/detail/type_traits.hpp index 04e16c1a2..db1126e25 100644 --- a/include/heyoka/detail/type_traits.hpp +++ b/include/heyoka/detail/type_traits.hpp @@ -139,6 +139,9 @@ inline constexpr bool is_x86_fp80 = is_ieee754_binaryN(); template inline constexpr bool is_ieee754_binary128 = is_ieee754_binaryN(); +template +double get_fp_unit_cost(); + } // namespace detail HEYOKA_END_NAMESPACE diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index 9bcc50b2d..02b33b63e 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -945,53 +945,7 @@ void cfunc::multi_eval(out_2d outputs, in_2d inputs, std::optional par // - the batch size. 
// Cost of a scalar fp operation. - constexpr auto fp_unit_cost = []() -> double { - if constexpr (std::same_as || std::same_as) { - // float and double. - return 1; - } else if constexpr (std::same_as) { - // long double. - if constexpr (detail::is_ieee754_binary64) { - return 1; - } else if constexpr (detail::is_x86_fp80) { - return 5; - } else if constexpr (detail::is_ieee754_binary128) { -#if defined(HEYOKA_ARCH_PPC) - return 10; -#else - return 100; -#endif - } else { -#if defined(HEYOKA_ARCH_PPC) - // Double-double implementation. - return 5; -#else - static_assert(detail::always_false_v, "Unknown fp cost model."); -#endif - } - } -#if defined(HEYOKA_HAVE_REAL128) - else if constexpr (std::same_as) { -#if defined(HEYOKA_ARCH_PPC) - return 10; -#else - return 100; -#endif - } -#endif -#if defined(HEYOKA_HAVE_REAL) - else if constexpr (std::same_as) { - // NOTE: this should be improved to take into account - // the selected precision. - // NOTE: for reference, mppp::real with 113 bits of precision - // is slightly slower than software-implemented quadmath. - return 1000; - } -#endif - else { - static_assert(detail::always_false_v, "Unknown fp cost model."); - } - }(); + const auto fp_unit_cost = detail::get_fp_unit_cost(); // Total number of fp operations: number of elementary subexpressions in the // decomposition * ncols. diff --git a/src/detail/type_traits.cpp b/src/detail/type_traits.cpp new file mode 100644 index 000000000..c2b8f9e8c --- /dev/null +++ b/src/detail/type_traits.cpp @@ -0,0 +1,108 @@ +// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com) +// +// This file is part of the heyoka library. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#include + +#include + +#if defined(HEYOKA_HAVE_REAL128) + +#include + +#endif + +#if defined(HEYOKA_HAVE_REAL) + +#include + +#endif + +#include + +HEYOKA_BEGIN_NAMESPACE + +namespace detail +{ + +// A function to compute a rough estimate of the cost of performing +// an elementary operation (e.g., addition/multiplication) on a scalar +// floating-point value of type T. +// +// The cost is calibrated to be 1 for single/double precision values, +// so that the unit of measure for the cost is a (very rough) approximation +// of clock cycles. +template +double get_fp_unit_cost() +{ + if constexpr (std::same_as || std::same_as) { + // float and double. + return 1; + } else if constexpr (std::same_as) { + // long double. + if constexpr (is_ieee754_binary64) { + return 1; + } else if constexpr (is_x86_fp80) { + return 5; + } else if constexpr (is_ieee754_binary128) { +#if defined(HEYOKA_ARCH_PPC) + return 10; +#else + return 100; +#endif + } else { +#if defined(HEYOKA_ARCH_PPC) + // Double-double implementation. + return 5; +#else + static_assert(always_false_v, "Unknown fp cost model for long double."); +#endif + } + } +#if defined(HEYOKA_HAVE_REAL128) + else if constexpr (std::same_as) { +#if defined(HEYOKA_ARCH_PPC) + return 10; +#else + return 100; +#endif + } +#endif +#if defined(HEYOKA_HAVE_REAL) + else if constexpr (std::same_as) { + // NOTE: this should be improved to take into account + // the selected precision. + // NOTE: for reference, mppp::real with 113 bits of precision + // is slightly slower than software-implemented quadmath. + return 1000; + } +#endif + else { + static_assert(always_false_v, "Unknown fp cost model for an unsupported floating-point type."); + } +} + +// Explicit instantiations. 
+template double get_fp_unit_cost(); +template double get_fp_unit_cost(); +template double get_fp_unit_cost(); + +#if defined(HEYOKA_HAVE_REAL128) + +template double get_fp_unit_cost(); + +#endif + +#if defined(HEYOKA_HAVE_REAL) + +template double get_fp_unit_cost(); + +#endif + +} // namespace detail + +HEYOKA_END_NAMESPACE From 0cddb720048539086b5882d3bc3615555457581e Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Thu, 29 Aug 2024 14:29:38 +0200 Subject: [PATCH 25/30] Minor. --- src/detail/type_traits.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/detail/type_traits.cpp b/src/detail/type_traits.cpp index c2b8f9e8c..925646567 100644 --- a/src/detail/type_traits.cpp +++ b/src/detail/type_traits.cpp @@ -35,7 +35,7 @@ namespace detail // // The cost is calibrated to be 1 for single/double precision values, // so that the unit of measure for the cost is a (very rough) approximation -// of clock cycles. +// of a clock cycle. template double get_fp_unit_cost() { From 1e061978ce4582461578f99de7975748447d4621 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 30 Aug 2024 10:53:20 +0200 Subject: [PATCH 26/30] Internal doc additions. --- src/cfunc_class.cpp | 6 ++++++ src/taylor_02.cpp | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp index 02b33b63e..c7edd99a2 100644 --- a/src/cfunc_class.cpp +++ b/src/cfunc_class.cpp @@ -943,6 +943,12 @@ void cfunc::multi_eval(out_2d outputs, in_2d inputs, std::optional par // - the value of ncols, // - the floating-point type in use, // - the batch size. + // + // Note that this cost model is very rough and does not take into account, + // for instance, that different elementary operations may have very different + // costs (e.g., a trig function vs a simple add). Perhaps we can re-evaluate this + // in the future and maybe just remove it and parallelise regardless to simplify + // the logic. // Cost of a scalar fp operation. 
const auto fp_unit_cost = detail::get_fp_unit_cost(); diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 4c311d726..4aae9d02e 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -953,6 +953,15 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint // Generate the code for the computation of the Taylor derivatives. if (parallel_mode) { + // NOTE: in principle here we could implement a cost model to decide at runtime + // whether or not it is worth it to run the parallel implementation depending + // on the current Taylor order. The cost model for the computation of the Taylor + // derivatives is quite simple (as all AD formulae basically boild down to + // sums of products), apart from order 0 where we may have operations with + // wildly different costs (e.g., a cos() vs a simple addition). We made an attempt + // at implementing such a cost model at one point, but there were no benefits + // (even a small slowdown) in the large N-body problem used as a test case. + // Thus, for now, let us keep things simple. taylor_cm_codegen_segment_diff_parallel(s, fp_vec_type, seg_map, n_uvars); } else { taylor_cm_codegen_segment_diff_sequential(s, fp_vec_type, seg_map, n_uvars); From a11e32baac8abd344f436d4af5f0f01aaf6336b6 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 30 Aug 2024 11:18:24 +0200 Subject: [PATCH 27/30] Update the known issues page. --- doc/known_issues.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/known_issues.rst b/doc/known_issues.rst index 599f17127..495739430 100644 --- a/doc/known_issues.rst +++ b/doc/known_issues.rst @@ -18,6 +18,14 @@ Unsolved The root cause is most likely a code-generation/optimisation problem in LLVM. This issue is currently under investigation. +* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled + by default on 64-bit ARM processors (this includes the Apple M1 and its successors). 
+ The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities + that very rarely results in a multiply-defined symbol, which ultimately leads to compilation + failure. The issue is currently under investigation by the LLVM developers. In the + meantime, you can explicitly turn on parallel compilation via the ``kw::parjit`` + :ref:`keyword argument ` when constructing an integrator or a compiled + function. Solved ====== From a631bf6b6d4f8be345701fba2049d18a4059c4e4 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 30 Aug 2024 11:32:48 +0200 Subject: [PATCH 28/30] Minor. --- src/taylor_02.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 4aae9d02e..5b993c1ed 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -956,7 +956,7 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint // NOTE: in principle here we could implement a cost model to decide at runtime // whether or not it is worth it to run the parallel implementation depending // on the current Taylor order. The cost model for the computation of the Taylor - // derivatives is quite simple (as all AD formulae basically boild down to + // derivatives is quite simple (as all AD formulae basically boil down to // sums of products), apart from order 0 where we may have operations with // wildly different costs (e.g., a cos() vs a simple addition). We made an attempt // at implementing such a cost model at one point, but there were no benefits From a92e8807a56c6663119cf36ab07993a5326d8ca7 Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Fri, 30 Aug 2024 17:01:07 +0200 Subject: [PATCH 29/30] Update changelog. 
--- doc/changelog.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/changelog.rst b/doc/changelog.rst index 98ec5911f..c1af36f72 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -7,6 +7,11 @@ Changelog New ~~~ +- Implement parallel compilation for Taylor integrators + and compiled functions + (`#446 `__, + `#444 `__, + `#441 `__). - Add the possibility of specifying the LLVM code model used for JIT compilation (`#440 `__). From 57189c7b0841582faaba253fb57bbe3f684faedb Mon Sep 17 00:00:00 2001 From: Francesco Biscani Date: Sat, 31 Aug 2024 09:12:40 +0200 Subject: [PATCH 30/30] Internal doc bit. --- src/expression_cfunc.cpp | 7 +++++++ src/taylor_02.cpp | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 940586166..20b128001 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -1704,6 +1704,13 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list // Limit of codegenned blocks per state. // NOTE: this has not been really properly tuned, // needs more investigation. + // NOTE: it would probably be better here to keep track of the + // total number of function calls per segment, rather than + // the number of blocks. The reason for this is that each + // function call in principle increases the size of the + // auxiliary global arrays used by the compact mode + // argument generators, which in turn increases the code + // generation time. constexpr auto max_n_cg_blocks = 20u; // Variable to keep track of the u variable diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index 5b993c1ed..af54aae39 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -1049,6 +1049,13 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T // Limit of codegenned blocks per state. // NOTE: this has not been really properly tuned, // needs more investigation. 
+ // NOTE: it would probably be better here to keep track of the + // total number of function calls per segment, rather than + // the number of blocks. The reason for this is that each + // function call in principle increases the size of the + // auxiliary global arrays used by the compact mode + // argument generators, which in turn increases the code + // generation time. constexpr auto max_n_cg_blocks = 20u; // Variable to keep track of the index of the first u variable