From b91b2407404e57f156d818c032325b7525b76983 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 22 Aug 2024 14:58:13 +0200
Subject: [PATCH 01/30] Minor.

---
 src/detail/llvm_helpers.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index ba766450e..c5a4afc2c 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -3365,7 +3365,7 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t)
 // function cannot be called concurrently from multiple threads on the same tp object,
 // or even on different tp objects defined in the same context.
 // NOTE: this handles only floating-point (vector) types at this time, extending
-// to intgeral types should be fairly easy.
+// to integral types should be fairly easy.
 // NOTE: perhaps this function could be made more generic for arbitrary struct types
 // by (recursively) reading the struct layout and then reproducing it in the target
 // context. Like this, we could avoid special casing for the mppp::real types.

From 3e71b6e1d1ddf2c968364f72d160f20f750e88e6 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sun, 25 Aug 2024 15:05:45 +0200
Subject: [PATCH 02/30] Some very preliminary, non-functional code.

---
 include/heyoka/detail/i_data.hpp |  21 +-
 include/heyoka/taylor.hpp        |   6 +-
 src/detail/i_data.cpp            |  57 ++-
 src/taylor_00.cpp                |  41 +-
 src/taylor_02.cpp                | 776 ++++++++++++++-----------------
 src/taylor_adaptive.cpp          |  19 +-
 6 files changed, 467 insertions(+), 453 deletions(-)

diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp
index 3f56994d4..205e0a943 100644
--- a/include/heyoka/detail/i_data.hpp
+++ b/include/heyoka/detail/i_data.hpp
@@ -15,12 +15,15 @@
 
 #endif
 
+#include <array>
+#include <cstddef>
 #include <cstdint>
 #include <optional>
 #include <variant>
 #include <vector>
 
 #include <heyoka/config.hpp>
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/dfloat.hpp>
 #include <heyoka/detail/fwd_decl.hpp>
 #include <heyoka/llvm_state.hpp>
@@ -64,8 +67,8 @@ struct taylor_adaptive<T>::i_data {
     std::vector<T> m_state;
     // Time.
     detail::dfloat<T> m_time;
-    // The LLVM machinery.
-    llvm_state m_llvm;
+    // The LLVM (multi)state.
+    std::variant<llvm_state, llvm_multi_state> m_llvm_state;
     // Dimension of the system.
     std::uint32_t m_dim{};
     // Taylor decomposition.
@@ -78,10 +81,18 @@ struct taylor_adaptive<T>::i_data {
     bool m_high_accuracy{};
     // Compact mode.
     bool m_compact_mode{};
-    // The steppers.
+    // The stepper types (non-compact mode).
     using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept;
     using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept;
-    std::variant<step_f_t, step_f_e_t> m_step_f;
+    // The stepper types (compact mode). These have an additional argument - the tape pointer.
+    using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept;
+    using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept;
+    // The stepper.
+    std::variant<step_f_t, step_f_e_t, c_step_f_t, c_step_f_e_t> m_step_f;
+    // Size/alignment for the compact mode tape.
+    std::array<std::size_t, 2> m_tape_sa{};
+    // Compact mode tape.
+    detail::aligned_buffer_t m_tape;
     // The vector of parameters.
     std::vector<T> m_pars;
     // The vector for the Taylor coefficients.
@@ -118,6 +129,8 @@ struct taylor_adaptive<T>::i_data {
     i_data &operator=(i_data &&) noexcept = delete;
 
     ~i_data();
+
+    void init_cm_tape();
 };
 
 template <typename T>
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index 5282f6885..46bcc7258 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -109,9 +109,9 @@ taylor_dc_t taylor_add_adaptive_step(llvm_state &, llvm::Type *, llvm::Type *, c
 llvm::Value *taylor_c_make_sv_funcs_arr(llvm_state &, const std::vector<std::uint32_t> &);
 
 std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>>
-taylor_compute_jet(llvm_state &, llvm::Type *, llvm::Value *, llvm::Value *, llvm::Value *, const taylor_dc_t &,
-                   const std::vector<std::uint32_t> &, std::uint32_t, std::uint32_t, std::uint32_t, std::uint32_t, bool,
-                   bool, bool);
+taylor_compute_jet(llvm_state &, llvm::Type *, llvm::Value *, llvm::Value *, llvm::Value *, llvm::Value *,
+                   const taylor_dc_t &, const std::vector<std::uint32_t> &, std::uint32_t, std::uint32_t, std::uint32_t,
+                   std::uint32_t, bool, bool, bool);
 
 std::pair<std::string, std::vector<llvm::Type *>>
 taylor_c_diff_func_name_args(llvm::LLVMContext &, llvm::Type *, const std::string &, std::uint32_t, std::uint32_t,
diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp
index 51f15f257..572ca7655 100644
--- a/src/detail/i_data.cpp
+++ b/src/detail/i_data.cpp
@@ -8,9 +8,13 @@
 
 #include <heyoka/config.hpp>
 
+#include <cassert>
 #include <tuple>
 #include <type_traits>
 #include <utility>
+#include <variant>
+
+#include <boost/serialization/array.hpp>
 
 #if defined(HEYOKA_HAVE_REAL128)
 
@@ -24,6 +28,7 @@
 
 #endif
 
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/i_data.hpp>
 #include <heyoka/detail/optional_s11n.hpp>
 #include <heyoka/detail/variant_s11n.hpp>
@@ -89,18 +94,38 @@ void serialize(Archive &ar, std::tuple<heyoka::taylor_outcome, Args...> &tup, un
 
 HEYOKA_BEGIN_NAMESPACE
 
+// Helper to initialise the compact-mode tape. Assumes an empty tape.
+template <typename T>
+void taylor_adaptive<T>::i_data::init_cm_tape()
+{
+    assert(!m_tape);
+
+    const auto [sz, al] = m_tape_sa;
+
+    if (m_compact_mode) {
+        assert(sz != 0u);
+        assert(al != 0u);
+
+        m_tape = detail::make_aligned_buffer(sz, al);
+    } else {
+        assert(sz == 0u);
+        assert(al == 0u);
+    }
+}
+
 template <typename T>
 void taylor_adaptive<T>::i_data::save(boost::archive::binary_oarchive &ar, unsigned) const
 {
     ar << m_state;
     ar << m_time;
-    ar << m_llvm;
+    ar << m_llvm_state;
     ar << m_dim;
     ar << m_dc;
     ar << m_order;
     ar << m_tol;
     ar << m_high_accuracy;
     ar << m_compact_mode;
+    ar << m_tape_sa;
     ar << m_pars;
     ar << m_tc;
     ar << m_last_h;
@@ -114,13 +139,14 @@ void taylor_adaptive<T>::i_data::load(boost::archive::binary_iarchive &ar, unsig
 {
     ar >> m_state;
     ar >> m_time;
-    ar >> m_llvm;
+    ar >> m_llvm_state;
     ar >> m_dim;
     ar >> m_dc;
     ar >> m_order;
     ar >> m_tol;
     ar >> m_high_accuracy;
     ar >> m_compact_mode;
+    ar >> m_tape_sa;
     ar >> m_pars;
     ar >> m_tc;
     ar >> m_last_h;
@@ -129,22 +155,35 @@ void taylor_adaptive<T>::i_data::load(boost::archive::binary_iarchive &ar, unsig
     ar >> m_tm_data;
 
     // Recover the function pointers.
-    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
+    m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
+
+    // Reconstruct the compact mode tape, if necessary.
+    m_tape.reset();
+    init_cm_tape();
 }
 
+// NOTE: this ctor provides only partial initialisation of the data members.
+// The rest of the initialisation is performed from the integrator ctor.
+// NOTE: m_llvm_state is inited as a single llvm_state regardless of the use
+// of compact mode. It will be converted into a multi state if needed at a
+// later stage.
 template <typename T>
-taylor_adaptive<T>::i_data::i_data(llvm_state s) : m_llvm(std::move(s))
+taylor_adaptive<T>::i_data::i_data(llvm_state s) : m_llvm_state(std::move(s))
 {
 }
 
 template <typename T>
 taylor_adaptive<T>::i_data::i_data(const i_data &other)
-    : m_state(other.m_state), m_time(other.m_time), m_llvm(other.m_llvm), m_dim(other.m_dim), m_dc(other.m_dc),
-      m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy),
-      m_compact_mode(other.m_compact_mode), m_pars(other.m_pars), m_tc(other.m_tc), m_last_h(other.m_last_h),
-      m_d_out(other.m_d_out), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data)
+    : m_state(other.m_state), m_time(other.m_time), m_llvm_state(other.m_llvm_state), m_dim(other.m_dim),
+      m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy),
+      m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), m_pars(other.m_pars), m_tc(other.m_tc),
+      m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data)
 {
-    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
+    // Recover the function pointers.
+    m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
+
+    // Init the compact mode tape, if necessary.
+    init_cm_tape();
 }
 
 template <typename T>
diff --git a/src/taylor_00.cpp b/src/taylor_00.cpp
index 6861f7b8a..b3e322a57 100644
--- a/src/taylor_00.cpp
+++ b/src/taylor_00.cpp
@@ -9,7 +9,9 @@
 #include <heyoka/config.hpp>
 
 #include <algorithm>
+#include <array>
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <limits>
 #include <stdexcept>
@@ -689,14 +691,24 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_
 }
 
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-taylor_dc_t taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name,
-                                     const std::vector<std::pair<expression, expression>> &sys,
-                                     std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
-                                     bool parallel_mode, std::uint32_t order)
+std::tuple<taylor_dc_t, std::variant<llvm_state, std::vector<llvm_state>>, std::array<std::size_t, 2>>
+taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name,
+                         const std::vector<std::pair<expression, expression>> &sys, std::uint32_t batch_size,
+                         bool high_accuracy, bool compact_mode, bool parallel_mode, std::uint32_t order)
 {
-    assert(!s.is_compiled());
+    assert(!tplt.is_compiled());
     assert(batch_size > 0u);
 
+    // Setup the return state(s) and fetch the main state.
+    auto ret_states = [compact_mode, &tplt]() -> std::variant<llvm_state, std::vector<llvm_state>> {
+        if (compact_mode) {
+            return std::vector{tplt.make_similar()};
+        } else {
+            return tplt.make_similar();
+        }
+    }();
+    auto &s = compact_mode ? std::get<1>(ret_states)[0] : std::get<0>(ret_states);
+
     // Record the number of equations/variables.
     const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());
 
@@ -715,14 +727,17 @@ taylor_dc_t taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm::
     auto &md = s.module();
 
     // Prepare the function prototype. The arguments are:
+    //
     // - pointer to the current state vector (read & write),
     // - pointer to the parameters (read only),
     // - pointer to the time value(s) (read only),
     // - pointer to the array of max timesteps (read & write),
-    // - pointer to the Taylor coefficients output (write only).
+    // - pointer to the Taylor coefficients output (write only),
+    // - pointer to the tape (read & write, compact mode only).
+    //
     // These pointers cannot overlap.
     auto *fp_vec_t = make_vector_type(fp_t, batch_size);
-    const std::vector<llvm::Type *> fargs(5, llvm::PointerType::getUnqual(ext_fp_t));
+    const std::vector<llvm::Type *> fargs(compact_mode ? 6 : 5, llvm::PointerType::getUnqual(context));
     // The function does not return anything.
     auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
     assert(ft != nullptr);
@@ -760,14 +775,22 @@ taylor_dc_t taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm::
     tc_ptr->addAttr(llvm::Attribute::NoAlias);
     tc_ptr->addAttr(llvm::Attribute::WriteOnly);
 
+    llvm::Argument *tape_ptr = nullptr;
+    if (compact_mode) {
+        tape_ptr = tc_ptr + 1;
+        tape_ptr->setName("tape_ptr");
+        tape_ptr->addAttr(llvm::Attribute::NoCapture);
+        tape_ptr->addAttr(llvm::Attribute::NoAlias);
+    }
+
     // Create a new basic block to start insertion into.
     auto *bb = llvm::BasicBlock::Create(context, "entry", f);
     assert(bb != nullptr);
     builder.SetInsertPoint(bb);
 
     // Compute the jet of derivatives at the given order.
-    auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, dc, {}, n_eq, n_uvars, order,
-                                           batch_size, compact_mode, high_accuracy, parallel_mode);
+    auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, tape_ptr, dc, {}, n_eq, n_uvars,
+                                           order, batch_size, compact_mode, high_accuracy, parallel_mode);
 
     // Determine the integration timestep.
     auto *h = taylor_determine_h(s, fp_t, diff_variant, sv_funcs_dc, nullptr, h_ptr, n_eq, n_uvars, order, batch_size,
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 130cdda90..0b824b892 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -11,18 +11,20 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <limits>
+#include <list>
 #include <map>
 #include <stdexcept>
-#include <string>
 #include <type_traits>
 #include <utility>
 #include <variant>
 #include <vector>
 
 #include <boost/numeric/conversion/cast.hpp>
+#include <boost/safe_numerics/safe_integer.hpp>
 
 #include <fmt/core.h>
 #include <fmt/ranges.h>
@@ -500,35 +502,233 @@ void taylor_c_compute_sv_diffs(llvm_state &s, llvm::Type *fp_t,
     });
 }
 
-// For each segment in s_dc, this function will return a dict mapping an LLVM function
-// f for the computation of a Taylor derivative to a size and a vector of std::functions. For example, one entry
-// in the return value will read something like:
-// {f : (2, [g_0, g_1, g_2])}
-// The meaning in this example is that the arity of f is 3 and it will be called with 2 different
-// sets of arguments. The g_i functions are expected to be called with input argument j in [0, 1]
-// to yield the value of the i-th function argument for f at the j-th invocation.
-auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vector<taylor_dc_t> &s_dc,
-                                // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                                std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t batch_size, bool high_accuracy)
+// Helper to perform the computation of the Taylor derivatives in compact mode across
+// multiple LLVM states.
+auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *main_par_ptr,
+                              llvm::Value *main_time_ptr, llvm::Value *main_tape_ptr, const taylor_dc_t &dc,
+                              const std::vector<taylor_dc_t> &s_dc, const std::vector<std::uint32_t> &sv_funcs_dc,
+                              std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
+                              bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx)
 {
-    // Log runtime in trace mode.
-    spdlog::stopwatch sw;
+    // TODO implement.
+    (void)parallel_mode;
 
-    // Init the return value.
-    // NOTE: use maps with name-based comparison for the functions. This ensures that the order in which these
-    // functions are invoked in taylor_compute_jet_compact_mode() is always the same. If we used directly pointer
+    // Generate the global arrays for the computation of the derivatives
+    // of the state variables in the main state.
+    const auto svd_gl = taylor_c_make_sv_diff_globals(main_state, main_fp_t, dc, n_uvars);
+
+    // Structure used to log, in trace mode, the breakdown of each segment.
+    // For each segment, this structure contains the number of invocations
+    // of each function in the segment. It will be unused if we are not tracing.
+    std::vector<std::vector<std::uint32_t>> segment_bd;
+
+    // Are we tracing?
+    const auto is_tracing = get_logger()->should_log(spdlog::level::trace);
+
+    // List of evaluation functions in a segment.
+    //
+    // This map contains a list of functions for the compact-mode evaluation of Taylor derivatives.
+    // Each function is mapped to a pair, containing:
+    //
+    // - the number of times the function is to be invoked,
+    // - a list of functors (generators) that generate the arguments for
+    //   the invocation.
+    //
+    // NOTE: we use maps with name-based comparison for the functions. This ensures that the order in which these
+    // functions are invoked is always the same. If we used directly pointer
     // comparisons instead, the order could vary across different executions and different platforms. The name
     // mangling we do when creating the function names should ensure that there are no possible name collisions.
-    std::vector<
-        std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
-                 llvm_func_name_compare>>
-        retval;
+    using seg_f_list_t
+        = std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
+                   llvm_func_name_compare>;
+
+    // Init the list of states.
+    // NOTE: we use lists here because it is convenient to have
+    // pointer/reference stability when iteratively constructing
+    // the set of states.
+    std::list<llvm_state> states;
+
+    // Push back a new state and use it as initial current state.
+    // NOTE: like this, we always end up creating at least one driver
+    // function and a state, even in the degenerate case of an empty decomposition,
+    // which is suboptimal performance-wise.
+    // I do not think however that it is worth it to complicate the code to avoid
+    // this corner-case pessimisation.
+    states.push_back(main_state.make_similar());
+    auto *cur_state = &states.back();
+
+    // Index of the state we are currently operating on.
+    boost::safe_numerics::safe<unsigned> cur_state_idx = 0;
+
+    // Helper to create and return the prototype of a driver function in the state s.
+    auto make_driver_proto = [](llvm_state &s, unsigned cur_idx) {
+        auto &builder = s.builder();
+        auto &md = s.module();
+        auto &ctx = s.context();
+
+        // The arguments to the driver are:
+        // - a pointer to the tape,
+        // - pointers to par and time,
+        // - the current diff order.
+        auto *ptr_tp = llvm::PointerType::getUnqual(ctx);
+        std::vector<llvm::Type *> fargs{ptr_tp, ptr_tp, ptr_tp, builder.getInt32Ty()};
+
+        // The driver does not return anything.
+        auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
+        assert(ft != nullptr); // LCOV_EXCL_LINE
+
+        // Now create the driver.
+        const auto cur_name = fmt::format("heyoka.cm_jet.driver_{}", cur_idx);
+        auto *f = llvm_func_create(ft, llvm::Function::ExternalLinkage, cur_name, &md);
+        // NOTE: the driver cannot call itself recursively.
+        f->addFnAttr(llvm::Attribute::NoRecurse);
+
+        // Add the arguments' attributes.
+        // NOTE: no aliasing is assumed between the pointer
+        // arguments.
+        auto *tape_arg = f->args().begin();
+        tape_arg->setName("tape_ptr");
+        tape_arg->addAttr(llvm::Attribute::NoCapture);
+        tape_arg->addAttr(llvm::Attribute::NoAlias);
+
+        auto *par_ptr_arg = tape_arg + 1;
+        par_ptr_arg->setName("par_ptr");
+        par_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+        par_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+        par_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
+
+        auto *time_ptr_arg = tape_arg + 2;
+        time_ptr_arg->setName("time_ptr");
+        time_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+        time_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+        time_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
+
+        return f;
+    };
+
+    // TODO doc fix.
+    // Helper to compute the Taylor derivatives for a block.
+    // func is the LLVM function for the computation of the Taylor derivative in the block,
+    // ncalls the number of times it must be called, gens the generators for the
+    // function arguments and cur_order the order of the derivative. s is the llvm state
+    // in which we are computing the derivatives.
+    auto block_diff = [n_uvars](llvm_state &s, llvm::Function *func, std::uint32_t ncalls, const auto &gens,
+                                llvm::Value *tape_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr,
+                                llvm::Value *cur_order, llvm::Type *fp_vec_type) {
+        // LCOV_EXCL_START
+        assert(ncalls > 0u);
+        assert(!gens.empty());
+        assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast<bool>(f); }));
+        // LCOV_EXCL_STOP
+
+        // Fetch the builder for the current state.
+        auto &bld = s.builder();
+
+        // We will be manually unrolling loops if ncalls is small enough.
+        // This seems to help with compilation times.
+        constexpr auto max_unroll_n = 5u;
+
+        if (ncalls > max_unroll_n) {
+            // Loop over the number of calls.
+            llvm_loop_u32(s, bld.getInt32(0), bld.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
+                // Create the u variable index from the first generator.
+                auto u_idx = gens[0](cur_call_idx);
+
+                // Initialise the vector of arguments with which func must be called. The following
+                // initial arguments are always present:
+                // - current Taylor order,
+                // - u index of the variable,
+                // - tape of derivatives,
+                // - pointer to the param values,
+                // - pointer to the time value(s).
+                std::vector<llvm::Value *> args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr};
+
+                // Create the other arguments via the generators.
+                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                    args.push_back(gens[i](cur_call_idx));
+                }
+
+                // Calculate the derivative and store the result.
+                taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args));
+            });
+        } else {
+            // The manually-unrolled version of the above.
+            for (std::uint32_t idx = 0; idx < ncalls; ++idx) {
+                auto *cur_call_idx = bld.getInt32(idx);
+                auto u_idx = gens[0](cur_call_idx);
+                std::vector<llvm::Value *> args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr};
+
+                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                    args.push_back(gens[i](cur_call_idx));
+                }
+
+                taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args));
+            }
+        }
+    };
+
+    // NOTE: unlike in compiled functions, we cannot at the same time
+    // declare and invoke the drivers from the main module as the invocation
+    // happens from within an LLVM loop. Thus, we first define the drivers
+    // in the states and add their declarations in the main state, and only
+    // at a later stage we perform the invocation of the drivers in the
+    // main state.
+
+    // Declarations of the drivers in the main state.
+    std::vector<llvm::Function *> main_driver_decls;
+    // Add the declaration for the first driver.
+    main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx));
+
+    // Add the driver declaration to the current state,
+    // and start insertion into the driver.
+    cur_state->builder().SetInsertPoint(
+        llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx)));
+
+    // Variable to keep track of how many blocks have been codegenned
+    // in the current state.
+    boost::safe_numerics::safe<unsigned> n_cg_blocks = 0;
+
+    // Limit of codegenned blocks per state.
+    // NOTE: this has not been really properly tuned,
+    // needs more investigation.
+    constexpr auto max_n_cg_blocks = 20u;
 
     // Variable to keep track of the u variable
     // on whose definition we are operating.
     auto cur_u_idx = n_eq;
+
+    // Iterate over the segments in s_dc.
     for (const auto &seg : s_dc) {
-        // This structure maps an LLVM function to sets of arguments
+        if (n_cg_blocks > max_n_cg_blocks) {
+            // We have codegenned enough blocks for this state. Create the return
+            // value for the current driver, and move to the next one.
+            cur_state->builder().CreateRetVoid();
+
+            // Create the new current state.
+            states.push_back(main_state.make_similar());
+            cur_state = &states.back();
+
+            // Reset/update the counters.
+            n_cg_blocks = 0;
+            ++cur_state_idx;
+
+            // Add the driver declaration to the main state.
+            main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx));
+
+            // Add the driver declaration to the current state,
+            // and start insertion into the driver.
+            cur_state->builder().SetInsertPoint(
+                llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx)));
+        }
+
+        // Fetch the internal fp type and its vector counterpart for the current state.
+        auto *fp_t = llvm_clone_type(*cur_state, main_fp_t);
+        auto *fp_vec_type = make_vector_type(fp_t, batch_size);
+
+        // Fetch the current builder.
+        auto &cur_builder = cur_state->builder();
+
+        // This structure maps a function to sets of arguments
         // with which the function is to be called. For instance, if function
         // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
         // will contain {f : [[a, b, c], [d, e, f]]}.
@@ -547,7 +747,7 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
 
         for (const auto &ex : seg) {
             // Get the function for the computation of the derivative.
-            auto *func = taylor_c_diff_func(s, fp_t, ex.first, n_uvars, batch_size, high_accuracy);
+            auto *func = taylor_c_diff_func(*cur_state, fp_t, ex.first, n_uvars, batch_size, high_accuracy);
 
             // Insert the function into tmp_map.
             const auto [it, is_new_func] = tmp_map.try_emplace(func);
@@ -610,9 +810,8 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
             }
         }
 
-        // Add a new entry in retval for the current segment.
-        retval.emplace_back();
-        auto &a_map = retval.back();
+        // Create the seg_f_list_t for the current segment.
+        seg_f_list_t seg_map;
 
         for (const auto &[func, vv] : tmp_map_transpose) {
             // NOTE: vv.size() is now the number of arguments. We know it cannot
@@ -622,7 +821,7 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
             assert(!vv.empty()); // LCOV_EXCL_LINE
 
             // Add the function.
-            const auto [it, ins_status] = a_map.try_emplace(func);
+            const auto [it, ins_status] = seg_map.try_emplace(func);
             assert(ins_status); // LCOV_EXCL_LINE
 
             // Set the number of calls for this function.
@@ -633,447 +832,180 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
             // Create the g functions for each argument.
             for (const auto &v : vv) {
                 it->second.second.push_back(std::visit(
-                    [&s, fp_t](const auto &x) {
+                    [cur_state, fp_t](const auto &x) {
                         using type = uncvref_t<decltype(x)>;
 
                         if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
-                            return cm_make_arg_gen_vidx(s, x);
+                            return cm_make_arg_gen_vidx(*cur_state, x);
                         } else {
-                            return cm_make_arg_gen_vc(s, fp_t, x);
+                            return cm_make_arg_gen_vc(*cur_state, fp_t, x);
                         }
                     },
                     v));
             }
         }
-    }
 
-    get_logger()->trace("Taylor build function maps runtime: {}", sw);
+        // Fetch the arguments from the driver prototype.
+        auto *driver_f = cur_builder.GetInsertBlock()->getParent();
+        auto *tape_ptr = driver_f->args().begin();
+        auto *par_ptr = driver_f->args().begin() + 1;
+        auto *time_ptr = driver_f->args().begin() + 2;
+        auto *cur_order = driver_f->args().begin() + 3;
 
-    // LCOV_EXCL_START
-    // Log a breakdown of the return value in trace mode.
-    if (get_logger()->should_log(spdlog::level::trace)) {
-        std::vector<std::vector<std::uint32_t>> fm_bd;
+        // Compute the derivatives for this segment.
+        for (const auto &[func, fpair] : seg_map) {
+            const auto &[ncalls, gens] = fpair;
+
+            block_diff(*cur_state, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type);
+        }
 
-        for (const auto &m : retval) {
-            fm_bd.emplace_back();
+        // Update the number of codegenned blocks.
+        n_cg_blocks += seg_map.size();
 
-            for (const auto &p : m) {
-                fm_bd.back().push_back(p.second.first);
+        // LCOV_EXCL_START
+        // Update segment_bd if needed.
+        if (is_tracing) {
+            segment_bd.emplace_back();
+
+            for (const auto &p : seg_map) {
+                segment_bd.back().push_back(p.second.first);
             }
         }
+        // LCOV_EXCL_STOP
+    }
 
-        get_logger()->trace("Taylor function maps breakdown: {}", fm_bd);
+    // We need one last return statement for the last added state.
+    cur_state->builder().CreateRetVoid();
+
+    // LCOV_EXCL_START
+    // Log segment_bd, if needed.
+    if (is_tracing) {
+        get_logger()->trace("Taylor function maps breakdown: {}", segment_bd);
     }
     // LCOV_EXCL_STOP
 
-    return retval;
+    // Back in the main state, we begin by invoking all the drivers with order zero.
+    // That is, we are computing the initial values of the u variables.
+    auto &main_bld = main_state.builder();
+    for (auto *cur_driver_f : main_driver_decls) {
+        main_bld.CreateCall(cur_driver_f, {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(0)});
+    }
+
+    // Next, we compute all derivatives up to order 'order - 1'.
+    llvm_loop_u32(main_state, main_bld.getInt32(1), main_bld.getInt32(order), [&](llvm::Value *cur_order) {
+        // State variables first.
+        taylor_c_compute_sv_diffs(main_state, main_fp_t, svd_gl, main_tape_ptr, main_par_ptr, n_uvars, cur_order,
+                                  batch_size);
+
+        // The other u variables.
+        for (auto *cur_driver_f : main_driver_decls) {
+            main_bld.CreateCall(cur_driver_f, {main_tape_ptr, main_par_ptr, main_time_ptr, cur_order});
+        }
+    });
+
+    // Next, we compute the last-order derivatives for the state variables.
+    taylor_c_compute_sv_diffs(main_state, main_fp_t, svd_gl, main_tape_ptr, main_par_ptr, n_uvars,
+                              main_bld.getInt32(order), batch_size);
+
+    // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs
+    // correspond to u variables somewhere in the decomposition, we will have to compute the
+    // last-order derivatives of the u variables until we are sure all sv_funcs derivatives
+    // have been properly computed.
+    if (max_svf_idx >= n_eq) {
+        // Monitor the starting index of the current
+        // segment while iterating on the segments.
+        auto cur_start_u_idx = n_eq;
+
+        for (decltype(s_dc.size()) seg_idx = 0; seg_idx < s_dc.size(); ++seg_idx) {
+            if (cur_start_u_idx > max_svf_idx) {
+                // We computed all the necessary derivatives, break out.
+                break;
+            }
+
+            // Invoke the driver for the current segment.
+            main_bld.CreateCall(main_driver_decls[seg_idx],
+                                {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(order)});
+
+            // Update cur_start_u_idx.
+            cur_start_u_idx += static_cast<std::uint32_t>(s_dc[seg_idx].size());
+        }
+    }
 }
 
 // Helper for the computation of a jet of derivatives in compact mode,
-// used in taylor_compute_jet().
-std::pair<llvm::Value *, llvm::Type *> taylor_compute_jet_compact_mode(
+// used in taylor_compute_jet(). The return value holds the size/alignment
+// requirements for the tape of derivatives. All LLVM values and types
+// passed to this function are defined in the main state.
+std::array<std::size_t, 2> taylor_compute_jet_compact_mode(
     // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-    llvm_state &s, llvm::Type *fp_type, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
-    const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq, std::uint32_t n_uvars,
-    std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode)
+    llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
+    llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
+    std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode)
 {
-    auto &builder = s.builder();
-    auto &context = s.context();
-    auto &md = s.module();
+    auto &main_bld = main_state.builder();
+    auto &main_md = main_state.module();
+
+    // Determine the vector type corresponding to main_fp_t.
+    auto *main_fp_vec_t = make_vector_type(main_fp_t, batch_size);
 
     // Fetch the external type corresponding to fp_type.
-    auto *ext_fp_t = make_external_llvm_type(fp_type);
+    auto *main_ext_fp_t = make_external_llvm_type(main_fp_t);
 
     // Split dc into segments.
     const auto s_dc = taylor_segment_dc(dc, n_eq);
 
-    // Generate the function maps.
-    const auto f_maps = taylor_build_function_maps(s, fp_type, s_dc, n_eq, n_uvars, batch_size, high_accuracy);
-
-    // Log the runtime of IR construction in trace mode.
-    spdlog::stopwatch sw;
-
-    // Generate the global arrays for the computation of the derivatives
-    // of the state variables.
-    const auto svd_gl = taylor_c_make_sv_diff_globals(s, fp_type, dc, n_uvars);
-
     // Determine the maximum u variable index appearing in sv_funcs_dc, or zero
     // if sv_funcs_dc is empty.
-    const auto max_svf_idx = sv_funcs_dc.empty() ? static_cast<std::uint32_t>(0)
-                                                 : *std::max_element(sv_funcs_dc.begin(), sv_funcs_dc.end());
+    const auto max_svf_idx
+        = sv_funcs_dc.empty() ? static_cast<std::uint32_t>(0) : *std::ranges::max_element(sv_funcs_dc);
 
-    // Prepare the array that will contain the jet of derivatives.
+    // Determine the total number of elements to be stored in the tape of derivatives.
     // We will be storing all the derivatives of the u variables
     // up to order 'order - 1', the derivatives of order
     // 'order' of the state variables and the derivatives
     // of order 'order' of the sv_funcs.
-    // NOTE: the array size is specified as a 64-bit integer in the
-    // LLVM API.
-    // NOTE: fp_type is the original, scalar floating-point type.
-    // It will be turned into a vector type (if necessary) by
-    // make_vector_type() below.
     // NOTE: if sv_funcs_dc is empty, or if all its indices are not greater
     // than the indices of the state variables, then we don't need additional
     // slots after the sv derivatives. If we need additional slots, allocate
     // another full column of derivatives, as it is complicated at this stage
     // to know exactly how many slots we will need.
-    auto *fp_vec_type = make_vector_type(fp_type, batch_size);
-    auto *diff_array_type
-        = llvm::ArrayType::get(fp_vec_type, (max_svf_idx < n_eq) ? (n_uvars * order + n_eq) : (n_uvars * (order + 1u)));
-
-    // Make the global array and fetch a pointer to its first element.
-    // NOTE: we use a global array rather than a local one here because
-    // its size can grow quite large, which can lead to stack overflow issues.
-    // This has of course consequences in terms of thread safety, which
-    // we will have to document.
-    auto *diff_arr_gvar = make_global_zero_array(md, diff_array_type);
-    auto *diff_arr
-        = builder.CreateInBoundsGEP(diff_array_type, diff_arr_gvar, {builder.getInt32(0), builder.getInt32(0)});
-
-    // NOTE: diff_arr is used as temporary storage for the current function,
-    // but it is declared as a global variable in order to avoid stack overflow.
-    // This creates a situation in which LLVM cannot elide stores into diff_arr
+    // NOTE: overflow checking for this computation has been performed externally.
+    const auto tot_tape_N = (max_svf_idx < n_eq) ? (n_uvars * order + n_eq) : (n_uvars * (order + 1u));
+
+    // Total required size in bytes for the tape.
+    const auto tape_sz = boost::safe_numerics::safe<std::size_t>(get_size(main_md, main_fp_vec_t)) * tot_tape_N;
+
+    // Tape alignment.
+    const auto tape_al = boost::numeric_cast<std::size_t>(get_alignment(main_md, main_fp_vec_t));
+
+    // Log the runtime of IR construction in trace mode.
+    spdlog::stopwatch sw;
+
+    // NOTE: tape_ptr is used as temporary storage for the current function,
+    // but it is provided externally from dynamically-allocated memory in order to avoid stack overflow.
+    // This creates a situation in which LLVM cannot elide stores into tape_ptr
     // (even if it figures out a way to avoid storing intermediate results into
-    // diff_arr) because LLVM must assume that some other function may
+    // it) because LLVM must assume that some other function may
     // use these stored values later. Thus, we declare via an intrinsic that the
-    // lifetime of diff_arr begins here and ends at the end of the function,
+    // lifetime of tape_ptr begins here and ends at the end of the function,
     // so that LLVM can assume that any value stored in it cannot be possibly
     // used outside this function.
-    builder.CreateLifetimeStart(diff_arr, builder.getInt64(get_size(md, diff_array_type)));
+    main_bld.CreateLifetimeStart(tape_ptr, main_bld.getInt64(tape_sz));
 
     // Copy over the order-0 derivatives of the state variables.
     // NOTE: overflow checking is already done in the parent function.
-    llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
+    llvm_loop_u32(main_state, main_bld.getInt32(0), main_bld.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
         // Fetch the pointer from order0.
-        auto *ptr
-            = builder.CreateInBoundsGEP(ext_fp_t, order0, builder.CreateMul(cur_var_idx, builder.getInt32(batch_size)));
+        auto *ptr = main_bld.CreateInBoundsGEP(main_ext_fp_t, order0,
+                                               main_bld.CreateMul(cur_var_idx, main_bld.getInt32(batch_size)));
 
         // Load as a vector.
-        auto *vec = ext_load_vector_from_memory(s, fp_type, ptr, batch_size);
-
-        // Store into diff_arr.
-        taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, builder.getInt32(0), cur_var_idx, vec);
-    });
-
-    // NOTE: these are used only in parallel mode.
-    std::vector<std::vector<llvm::AllocaInst *>> par_funcs_ptrs;
-    llvm::Value *gl_par_data = nullptr;
-    llvm::Type *par_data_t = nullptr;
-
-    if (parallel_mode) {
-        auto *ext_fp_ptr_t = llvm::PointerType::getUnqual(ext_fp_t);
-
-        // NOTE: we will use a global variable with these fields:
-        //
-        // - int32 (current Taylor order),
-        // - T * (pointer to the runtime parameters),
-        // - T * (pointer to the time coordinate(s)),
-        //
-        // to pass the data necessary to the parallel workers.
-        par_data_t = llvm::StructType::get(context, {builder.getInt32Ty(), ext_fp_ptr_t, ext_fp_ptr_t});
-        // NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
-        gl_par_data = new llvm::GlobalVariable(md, par_data_t, false, llvm::GlobalVariable::InternalLinkage,
-                                               llvm::ConstantAggregateZero::get(par_data_t));
-
-        // Write the par/time pointers into the global struct (unlike the current order, this needs
-        // to be done only once).
-        builder.CreateStore(
-            par_ptr, builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(1)}));
-        builder.CreateStore(
-            time_ptr, builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(2)}));
-
-        // Fetch the function types for the parallel worker and the wrapper.
-        auto *worker_t
-            = llvm::FunctionType::get(builder.getVoidTy(), {builder.getInt32Ty(), builder.getInt32Ty()}, false);
-        assert(worker_t != nullptr); // LCOV_EXCL_LINE
-
-        auto *wrapper_t = llvm::FunctionType::get(builder.getVoidTy(), {}, false);
-        assert(wrapper_t != nullptr); // LCOV_EXCL_LINE
-
-        for (const auto &map : f_maps) {
-            par_funcs_ptrs.emplace_back();
-
-            for (const auto &p : map) {
-                // The LLVM function for the computation of the
-                // derivative in compact mode.
-                const auto &func = p.first;
-
-                // The number of func calls.
-                const auto ncalls = p.second.first;
-
-                // The generators for the arguments of func.
-                const auto &gens = p.second.second;
-
-                // Fetch the current insertion block.
-                auto *orig_bb = builder.GetInsertBlock();
-
-                // Create the worker function.
-                auto *worker = llvm::Function::Create(worker_t, llvm::Function::InternalLinkage, "", &md);
-                assert(worker != nullptr); // LCOV_EXCL_LINE
-
-                // Fetch the function arguments.
-                auto *b_idx = worker->args().begin();
-                auto *e_idx = worker->args().begin() + 1;
-
-                // Create a new basic block to start insertion into.
-                builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", worker));
-
-                // Load the order and par/time pointers from the global variable.
-                auto *cur_order = builder.CreateLoad(
-                    builder.getInt32Ty(),
-                    builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(0)}));
-                auto *par_arg = builder.CreateLoad(
-                    ext_fp_ptr_t,
-                    builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(1)}));
-                auto *time_arg = builder.CreateLoad(
-                    ext_fp_ptr_t,
-                    builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(2)}));
-
-                // Iterate over the range.
-                llvm_loop_u32(s, b_idx, e_idx, [&](llvm::Value *cur_call_idx) {
-                    // Create the u variable index from the first generator.
-                    auto *u_idx = gens[0](cur_call_idx);
-
-                    // Initialise the vector of arguments with which func must be called. The following
-                    // initial arguments are always present:
-                    // - current Taylor order,
-                    // - u index of the variable,
-                    // - array of derivatives,
-                    // - pointer to the param values,
-                    // - pointer to the time value(s).
-                    std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_arg, time_arg};
-
-                    // Create the other arguments via the generators.
-                    for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
-                        args.push_back(gens[i](cur_call_idx));
-                    }
-
-                    // Calculate the derivative and store the result.
-                    taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, cur_order, u_idx,
-                                        builder.CreateCall(func, args));
-                });
-
-                // Return.
-                builder.CreateRetVoid();
-
-                // Create the wrapper function. This will execute multiple calls
-                // to the worker in parallel, until the entire range [0, ncalls) has
-                // been consumed.
-                auto *wrapper = llvm::Function::Create(wrapper_t, llvm::Function::InternalLinkage, "", &md);
-                assert(wrapper != nullptr); // LCOV_EXCL_LINE
-
-                // Create a new basic block to start insertion into.
-                builder.SetInsertPoint(llvm::BasicBlock::Create(context, "entry", wrapper));
-
-                // Invoke the parallel looper.
-                llvm_invoke_external(
-                    s, "heyoka_cm_par_looper", builder.getVoidTy(), {builder.getInt32(ncalls), worker},
-                    llvm::AttributeList::get(context, llvm::AttributeList::FunctionIndex,
-                                             {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn}));
-
-                // Return.
-                builder.CreateRetVoid();
-
-                // Restore the original insertion block.
-                builder.SetInsertPoint(orig_bb);
-
-                // Add a pointer to the wrapper to par_funcs_ptrs.
-                auto *f_ptr = builder.CreateAlloca(wrapper->getType());
-                builder.CreateStore(wrapper, f_ptr);
-                par_funcs_ptrs.back().push_back(f_ptr);
-            }
-        }
-    }
-
-    // Helper to compute the Taylor derivatives for a block.
-    // func is the LLVM function for the computation of the Taylor derivative in the block,
-    // ncalls the number of times it must be called, gens the generators for the
-    // function arguments and cur_order the order of the derivative.
-    auto block_diff = [&](llvm::Function *func, std::uint32_t ncalls, const auto &gens, llvm::Value *cur_order) {
-        // LCOV_EXCL_START
-        assert(ncalls > 0u);
-        assert(!gens.empty());
-        assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast<bool>(f); }));
-        // LCOV_EXCL_STOP
-
-        // We will be manually unrolling loops if ncalls is small enough.
-        // This seems to help with compilation times.
-        constexpr auto max_unroll_n = 5u;
-
-        if (ncalls > max_unroll_n) {
-            // Loop over the number of calls.
-            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
-                // Create the u variable index from the first generator.
-                auto u_idx = gens[0](cur_call_idx);
-
-                // Initialise the vector of arguments with which func must be called. The following
-                // initial arguments are always present:
-                // - current Taylor order,
-                // - u index of the variable,
-                // - array of derivatives,
-                // - pointer to the param values,
-                // - pointer to the time value(s).
-                std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_ptr, time_ptr};
-
-                // Create the other arguments via the generators.
-                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
-                    args.push_back(gens[i](cur_call_idx));
-                }
-
-                // Calculate the derivative and store the result.
-                taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, cur_order, u_idx,
-                                    builder.CreateCall(func, args));
-            });
-        } else {
-            // The manually-unrolled version of the above.
-            for (std::uint32_t idx = 0; idx < ncalls; ++idx) {
-                auto *cur_call_idx = builder.getInt32(idx);
-                auto u_idx = gens[0](cur_call_idx);
-                std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_ptr, time_ptr};
-
-                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
-                    args.push_back(gens[i](cur_call_idx));
-                }
-
-                taylor_c_store_diff(s, fp_vec_type, diff_arr, n_uvars, cur_order, u_idx,
-                                    builder.CreateCall(func, args));
-            }
-        }
-    };
-
-    // Helper to compute concurrently all the derivatives
-    // in a segment using the parallel wrappers.
-    auto parallel_segment_diff = [&](const auto &pfptrs) {
-        assert(!pfptrs.empty()); // LCOV_EXCL_LINE
-
-        // NOTE: we can invoke in parallel only up to a fixed number
-        // of wrappers. Thus, we process them in chunks.
-
-        // The remaining number of wrappers to invoke.
-        auto rem = pfptrs.size();
-
-        // Starting index in pfptrs.
-        decltype(rem) start_idx = 0;
-
-        while (rem != 0u) {
-            // Current chunk size.
-            const auto cur_size = std::min(static_cast<decltype(rem)>(HEYOKA_CM_PAR_MAX_INVOKE_N), rem);
-
-            // Setup the function name.
-            const auto fname = fmt::format("heyoka_cm_par_invoke_{}", cur_size);
-
-            // Setup the function arguments.
-            std::vector<llvm::Value *> args;
-            for (auto i = start_idx; i < start_idx + cur_size; ++i) {
-                assert(i < pfptrs.size()); // LCOV_EXCL_LINE
-                auto *ptr = pfptrs[i];
-                args.push_back(builder.CreateLoad(ptr->getAllocatedType(), ptr));
-            }
-
-            // Invoke.
-            llvm_invoke_external(s, fname, builder.getVoidTy(), args,
-                                 llvm::AttributeList::get(context, llvm::AttributeList::FunctionIndex,
-                                                          {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn}));
-
-            // Update rem and start_idx.
-            rem -= cur_size;
-            start_idx += cur_size;
-        }
-    };
-
-    // Helper to compute and store the derivatives of order cur_order
-    // of the u variables which are not state variables.
-    auto compute_u_diffs = [&](llvm::Value *cur_order) {
-        if (parallel_mode) {
-            // Store the current order in the global struct.
-            builder.CreateStore(cur_order, builder.CreateInBoundsGEP(par_data_t, gl_par_data,
-                                                                     {builder.getInt32(0), builder.getInt32(0)}));
-
-            // For each segment, invoke the wrapper functions concurrently.
-            for (const auto &pfptrs : par_funcs_ptrs) {
-                parallel_segment_diff(pfptrs);
-            }
-        } else {
-            // For each block in each segment, compute the derivatives
-            // of order cur_order serially.
-            for (const auto &map : f_maps) {
-                for (const auto &p : map) {
-                    block_diff(p.first, p.second.first, p.second.second, cur_order);
-                }
-            }
-        }
-    };
-
-    // Compute the order-0 derivatives (i.e., the initial values)
-    // for all u variables which are not state variables.
-    compute_u_diffs(builder.getInt32(0));
-
-    // Compute all derivatives up to order 'order - 1'.
-    llvm_loop_u32(s, builder.getInt32(1), builder.getInt32(order), [&](llvm::Value *cur_order) {
-        // State variables first.
-        taylor_c_compute_sv_diffs(s, fp_type, svd_gl, diff_arr, par_ptr, n_uvars, cur_order, batch_size);
+        auto *vec = ext_load_vector_from_memory(main_state, main_fp_t, ptr, batch_size);
 
-        // The other u variables.
-        compute_u_diffs(cur_order);
+        // Store into tape_ptr.
+        taylor_c_store_diff(main_state, main_fp_vec_t, tape_ptr, n_uvars, main_bld.getInt32(0), cur_var_idx, vec);
     });
 
-    // Compute the last-order derivatives for the state variables.
-    taylor_c_compute_sv_diffs(s, fp_type, svd_gl, diff_arr, par_ptr, n_uvars, builder.getInt32(order), batch_size);
-
-    // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs
-    // correspond to u variables in the decomposition, we will have to compute the
-    // last-order derivatives of the u variables until we are sure all sv_funcs derivatives
-    // have been properly computed.
-    if (max_svf_idx >= n_eq) {
-        // Monitor the starting index of the current
-        // segment while iterating on the segments.
-        auto cur_start_u_idx = n_eq;
-
-        if (parallel_mode) {
-            // Store the derivative order in the global struct.
-            builder.CreateStore(
-                builder.getInt32(order),
-                builder.CreateInBoundsGEP(par_data_t, gl_par_data, {builder.getInt32(0), builder.getInt32(0)}));
-
-            for (decltype(f_maps.size()) i = 0; i < f_maps.size(); ++i) {
-                if (cur_start_u_idx > max_svf_idx) {
-                    // We computed all the necessary derivatives, break out.
-                    break;
-                }
-
-                // Compute the derivatives for the current segment.
-                parallel_segment_diff(par_funcs_ptrs[i]);
-
-                // Update cur_start_u_idx, taking advantage of the fact
-                // that each block in a segment processes the derivatives
-                // of exactly ncalls u variables.
-                for (const auto &p : f_maps[i]) {
-                    const auto ncalls = p.second.first;
-                    cur_start_u_idx += ncalls;
-                }
-            }
-        } else {
-            for (const auto &map : f_maps) {
-                if (cur_start_u_idx > max_svf_idx) {
-                    // We computed all the necessary derivatives, break out.
-                    break;
-                }
-
-                // Compute the derivatives of all the blocks in the segment.
-                for (const auto &p : map) {
-                    const auto ncalls = p.second.first;
-
-                    block_diff(p.first, ncalls, p.second.second, builder.getInt32(order));
-
-                    // Update cur_start_u_idx taking advantage of the fact
-                    // that each block in a segment processes the derivatives
-                    // of exactly ncalls u variables.
-                    cur_start_u_idx += ncalls;
-                }
-            }
-        }
-    }
-
     get_logger()->trace("Taylor IR creation compact mode runtime: {}", sw);
 
     // Return the array of derivatives of the u variables and its type.
@@ -1114,26 +1046,28 @@ auto taylor_load_values(llvm_state &s, llvm::Type *fp_t, llvm::Value *in, std::u
 // order is the max derivative order desired, batch_size the batch size.
 // order0 is a pointer to an array of (at least) n_eq * batch_size scalar elements
 // containing the derivatives of order 0. par_ptr is a pointer to an array containing
-// the numerical values of the parameters, time_ptr a pointer to the time value(s).
-// sv_funcs are the indices, in the decomposition, of the functions of state
+// the numerical values of the parameters, time_ptr a pointer to the time value(s),
+// tape_ptr a pointer to the tape of derivatives (only in compact mode, otherwise
+// a null value). sv_funcs are the indices, in the decomposition, of the functions of state
 // variables.
 //
 // order0, par_ptr and time_ptr are all external pointers.
 //
 // The return value is a variant containing either:
-// - in compact mode, the array containing the derivatives of all u variables,
+// - in compact mode, the size/alignment requirements for the tape of derivatives,
 // - otherwise, the jet of derivatives of the state variables and sv_funcs
 //   up to order 'order'.
-std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>>
+std::variant<std::array<std::size_t, 2>, std::vector<llvm::Value *>>
 taylor_compute_jet(llvm_state &s, llvm::Type *fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
-                   const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
-                   std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool compact_mode,
-                   bool high_accuracy, bool parallel_mode)
+                   llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc,
+                   std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
+                   bool compact_mode, bool high_accuracy, bool parallel_mode)
 {
     // LCOV_EXCL_START
     assert(batch_size > 0u);
     assert(n_eq > 0u);
     assert(order > 0u);
+    assert((tape_ptr != nullptr) == compact_mode);
     // LCOV_EXCL_STOP
 
     // Make sure we can represent n_uvars * (order + 1) as a 32-bit
@@ -1155,8 +1089,8 @@ taylor_compute_jet(llvm_state &s, llvm::Type *fp_t, llvm::Value *order0, llvm::V
     // LCOV_EXCL_STOP
 
     if (compact_mode) {
-        return taylor_compute_jet_compact_mode(s, fp_t, order0, par_ptr, time_ptr, dc, sv_funcs_dc, n_eq, n_uvars,
-                                               order, batch_size, high_accuracy, parallel_mode);
+        return taylor_compute_jet_compact_mode(s, fp_t, order0, par_ptr, time_ptr, tape_ptr, dc, sv_funcs_dc, n_eq,
+                                               n_uvars, order, batch_size, high_accuracy, parallel_mode);
     } else {
         // Log the runtime of IR construction in trace mode.
         spdlog::stopwatch sw;
diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp
index 3a8bb1bab..ffaa6b5f2 100644
--- a/src/taylor_adaptive.cpp
+++ b/src/taylor_adaptive.cpp
@@ -180,7 +180,7 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_time);
-    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_last_h);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tol);
@@ -386,12 +386,15 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
     m_order = detail::taylor_order_from_tol(m_tol);
 
     // Determine the external fp type.
-    auto *ext_fp_t = detail::to_external_llvm_type<T>(m_llvm.context());
+    auto *ext_fp_t = detail::to_external_llvm_type<T>(std::get<0>(m_llvm_state).context());
 
     // Determine the internal fp type.
     // NOTE: in case of mppp::real, we ensured earlier that the tolerance value
     // has the correct precision, so that internal_llvm_type_like() will yield the correct internal type.
-    auto *fp_t = detail::internal_llvm_type_like(m_llvm, m_tol);
+    auto *fp_t = detail::internal_llvm_type_like(std::get<0>(m_llvm_state), m_tol);
+
+    // The state(s) which will be returned by the construction of the stepper function.
+    std::variant<llvm_state, std::vector<llvm_state>> states;
 
     // Add the stepper function.
     if (with_events) {
@@ -404,11 +407,13 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
             ee.push_back(ev.get_expression());
         }
 
-        m_dc = detail::taylor_add_adaptive_step_with_events(m_llvm, ext_fp_t, fp_t, "step_e", sys, 1, compact_mode, ee,
-                                                            high_accuracy, parallel_mode, m_order);
+        std::tie(m_dc, states)
+            = detail::taylor_add_adaptive_step_with_events(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step_e", sys, 1,
+                                                           compact_mode, ee, high_accuracy, parallel_mode, m_order);
     } else {
-        m_dc = detail::taylor_add_adaptive_step(m_llvm, ext_fp_t, fp_t, "step", sys, 1, high_accuracy, compact_mode,
-                                                parallel_mode, m_order);
+        std::tie(m_dc, states)
+            = detail::taylor_add_adaptive_step(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step", sys, 1, high_accuracy,
+                                               compact_mode, parallel_mode, m_order);
     }
 
     // Fix m_pars' size, if necessary.

From 06c47b4ecb2f33567d8f0f3a3a0254f49be99af1 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 26 Aug 2024 15:12:29 +0200
Subject: [PATCH 03/30] More non-functional WIP. [skip ci]

---
 include/heyoka/detail/i_data.hpp |   4 +
 include/heyoka/taylor.hpp        |  20 +-
 src/detail/i_data.cpp            |  14 +-
 src/taylor_00.cpp                | 178 ++++----
 src/taylor_02.cpp                | 753 +++++++++++++++++--------------
 src/taylor_adaptive.cpp          | 130 ++++--
 6 files changed, 633 insertions(+), 466 deletions(-)

diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp
index 205e0a943..be49f7f49 100644
--- a/include/heyoka/detail/i_data.hpp
+++ b/include/heyoka/detail/i_data.hpp
@@ -69,6 +69,10 @@ struct taylor_adaptive<T>::i_data {
     detail::dfloat<T> m_time;
     // The LLVM (multi)state.
     std::variant<llvm_state, llvm_multi_state> m_llvm_state;
+    // A template LLVM state we keep around to create states
+    // similar to m_llvm_state as needed. This is created with the
+    // same settings as m_llvm_state.
+    llvm_state m_tplt_state;
     // Dimension of the system.
     std::uint32_t m_dim{};
     // Taylor decomposition.
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index 46bcc7258..39498c1e1 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -12,6 +12,7 @@
 #include <heyoka/config.hpp>
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <concepts>
 #include <cstddef>
@@ -98,17 +99,19 @@ HEYOKA_DLL_PUBLIC llvm::Value *taylor_c_load_diff(llvm_state &, llvm::Type *, ll
 HEYOKA_DLL_PUBLIC void taylor_c_store_diff(llvm_state &, llvm::Type *, llvm::Value *, std::uint32_t, llvm::Value *,
                                            llvm::Value *, llvm::Value *);
 
-taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &, llvm::Type *, llvm::Type *, const std::string &,
-                                                 const std::vector<std::pair<expression, expression>> &, std::uint32_t,
-                                                 bool, const std::vector<expression> &, bool, bool, std::uint32_t);
+std::tuple<taylor_dc_t, std::array<std::size_t, 2>, std::vector<llvm_state>>
+taylor_add_adaptive_step_with_events(llvm_state &, llvm::Type *, const std::string &,
+                                     const std::vector<std::pair<expression, expression>> &, std::uint32_t, bool,
+                                     const std::vector<expression> &, bool, bool, std::uint32_t);
 
-taylor_dc_t taylor_add_adaptive_step(llvm_state &, llvm::Type *, llvm::Type *, const std::string &,
-                                     const std::vector<std::pair<expression, expression>> &, std::uint32_t, bool, bool,
-                                     bool, std::uint32_t);
+std::tuple<taylor_dc_t, std::array<std::size_t, 2>, std::vector<llvm_state>>
+taylor_add_adaptive_step(llvm_state &, llvm::Type *, llvm::Type *, const std::string &,
+                         const std::vector<std::pair<expression, expression>> &, std::uint32_t, bool, bool, bool,
+                         std::uint32_t);
 
 llvm::Value *taylor_c_make_sv_funcs_arr(llvm_state &, const std::vector<std::uint32_t> &);
 
-std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>>
+std::variant<std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>>, std::vector<llvm::Value *>>
 taylor_compute_jet(llvm_state &, llvm::Type *, llvm::Value *, llvm::Value *, llvm::Value *, llvm::Value *,
                    const taylor_dc_t &, const std::vector<std::uint32_t> &, std::uint32_t, std::uint32_t, std::uint32_t,
                    std::uint32_t, bool, bool, bool);
@@ -507,6 +510,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada
     explicit taylor_adaptive(private_ctor_t, llvm_state);
 
     HEYOKA_DLL_LOCAL void check_variational(const char *) const;
+    HEYOKA_DLL_LOCAL void assign_stepper(bool);
 
     // Input type for Taylor map computation.
     using tm_input_t = mdspan<const T, dextents<std::uint32_t, 1>>;
@@ -548,7 +552,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada
 
     ~taylor_adaptive();
 
-    [[nodiscard]] const llvm_state &get_llvm_state() const;
+    [[nodiscard]] const std::variant<llvm_state, llvm_multi_state> &get_llvm_state() const;
 
     [[nodiscard]] const taylor_dc_t &get_decomposition() const;
 
diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp
index 572ca7655..cbb27b9ac 100644
--- a/src/detail/i_data.cpp
+++ b/src/detail/i_data.cpp
@@ -119,6 +119,7 @@ void taylor_adaptive<T>::i_data::save(boost::archive::binary_oarchive &ar, unsig
     ar << m_state;
     ar << m_time;
     ar << m_llvm_state;
+    ar << m_tplt_state;
     ar << m_dim;
     ar << m_dc;
     ar << m_order;
@@ -140,6 +141,7 @@ void taylor_adaptive<T>::i_data::load(boost::archive::binary_iarchive &ar, unsig
     ar >> m_state;
     ar >> m_time;
     ar >> m_llvm_state;
+    ar >> m_tplt_state;
     ar >> m_dim;
     ar >> m_dc;
     ar >> m_order;
@@ -168,16 +170,18 @@ void taylor_adaptive<T>::i_data::load(boost::archive::binary_iarchive &ar, unsig
 // of compact mode. It will be converted into a multi state if needed at a
 // later stage.
 template <typename T>
-taylor_adaptive<T>::i_data::i_data(llvm_state s) : m_llvm_state(std::move(s))
+taylor_adaptive<T>::i_data::i_data(llvm_state s)
+    : m_llvm_state(std::move(s)), m_tplt_state(std::get<0>(m_llvm_state).make_similar())
 {
 }
 
 template <typename T>
 taylor_adaptive<T>::i_data::i_data(const i_data &other)
-    : m_state(other.m_state), m_time(other.m_time), m_llvm_state(other.m_llvm_state), m_dim(other.m_dim),
-      m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy),
-      m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), m_pars(other.m_pars), m_tc(other.m_tc),
-      m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_vsys(other.m_vsys), m_tm_data(other.m_tm_data)
+    : m_state(other.m_state), m_time(other.m_time), m_llvm_state(other.m_llvm_state), m_tplt_state(other.m_tplt_state),
+      m_dim(other.m_dim), m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol),
+      m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa),
+      m_pars(other.m_pars), m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_vsys(other.m_vsys),
+      m_tm_data(other.m_tm_data)
 {
     // Recover the function pointers.
     m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
diff --git a/src/taylor_00.cpp b/src/taylor_00.cpp
index b3e322a57..7ddc57979 100644
--- a/src/taylor_00.cpp
+++ b/src/taylor_00.cpp
@@ -8,7 +8,6 @@
 
 #include <heyoka/config.hpp>
 
-#include <algorithm>
 #include <array>
 #include <cassert>
 #include <cstddef>
@@ -95,20 +94,20 @@ number taylor_determine_h_rhofac(llvm_state &s, llvm::Type *fp_t, std::uint32_t
 }
 
 // Helper to generate the LLVM code to determine the timestep in an adaptive Taylor integrator,
-// following Jorba's prescription. diff_variant is the output of taylor_compute_jet(), and it contains
-// the jet of derivatives for the state variables and the sv_funcs. h_ptr is an external pointer containing
-// the clamping values for the timesteps. svf_ptr is a pointer to the first element of an LLVM array containing the
-// values in sv_funcs_dc. If max_abs_state_ptr is not nullptr, the computed norm infinity of the
-// state vector (including sv_funcs, if any) will be written into it (max_abs_state_ptr is an external pointer).
-llvm::Value *
-taylor_determine_h(llvm_state &s, llvm::Type *fp_t,
-                   const std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>> &diff_variant,
-                   const std::vector<std::uint32_t> &sv_funcs_dc,
-                   // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                   llvm::Value *svf_ptr, llvm::Value *h_ptr,
-                   // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                   std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
-                   llvm::Value *max_abs_state_ptr)
+// following Jorba's prescription. diff_variant is the output of taylor_compute_jet(). h_ptr is an external pointer
+// containing the clamping values for the timesteps. svf_ptr is a pointer to the first element of an LLVM array
+// containing the values in sv_funcs_dc. If max_abs_state_ptr is not nullptr, the computed norm infinity of the state
+// vector (including sv_funcs, if any) will be written into it (max_abs_state_ptr is an external pointer).
+// tape_ptr is the pointer to the tape of derivatives in compact mode, or a null pointer otherwise.
+llvm::Value *taylor_determine_h(llvm_state &s, llvm::Type *fp_t,
+                                const std::variant<std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>>,
+                                                   std::vector<llvm::Value *>> &diff_variant,
+                                const std::vector<std::uint32_t> &sv_funcs_dc,
+                                // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+                                llvm::Value *svf_ptr, llvm::Value *h_ptr,
+                                // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+                                std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order,
+                                std::uint32_t batch_size, llvm::Value *max_abs_state_ptr, llvm::Value *tape_ptr)
 {
     assert(batch_size != 0u);
 #if !defined(NDEBUG)
@@ -130,7 +129,8 @@ taylor_determine_h(llvm_state &s, llvm::Type *fp_t,
 
     if (diff_variant.index() == 0u) {
         // Compact mode.
-        auto *diff_arr = std::get<0>(diff_variant).first;
+        assert(tape_ptr != nullptr);
+        auto *diff_arr = tape_ptr;
 
         // These will end up containing the norm infinity of the state vector + sv_funcs and the
         // norm infinity of the derivatives at orders order and order - 1.
@@ -196,6 +196,7 @@ taylor_determine_h(llvm_state &s, llvm::Type *fp_t,
         max_abs_diff_om1 = builder.CreateLoad(vec_t, max_abs_diff_om1);
     } else {
         // Non-compact mode.
+        assert(tape_ptr == nullptr);
         const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);
 
         const auto n_sv_funcs = static_cast<std::uint32_t>(sv_funcs_dc.size());
@@ -271,23 +272,24 @@ taylor_determine_h(llvm_state &s, llvm::Type *fp_t,
 }
 
 // Run the Horner scheme to propagate an ODE state via the evaluation of the Taylor polynomials.
-// diff_var contains either the derivatives for all u variables (in compact mode) or only
-// for the state variables (non-compact mode). The evaluation point (i.e., the timestep)
+// diff_var is the output of taylor_compute_jet(). The evaluation point (i.e., the timestep)
 // is h. The evaluation is run in parallel over the polynomials of all the state
-// variables.
-std::variant<llvm::Value *, std::vector<llvm::Value *>>
-taylor_run_multihorner(llvm_state &s, llvm::Type *fp_t,
-                       const std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>> &diff_var,
-                       // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                       llvm::Value *h, std::uint32_t n_eq, std::uint32_t n_uvars,
-                       // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                       std::uint32_t order, std::uint32_t batch_size)
+// variables. tape_ptr is the pointer to the tape of derivatives in compact mode, or a null pointer otherwise.
+std::variant<llvm::Value *, std::vector<llvm::Value *>> taylor_run_multihorner(
+    llvm_state &s, llvm::Type *fp_t,
+    const std::variant<std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>>, std::vector<llvm::Value *>>
+        &diff_var,
+    // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+    llvm::Value *h, std::uint32_t n_eq, std::uint32_t n_uvars,
+    // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+    std::uint32_t order, std::uint32_t batch_size, llvm::Value *tape_ptr)
 {
     auto &builder = s.builder();
 
     if (diff_var.index() == 0u) {
         // Compact mode.
-        auto *diff_arr = std::get<0>(diff_var).first;
+        assert(tape_ptr != nullptr);
+        auto *diff_arr = tape_ptr;
 
         // Create the array storing the results of the evaluation.
         auto *fp_vec_t = make_vector_type(fp_t, batch_size);
@@ -325,6 +327,7 @@ taylor_run_multihorner(llvm_state &s, llvm::Type *fp_t,
         return res_arr;
     } else {
         // Non-compact mode.
+        assert(tape_ptr == nullptr);
         const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_var);
 
         // Init the return value, filling it with the values of the
@@ -347,18 +350,21 @@ taylor_run_multihorner(llvm_state &s, llvm::Type *fp_t,
 
 // Same as taylor_run_multihorner(), but instead of the Horner scheme this implementation uses
 // a compensated summation over the naive evaluation of monomials.
-std::variant<llvm::Value *, std::vector<llvm::Value *>>
-taylor_run_ceval(llvm_state &s, llvm::Type *fp_t,
-                 const std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>> &diff_var,
-                 llvm::Value *h,
-                 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                 std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, bool, std::uint32_t batch_size)
+std::variant<llvm::Value *, std::vector<llvm::Value *>> taylor_run_ceval(
+    llvm_state &s, llvm::Type *fp_t,
+    const std::variant<std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>>, std::vector<llvm::Value *>>
+        &diff_var,
+    llvm::Value *h,
+    // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+    std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, bool, std::uint32_t batch_size,
+    llvm::Value *tape_ptr)
 {
     auto &builder = s.builder();
 
     if (diff_var.index() == 0u) {
         // Compact mode.
-        auto *diff_arr = std::get<0>(diff_var).first;
+        assert(tape_ptr != nullptr);
+        auto *diff_arr = tape_ptr;
 
         // Create the arrays storing the results of the evaluation and the running compensations.
         auto *fp_vec_t = make_vector_type(fp_t, batch_size);
@@ -416,6 +422,7 @@ taylor_run_ceval(llvm_state &s, llvm::Type *fp_t,
         return res_arr;
     } else {
         // Non-compact mode.
+        assert(tape_ptr == nullptr);
         const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_var);
 
         // Init the return values with the order-0 monomials, and the running
@@ -453,13 +460,15 @@ taylor_run_ceval(llvm_state &s, llvm::Type *fp_t,
 // Helper to generate the LLVM code to store the Taylor coefficients of the state variables and
 // the sv funcs into an external array. The Taylor polynomials are stored in row-major order,
 // first the state variables and after the sv funcs. For use in the adaptive timestepper implementations.
-// tc_ptr is an external pointer.
-void taylor_write_tc(
-    llvm_state &s, llvm::Type *fp_t,
-    const std::variant<std::pair<llvm::Value *, llvm::Type *>, std::vector<llvm::Value *>> &diff_variant,
-    const std::vector<std::uint32_t> &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *tc_ptr,
-    // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-    std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size)
+// tc_ptr is an external pointer. tape_ptr is the pointer to the tape of derivatives in compact mode, or a null pointer
+// otherwise.
+void taylor_write_tc(llvm_state &s, llvm::Type *fp_t,
+                     const std::variant<std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>>,
+                                        std::vector<llvm::Value *>> &diff_variant,
+                     const std::vector<std::uint32_t> &sv_funcs_dc, llvm::Value *svf_ptr, llvm::Value *tc_ptr,
+                     // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+                     std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
+                     llvm::Value *tape_ptr)
 {
     // LCOV_EXCL_START
     assert(batch_size != 0u);
@@ -499,8 +508,8 @@ void taylor_write_tc(
 
     if (diff_variant.index() == 0u) {
         // Compact mode.
-
-        auto *diff_arr = std::get<0>(diff_variant).first;
+        assert(tape_ptr != nullptr);
+        auto *diff_arr = tape_ptr;
 
         // Write out the Taylor coefficients for the state variables.
         llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(n_eq), [&](llvm::Value *cur_var) {
@@ -546,7 +555,7 @@ void taylor_write_tc(
         }
     } else {
         // Non-compact mode.
-
+        assert(tape_ptr == nullptr);
         const auto &diff_arr = std::get<std::vector<llvm::Value *>>(diff_variant);
 
         for (std::uint32_t j = 0; j < n_eq + n_sv_funcs; ++j) {
@@ -578,12 +587,11 @@ void taylor_write_tc(
 // propagate the state of the system. Instead, its output will be the jet of derivatives
 // of all state variables and event equations, and the deduced timestep value(s).
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_fp_t, llvm::Type *fp_t,
-                                                 const std::string &name,
-                                                 const std::vector<std::pair<expression, expression>> &sys,
-                                                 std::uint32_t batch_size, bool compact_mode,
-                                                 const std::vector<expression> &evs, bool high_accuracy,
-                                                 bool parallel_mode, std::uint32_t order)
+std::tuple<taylor_dc_t, std::array<std::size_t, 2>, std::vector<llvm_state>>
+taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *fp_t, const std::string &name,
+                                     const std::vector<std::pair<expression, expression>> &sys,
+                                     std::uint32_t batch_size, bool compact_mode, const std::vector<expression> &evs,
+                                     bool high_accuracy, bool parallel_mode, std::uint32_t order)
 {
     assert(!s.is_compiled());
     assert(batch_size != 0u);
@@ -603,14 +611,17 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_
     auto &md = s.module();
 
     // Prepare the function prototype. The arguments are:
+    //
     // - pointer to the output jet of derivative (write only),
     // - pointer to the current state vector (read only),
     // - pointer to the parameters (read only),
     // - pointer to the time value(s) (read only),
     // - pointer to the array of max timesteps (read & write),
-    // - pointer to the max_abs_state output variable (write only).
+    // - pointer to the max_abs_state output variable (write only),
+    // - pointer to the tape (read & write, compact mode only).
+    //
     // These pointers cannot overlap.
-    const std::vector<llvm::Type *> fargs(6, llvm::PointerType::getUnqual(ext_fp_t));
+    const std::vector<llvm::Type *> fargs(compact_mode ? 7 : 6, llvm::PointerType::getUnqual(context));
     // The function does not return anything.
     auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
     assert(ft != nullptr);
@@ -655,6 +666,14 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_
     max_abs_state_ptr->addAttr(llvm::Attribute::NoAlias);
     max_abs_state_ptr->addAttr(llvm::Attribute::WriteOnly);
 
+    llvm::Argument *tape_ptr = nullptr;
+    if (compact_mode) {
+        tape_ptr = max_abs_state_ptr + 1;
+        tape_ptr->setName("tape_ptr");
+        tape_ptr->addAttr(llvm::Attribute::NoCapture);
+        tape_ptr->addAttr(llvm::Attribute::NoAlias);
+    }
+
     // Create a new basic block to start insertion into.
     auto *bb = llvm::BasicBlock::Create(context, "entry", f);
     assert(bb != nullptr); // LCOV_EXCL_LINE
@@ -665,50 +684,44 @@ taylor_dc_t taylor_add_adaptive_step_with_events(llvm_state &s, llvm::Type *ext_
     auto *svf_ptr = compact_mode ? taylor_c_make_sv_funcs_arr(s, ev_dc) : nullptr;
 
     // Compute the jet of derivatives at the given order.
-    auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, dc, ev_dc, n_eq, n_uvars, order,
-                                           batch_size, compact_mode, high_accuracy, parallel_mode);
+    auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, tape_ptr, dc, ev_dc, n_eq, n_uvars,
+                                           order, batch_size, compact_mode, high_accuracy, parallel_mode);
 
     // Determine the integration timestep.
     auto *h = taylor_determine_h(s, fp_t, diff_variant, ev_dc, svf_ptr, h_ptr, n_eq, n_uvars, order, batch_size,
-                                 max_abs_state_ptr);
+                                 max_abs_state_ptr, tape_ptr);
 
     // Store h to memory.
     ext_store_vector_to_memory(s, h_ptr, h);
 
     // Copy the jet of derivatives to jet_ptr.
-    taylor_write_tc(s, fp_t, diff_variant, ev_dc, svf_ptr, jet_ptr, n_eq, n_uvars, order, batch_size);
+    taylor_write_tc(s, fp_t, diff_variant, ev_dc, svf_ptr, jet_ptr, n_eq, n_uvars, order, batch_size, tape_ptr);
 
     // End the lifetime of the array of derivatives, if we are in compact mode.
     if (compact_mode) {
-        builder.CreateLifetimeEnd(std::get<0>(diff_variant).first,
-                                  builder.getInt64(get_size(md, std::get<0>(diff_variant).second)));
+        const auto [sz, al] = std::get<0>(diff_variant).first;
+        builder.CreateLifetimeEnd(tape_ptr, builder.getInt64(boost::numeric_cast<std::uint64_t>(sz)));
     }
 
     // Create the return value.
     builder.CreateRetVoid();
 
-    return dc;
+    if (compact_mode) {
+        return {std::move(dc), std::move(std::get<0>(diff_variant).first), std::move(std::get<0>(diff_variant).second)};
+    } else {
+        return {std::move(dc), {}, {}};
+    }
 }
 
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-std::tuple<taylor_dc_t, std::variant<llvm_state, std::vector<llvm_state>>, std::array<std::size_t, 2>>
-taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name,
+std::tuple<taylor_dc_t, std::array<std::size_t, 2>, std::vector<llvm_state>>
+taylor_add_adaptive_step(llvm_state &s, llvm::Type *ext_fp_t, llvm::Type *fp_t, const std::string &name,
                          const std::vector<std::pair<expression, expression>> &sys, std::uint32_t batch_size,
                          bool high_accuracy, bool compact_mode, bool parallel_mode, std::uint32_t order)
 {
-    assert(!tplt.is_compiled());
+    assert(!s.is_compiled());
     assert(batch_size > 0u);
 
-    // Setup the return state(s) and fetch the main state.
-    auto ret_states = [compact_mode, &tplt]() -> std::variant<llvm_state, std::vector<llvm_state>> {
-        if (compact_mode) {
-            return std::vector{tplt.make_similar()};
-        } else {
-            return tplt.make_similar();
-        }
-    }();
-    auto &s = compact_mode ? std::get<1>(ret_states)[0] : std::get<0>(ret_states);
-
     // Record the number of equations/variables.
     const auto n_eq = boost::numeric_cast<std::uint32_t>(sys.size());
 
@@ -788,18 +801,19 @@ taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Typ
     assert(bb != nullptr);
     builder.SetInsertPoint(bb);
 
-    // Compute the jet of derivatives at the given order.
+    // Generate the code for the computation of the jet of derivatives at the given order.
     auto diff_variant = taylor_compute_jet(s, fp_t, state_ptr, par_ptr, time_ptr, tape_ptr, dc, {}, n_eq, n_uvars,
                                            order, batch_size, compact_mode, high_accuracy, parallel_mode);
 
     // Determine the integration timestep.
     auto *h = taylor_determine_h(s, fp_t, diff_variant, sv_funcs_dc, nullptr, h_ptr, n_eq, n_uvars, order, batch_size,
-                                 nullptr);
+                                 nullptr, tape_ptr);
 
     // Evaluate the Taylor polynomials, producing the updated state of the system.
     auto new_state_var
-        = high_accuracy ? taylor_run_ceval(s, fp_t, diff_variant, h, n_eq, n_uvars, order, high_accuracy, batch_size)
-                        : taylor_run_multihorner(s, fp_t, diff_variant, h, n_eq, n_uvars, order, batch_size);
+        = high_accuracy
+              ? taylor_run_ceval(s, fp_t, diff_variant, h, n_eq, n_uvars, order, high_accuracy, batch_size, tape_ptr)
+              : taylor_run_multihorner(s, fp_t, diff_variant, h, n_eq, n_uvars, order, batch_size, tape_ptr);
 
     // Store the new state.
     // NOTE: no need to perform overflow check on n_eq * batch_size,
@@ -837,7 +851,7 @@ taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Typ
         [&]() {
             // tc_ptr is not null: copy the Taylor coefficients
             // for the state variables.
-            taylor_write_tc(s, fp_t, diff_variant, {}, nullptr, tc_ptr, n_eq, n_uvars, order, batch_size);
+            taylor_write_tc(s, fp_t, diff_variant, {}, nullptr, tc_ptr, n_eq, n_uvars, order, batch_size, tape_ptr);
         },
         []() {
             // Taylor coefficients were not requested,
@@ -846,14 +860,18 @@ taylor_add_adaptive_step(const llvm_state &tplt, llvm::Type *ext_fp_t, llvm::Typ
 
     // End the lifetime of the array of derivatives, if we are in compact mode.
     if (compact_mode) {
-        builder.CreateLifetimeEnd(std::get<0>(diff_variant).first,
-                                  builder.getInt64(get_size(md, std::get<0>(diff_variant).second)));
+        const auto [sz, al] = std::get<0>(diff_variant).first;
+        builder.CreateLifetimeEnd(tape_ptr, builder.getInt64(boost::numeric_cast<std::uint64_t>(sz)));
     }
 
     // Create the return value.
     builder.CreateRetVoid();
 
-    return dc;
+    if (compact_mode) {
+        return {std::move(dc), std::move(std::get<0>(diff_variant).first), std::move(std::get<0>(diff_variant).second)};
+    } else {
+        return {std::move(dc), {}, {}};
+    }
 }
 
 } // namespace detail
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 0b824b892..0a3c03a95 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -17,6 +17,7 @@
 #include <limits>
 #include <list>
 #include <map>
+#include <ranges>
 #include <stdexcept>
 #include <type_traits>
 #include <utility>
@@ -100,8 +101,6 @@ namespace
 // that do not represent state variables) into parallelisable segments. Within a segment,
 // the definition of a u variable does not depend on any u variable defined within that segment.
 // NOTE: the hidden deps are not considered as dependencies.
-// NOTE: the segments in the return value will contain shallow copies of the
-// expressions in dc.
 std::vector<taylor_dc_t> taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t n_eq)
 {
     // Log runtime in trace mode.
@@ -177,8 +176,8 @@ std::vector<taylor_dc_t> taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t
     }
 
 #if !defined(NDEBUG)
-    // Verify s_dc.
 
+    // Verify s_dc.
     decltype(dc.size()) counter = 0;
     for (const auto &s : s_dc) {
         // No segment can be empty.
@@ -199,6 +198,7 @@ std::vector<taylor_dc_t> taylor_segment_dc(const taylor_dc_t &dc, std::uint32_t
     }
 
     assert(counter == dc.size() - static_cast<decltype(dc.size())>(n_eq) * 2u);
+
 #endif
 
     get_logger()->debug("Taylor decomposition N of segments: {}", s_dc.size());
@@ -502,45 +502,309 @@ void taylor_c_compute_sv_diffs(llvm_state &s, llvm::Type *fp_t,
     });
 }
 
-// Helper to perform the computation of the Taylor derivatives in compact mode across
-// multiple LLVM states.
-auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *main_par_ptr,
-                              llvm::Value *main_time_ptr, llvm::Value *main_tape_ptr, const taylor_dc_t &dc,
-                              const std::vector<taylor_dc_t> &s_dc, const std::vector<std::uint32_t> &sv_funcs_dc,
-                              std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
-                              bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx)
+// Helper to create and return the prototype of a driver function for
+// the computation of Taylor derivatives in compact mode. s is the llvm state
+// in which we are operating, cur_idx the index of the driver.
+llvm::Function *taylor_cm_make_driver_proto(llvm_state &s, unsigned cur_idx)
 {
-    // TODO implement.
-    (void)parallel_mode;
+    auto &builder = s.builder();
+    auto &md = s.module();
+    auto &ctx = s.context();
+
+    // The arguments to the driver are:
+    // - a pointer to the tape,
+    // - pointers to par and time,
+    // - the current diff order.
+    auto *ptr_tp = llvm::PointerType::getUnqual(ctx);
+    std::vector<llvm::Type *> fargs{ptr_tp, ptr_tp, ptr_tp, builder.getInt32Ty()};
+
+    // The driver does not return anything.
+    auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
+    assert(ft != nullptr); // LCOV_EXCL_LINE
+
+    // Now create the driver.
+    const auto cur_name = fmt::format("heyoka.cm_jet.driver_{}", cur_idx);
+    auto *f = llvm_func_create(ft, llvm::Function::ExternalLinkage, cur_name, &md);
+    // NOTE: the driver cannot call itself recursively.
+    f->addFnAttr(llvm::Attribute::NoRecurse);
+
+    // Add the arguments' attributes.
+    // NOTE: no aliasing is assumed between the pointer
+    // arguments.
+    auto *tape_arg = f->args().begin();
+    tape_arg->setName("tape_ptr");
+    tape_arg->addAttr(llvm::Attribute::NoCapture);
+    tape_arg->addAttr(llvm::Attribute::NoAlias);
+
+    auto *par_ptr_arg = tape_arg + 1;
+    par_ptr_arg->setName("par_ptr");
+    par_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+    par_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+    par_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
+
+    auto *time_ptr_arg = tape_arg + 2;
+    time_ptr_arg->setName("time_ptr");
+    time_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+    time_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+    time_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
+
+    auto *order_arg = tape_arg + 3;
+    order_arg->setName("order");
+
+    return f;
+}
 
-    // Generate the global arrays for the computation of the derivatives
-    // of the state variables in the main state.
-    const auto svd_gl = taylor_c_make_sv_diff_globals(main_state, main_fp_t, dc, n_uvars);
+// Helper to codegen the computation of the Taylor derivatives for a block.
+//
+// s is the llvm state in which we are operating, func is the LLVM function for the computation of the Taylor
+// derivative in the block, ncalls the number of times it must be called, gens the generators for the
+// function arguments, tape/par/time_ptr the pointers to the tape/parameter value(s)/time value(s),
+// cur_order the order of the derivative, fp_vec_type the internal vector type used for computations,
+// n_uvars the total number of u variables.
+void taylor_cm_codegen_block_diff(llvm_state &s, llvm::Function *func, std::uint32_t ncalls, const auto &gens,
+                                  llvm::Value *tape_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr,
+                                  llvm::Value *cur_order, llvm::Type *fp_vec_type, std::uint32_t n_uvars)
+{
+    // LCOV_EXCL_START
+    assert(ncalls > 0u);
+    assert(!gens.empty());
+    assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast<bool>(f); }));
+    // LCOV_EXCL_STOP
 
-    // Structure used to log, in trace mode, the breakdown of each segment.
-    // For each segment, this structure contains the number of invocations
-    // of each function in the segment. It will be unused if we are not tracing.
-    std::vector<std::vector<std::uint32_t>> segment_bd;
+    // Fetch the builder for the current state.
+    auto &bld = s.builder();
+
+    // We will be manually unrolling loops if ncalls is small enough.
+    // This seems to help with compilation times.
+    constexpr auto max_unroll_n = 5u;
+
+    if (ncalls > max_unroll_n) {
+        // Loop over the number of calls.
+        llvm_loop_u32(s, bld.getInt32(0), bld.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
+            // Create the u variable index from the first generator.
+            auto u_idx = gens[0](cur_call_idx);
+
+            // Initialise the vector of arguments with which func must be called. The following
+            // initial arguments are always present:
+            // - current Taylor order,
+            // - u index of the variable,
+            // - tape of derivatives,
+            // - pointer to the param values,
+            // - pointer to the time value(s).
+            std::vector<llvm::Value *> args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr};
+
+            // Create the other arguments via the generators.
+            for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                args.push_back(gens[i](cur_call_idx));
+            }
 
-    // Are we tracing?
-    const auto is_tracing = get_logger()->should_log(spdlog::level::trace);
+            // Calculate the derivative and store the result.
+            taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args));
+        });
+    } else {
+        // The manually-unrolled version of the above.
+        for (std::uint32_t idx = 0; idx < ncalls; ++idx) {
+            auto *cur_call_idx = bld.getInt32(idx);
+            auto u_idx = gens[0](cur_call_idx);
+            std::vector<llvm::Value *> args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr};
+
+            for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                args.push_back(gens[i](cur_call_idx));
+            }
 
-    // List of evaluation functions in a segment.
-    //
-    // This map contains a list of functions for the compact-mode evaluation of Taylor derivatives.
-    // Each function is mapped to a pair, containing:
-    //
-    // - the number of times the function is to be invoked,
-    // - a list of functors (generators) that generate the arguments for
-    //   the invocation.
-    //
-    // NOTE: we use maps with name-based comparison for the functions. This ensures that the order in which these
-    // functions are invoked is always the same. If we used directly pointer
-    // comparisons instead, the order could vary across different executions and different platforms. The name
-    // mangling we do when creating the function names should ensure that there are no possible name collisions.
-    using seg_f_list_t
-        = std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
-                   llvm_func_name_compare>;
+            taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args));
+        }
+    }
+}
+
+// List of evaluation functions in a segment.
+//
+// This map contains a list of functions for the compact-mode evaluation of Taylor derivatives.
+// Each function is mapped to a pair, containing:
+//
+// - the number of times the function is to be invoked,
+// - a list of functors (generators) that generate the arguments for
+//   the invocation.
+//
+// NOTE: we use maps with name-based comparison for the functions. This ensures that the order in which these
+// functions are invoked is always the same. If we directly used pointer
+// comparisons instead, the order could vary across different executions and different platforms. The name
+// mangling we do when creating the function names should ensure that there are no possible name collisions.
+using taylor_cm_seg_f_list_t
+    = std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
+               llvm_func_name_compare>;
+
+// Helper to codegen the computation of the Taylor derivatives for a segment.
+//
+// seg is the segment, start_u_idx the index of the first u variable in the segment, s the llvm state
+// we are operating in, fp_t the internal scalar floating-point type, batch_size the batch size, n_uvars
+// the total number of u variables, high_accuracy the high accuracy flag.
+taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint32_t start_u_idx, llvm_state &s,
+                                                      llvm::Type *fp_t, std::uint32_t batch_size, std::uint32_t n_uvars,
+                                                      bool high_accuracy)
+{
+    // Fetch the internal vector type.
+    auto *fp_vec_type = make_vector_type(fp_t, batch_size);
+
+    // Fetch the current builder.
+    auto &bld = s.builder();
+
+    // This structure maps a function to sets of arguments
+    // with which the function is to be called. For instance, if function
+    // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
+    // will contain {f : [[a, b, c], [d, e, f]]}.
+    // After construction, we have verified that for each function
+    // in the map the sets of arguments all have the same size.
+    // NOTE: again, here and below we use name-based ordered maps for the functions.
+    // This ensures that the invocations of cm_make_arg_gen_*(), which create several
+    // global variables, always happen in a well-defined order. If we used an unordered map instead,
+    // the variables would be created in a "random" order, which would result in an
+    // unnecessary miss for the in-memory cache machinery when two logically-identical
+    // LLVM modules are considered different because of the difference in the order
+    // of declaration of global variables.
+    std::map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>, llvm_func_name_compare>
+        tmp_map;
+
+    for (const auto &ex : seg) {
+        // Get the function for the computation of the derivative.
+        auto *func = taylor_c_diff_func(s, fp_t, ex.first, n_uvars, batch_size, high_accuracy);
+
+        // Insert the function into tmp_map.
+        const auto [it, is_new_func] = tmp_map.try_emplace(func);
+
+        assert(is_new_func || !it->second.empty()); // LCOV_EXCL_LINE
+
+        // Convert the variables/constants in the current dc
+        // element into a set of indices/constants.
+        const auto cdiff_args = udef_to_variants(ex.first, ex.second);
+
+        // LCOV_EXCL_START
+        if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
+            throw std::invalid_argument(
+                fmt::format("Inconsistent arity detected in a Taylor derivative function in compact "
+                            "mode: the same function is being called with both {} and {} arguments",
+                            it->second.back().size() - 1u, cdiff_args.size()));
+        }
+        // LCOV_EXCL_STOP
+
+        // Add the new set of arguments.
+        it->second.emplace_back();
+        // Add the idx of the u variable.
+        it->second.back().emplace_back(start_u_idx);
+        // Add the actual function arguments.
+        it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end());
+
+        // Update start_u_idx.
+        ++start_u_idx;
+    }
+
+    // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
+    // to {f : [[a, d], [b, e], [c, f]]}.
+    std::map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>,
+             llvm_func_name_compare>
+        tmp_map_transpose;
+    for (const auto &[func, vv] : tmp_map) {
+        assert(!vv.empty()); // LCOV_EXCL_LINE
+
+        // Add the function.
+        const auto [it, ins_status] = tmp_map_transpose.try_emplace(func);
+        assert(ins_status); // LCOV_EXCL_LINE
+
+        const auto n_calls = vv.size();
+        const auto n_args = vv[0].size();
+        // NOTE: n_args must be at least 1 because the u idx
+        // is prepended to the actual function arguments in
+        // the tmp_map entries.
+        assert(n_args >= 1u); // LCOV_EXCL_LINE
+
+        for (decltype(vv[0].size()) i = 0; i < n_args; ++i) {
+            // Build the vector of values corresponding
+            // to the current argument index.
+            std::vector<std::variant<std::uint32_t, number>> tmp_c_vec;
+            for (decltype(vv.size()) j = 0; j < n_calls; ++j) {
+                tmp_c_vec.push_back(vv[j][i]);
+            }
+
+            // Turn tmp_c_vec (a vector of variants) into a variant
+            // of vectors, and insert the result.
+            it->second.push_back(vv_transpose(tmp_c_vec));
+        }
+    }
+
+    // Create the taylor_cm_seg_f_list_t for the current segment.
+    taylor_cm_seg_f_list_t seg_map;
+
+    for (const auto &[func, vv] : tmp_map_transpose) {
+        // NOTE: vv.size() is now the number of arguments. We know it cannot
+        // be zero because the functions to compute the Taylor derivatives
+        // in compact mode always have at least 1 argument (i.e., the index
+        // of the u variable whose derivative is being computed).
+        assert(!vv.empty()); // LCOV_EXCL_LINE
+
+        // Add the function.
+        const auto [it, ins_status] = seg_map.try_emplace(func);
+        assert(ins_status); // LCOV_EXCL_LINE
+
+        // Set the number of calls for this function.
+        it->second.first
+            = std::visit([](const auto &x) { return boost::numeric_cast<std::uint32_t>(x.size()); }, vv[0]);
+        assert(it->second.first > 0u); // LCOV_EXCL_LINE
+
+        // Create the g functions for each argument.
+        for (const auto &v : vv) {
+            it->second.second.push_back(std::visit(
+                [&s, fp_t](const auto &x) {
+                    using type = uncvref_t<decltype(x)>;
+
+                    if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
+                        return cm_make_arg_gen_vidx(s, x);
+                    } else {
+                        return cm_make_arg_gen_vc(s, fp_t, x);
+                    }
+                },
+                v));
+        }
+    }
+
+    // Fetch the arguments from the driver prototype.
+    auto *driver_f = bld.GetInsertBlock()->getParent();
+    assert(driver_f != nullptr);
+    assert(driver_f->arg_size() == 4u);
+    auto *tape_ptr = driver_f->args().begin();
+    auto *par_ptr = driver_f->args().begin() + 1;
+    auto *time_ptr = driver_f->args().begin() + 2;
+    auto *cur_order = driver_f->args().begin() + 3;
+
+    // Compute the derivatives for this segment.
+    for (const auto &[func, fpair] : seg_map) {
+        const auto &[ncalls, gens] = fpair;
+
+        taylor_cm_codegen_block_diff(s, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type,
+                                     n_uvars);
+    }
+
+    return seg_map;
+}
+
+// Helper to codegen the computation of the Taylor derivatives in compact mode via
+// driver functions implemented across multiple LLVM states. main_state is the state in which the stepper is defined,
+// main_fp_t the internal scalar floating-point type as defined in the main state,
+// main_par/main_time/main_tape_ptr the parameters/time/tape pointers as defined in the
+// main state, dc the Taylor decomposition, s_dc its segmented counterpart, n_eq the number
+// of equations/state variables, n_uvars the total number of u variables, order the Taylor order,
+// batch_size the batch size, high_accuracy the high accuracy flag, parallel_mode the parallel mode flag, max_svf_idx
+// the maximum index in the decomposition of the sv funcs (or zero if there are no sv funcs).
+//
+// The return value is a list of states in which the driver functions have been defined.
+std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t,
+                                                 llvm::Value *main_par_ptr, llvm::Value *main_time_ptr,
+                                                 llvm::Value *main_tape_ptr, const taylor_dc_t &dc,
+                                                 const std::vector<taylor_dc_t> &s_dc, std::uint32_t n_eq,
+                                                 std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
+                                                 bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx)
+{
+    // TODO implement.
+    (void)parallel_mode;
 
     // Init the list of states.
     // NOTE: we use lists here because it is convenient to have
@@ -557,115 +821,23 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv
     states.push_back(main_state.make_similar());
     auto *cur_state = &states.back();
 
-    // Index of the state we are currently operating on.
-    boost::safe_numerics::safe<unsigned> cur_state_idx = 0;
-
-    // Helper to create and return the prototype of a driver function in the state s.
-    auto make_driver_proto = [](llvm_state &s, unsigned cur_idx) {
-        auto &builder = s.builder();
-        auto &md = s.module();
-        auto &ctx = s.context();
-
-        // The arguments to the driver are:
-        // - a pointer to the tape,
-        // - pointers to par and time,
-        // - the current diff order.
-        auto *ptr_tp = llvm::PointerType::getUnqual(ctx);
-        std::vector<llvm::Type *> fargs{ptr_tp, ptr_tp, ptr_tp, builder.getInt32Ty()};
-
-        // The driver does not return anything.
-        auto *ft = llvm::FunctionType::get(builder.getVoidTy(), fargs, false);
-        assert(ft != nullptr); // LCOV_EXCL_LINE
-
-        // Now create the driver.
-        const auto cur_name = fmt::format("heyoka.cm_jet.driver_{}", cur_idx);
-        auto *f = llvm_func_create(ft, llvm::Function::ExternalLinkage, cur_name, &md);
-        // NOTE: the driver cannot call itself recursively.
-        f->addFnAttr(llvm::Attribute::NoRecurse);
-
-        // Add the arguments' attributes.
-        // NOTE: no aliasing is assumed between the pointer
-        // arguments.
-        auto *tape_arg = f->args().begin();
-        tape_arg->setName("tape_ptr");
-        tape_arg->addAttr(llvm::Attribute::NoCapture);
-        tape_arg->addAttr(llvm::Attribute::NoAlias);
-
-        auto *par_ptr_arg = tape_arg + 1;
-        par_ptr_arg->setName("par_ptr");
-        par_ptr_arg->addAttr(llvm::Attribute::NoCapture);
-        par_ptr_arg->addAttr(llvm::Attribute::NoAlias);
-        par_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
-
-        auto *time_ptr_arg = tape_arg + 2;
-        time_ptr_arg->setName("time_ptr");
-        time_ptr_arg->addAttr(llvm::Attribute::NoCapture);
-        time_ptr_arg->addAttr(llvm::Attribute::NoAlias);
-        time_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
-
-        return f;
-    };
+    // Generate the global arrays for the computation of the derivatives
+    // of the state variables in the main state.
+    const auto svd_gl = taylor_c_make_sv_diff_globals(main_state, main_fp_t, dc, n_uvars);
 
-    // TODO doc fix.
-    // Helper to compute the Taylor derivatives for a block.
-    // func is the LLVM function for the computation of the Taylor derivative in the block,
-    // ncalls the number of times it must be called, gens the generators for the
-    // function arguments and cur_order the order of the derivative. s is the llvm state
-    // in which we are computing the derivatives.
-    auto block_diff = [n_uvars](llvm_state &s, llvm::Function *func, std::uint32_t ncalls, const auto &gens,
-                                llvm::Value *tape_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr,
-                                llvm::Value *cur_order, llvm::Type *fp_vec_type) {
-        // LCOV_EXCL_START
-        assert(ncalls > 0u);
-        assert(!gens.empty());
-        assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast<bool>(f); }));
-        // LCOV_EXCL_STOP
+    // Structure used to log, in trace mode, the breakdown of each segment.
+    // For each segment, this structure contains the number of invocations
+    // of each function in the segment. It will be unused if we are not tracing.
+    std::vector<std::vector<std::uint32_t>> segment_bd;
 
-        // Fetch the builder for the current state.
-        auto &bld = s.builder();
-
-        // We will be manually unrolling loops if ncalls is small enough.
-        // This seems to help with compilation times.
-        constexpr auto max_unroll_n = 5u;
-
-        if (ncalls > max_unroll_n) {
-            // Loop over the number of calls.
-            llvm_loop_u32(s, bld.getInt32(0), bld.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
-                // Create the u variable index from the first generator.
-                auto u_idx = gens[0](cur_call_idx);
-
-                // Initialise the vector of arguments with which func must be called. The following
-                // initial arguments are always present:
-                // - current Taylor order,
-                // - u index of the variable,
-                // - tape of derivatives,
-                // - pointer to the param values,
-                // - pointer to the time value(s).
-                std::vector<llvm::Value *> args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr};
-
-                // Create the other arguments via the generators.
-                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
-                    args.push_back(gens[i](cur_call_idx));
-                }
+    // Are we tracing?
+    const auto is_tracing = get_logger()->should_log(spdlog::level::trace);
 
-                // Calculate the derivative and store the result.
-                taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args));
-            });
-        } else {
-            // The manually-unrolled version of the above.
-            for (std::uint32_t idx = 0; idx < ncalls; ++idx) {
-                auto *cur_call_idx = bld.getInt32(idx);
-                auto u_idx = gens[0](cur_call_idx);
-                std::vector<llvm::Value *> args{cur_order, u_idx, tape_ptr, par_ptr, time_ptr};
-
-                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
-                    args.push_back(gens[i](cur_call_idx));
-                }
+    // Do we need to compute the last-order derivatives for the sv_funcs?
+    const auto need_svf_lo = max_svf_idx >= n_eq;
 
-                taylor_c_store_diff(s, fp_vec_type, tape_ptr, n_uvars, cur_order, u_idx, bld.CreateCall(func, args));
-            }
-        }
-    };
+    // Index of the state we are currently operating on.
+    boost::safe_numerics::safe<unsigned> cur_state_idx = 0;
 
     // NOTE: unlike in compiled functions, we cannot at the same time
     // declare and invoke the drivers from the main module as the invocation
@@ -677,12 +849,17 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv
     // Declarations of the drivers in the main state.
     std::vector<llvm::Function *> main_driver_decls;
     // Add the declaration for the first driver.
-    main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx));
+    main_driver_decls.push_back(taylor_cm_make_driver_proto(main_state, cur_state_idx));
+
+    // The driver function for the evaluation of the segment
+    // containing max_svf_idx. Will remain null if we do not need
+    // to compute the last-order derivatives for the sv funcs.
+    llvm::Function *max_svf_driver = nullptr;
 
     // Add the driver declaration to the current state,
     // and start insertion into the driver.
-    cur_state->builder().SetInsertPoint(
-        llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx)));
+    cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create(
+        cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx)));
 
     // Variable to keep track of how many blocks have been codegenned
     // in the current state.
@@ -693,175 +870,81 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv
     // needs more investigation.
     constexpr auto max_n_cg_blocks = 20u;
 
-    // Variable to keep track of the u variable
-    // on whose definition we are operating.
-    auto cur_u_idx = n_eq;
+    // Variable to keep track of the index of the first u variable
+    // in a segment.
+    auto start_u_idx = n_eq;
 
-    // Iterate over the segments in s_dc.
-    for (const auto &seg : s_dc) {
-        if (n_cg_blocks > max_n_cg_blocks) {
-            // We have codegenned enough blocks for this state. Create the return
-            // value for the current driver, and move to the next one.
-            cur_state->builder().CreateRetVoid();
-
-            // Create the new current state.
-            states.push_back(main_state.make_similar());
-            cur_state = &states.back();
-
-            // Reset/update the counters.
-            n_cg_blocks = 0;
-            ++cur_state_idx;
-
-            // Add the driver declaration to the main state.
-            main_driver_decls.push_back(make_driver_proto(main_state, cur_state_idx));
-
-            // Add the driver declaration to the current state,
-            // and start insertion into the driver.
-            cur_state->builder().SetInsertPoint(
-                llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx)));
-        }
-
-        // Fetch the internal fp type and its vector counterpart for the current state.
-        auto *fp_t = llvm_clone_type(*cur_state, main_fp_t);
-        auto *fp_vec_type = make_vector_type(fp_t, batch_size);
-
-        // Fetch the current builder.
-        auto &cur_builder = cur_state->builder();
-
-        // This structure maps a function to sets of arguments
-        // with which the function is to be called. For instance, if function
-        // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
-        // will contain {f : [[a, b, c], [d, e, f]]}.
-        // After construction, we have verified that for each function
-        // in the map the sets of arguments have all the same size.
-        // NOTE: again, here and below we use name-based ordered maps for the functions.
-        // This ensures that the invocations of cm_make_arg_gen_*(), which create several
-        // global variables, always happen in a well-defined order. If we used an unordered map instead,
-        // the variables would be created in a "random" order, which would result in a
-        // unnecessary miss for the in-memory cache machinery when two logically-identical
-        // LLVM modules are considered different because of the difference in the order
-        // of declaration of global variables.
-        std::map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>,
-                 llvm_func_name_compare>
-            tmp_map;
-
-        for (const auto &ex : seg) {
-            // Get the function for the computation of the derivative.
-            auto *func = taylor_c_diff_func(*cur_state, fp_t, ex.first, n_uvars, batch_size, high_accuracy);
-
-            // Insert the function into tmp_map.
-            const auto [it, is_new_func] = tmp_map.try_emplace(func);
-
-            assert(is_new_func || !it->second.empty()); // LCOV_EXCL_LINE
-
-            // Convert the variables/constants in the current dc
-            // element into a set of indices/constants.
-            const auto cdiff_args = udef_to_variants(ex.first, ex.second);
-
-            // LCOV_EXCL_START
-            if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
-                throw std::invalid_argument(
-                    fmt::format("Inconsistent arity detected in a Taylor derivative function in compact "
-                                "mode: the same function is being called with both {} and {} arguments",
-                                it->second.back().size() - 1u, cdiff_args.size()));
-            }
-            // LCOV_EXCL_STOP
+    // Helper to finalise the current driver function and create a new one.
+    auto start_new_driver = [&cur_state, &states, &main_state, &n_cg_blocks, &cur_state_idx, &main_driver_decls]() {
+        // Finalise the current driver.
+        cur_state->builder().CreateRetVoid();
 
-            // Add the new set of arguments.
-            it->second.emplace_back();
-            // Add the idx of the u variable.
-            it->second.back().emplace_back(cur_u_idx);
-            // Add the actual function arguments.
-            it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end());
+        // Create the new current state.
+        states.push_back(main_state.make_similar());
+        cur_state = &states.back();
 
-            ++cur_u_idx;
-        }
+        // Reset/update the counters.
+        n_cg_blocks = 0;
+        ++cur_state_idx;
 
-        // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
-        // to {f : [[a, d], [b, e], [c, f]]}.
-        std::map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>,
-                 llvm_func_name_compare>
-            tmp_map_transpose;
-        for (const auto &[func, vv] : tmp_map) {
-            assert(!vv.empty()); // LCOV_EXCL_LINE
-
-            // Add the function.
-            const auto [it, ins_status] = tmp_map_transpose.try_emplace(func);
-            assert(ins_status); // LCOV_EXCL_LINE
-
-            const auto n_calls = vv.size();
-            const auto n_args = vv[0].size();
-            // NOTE: n_args must be at least 1 because the u idx
-            // is prepended to the actual function arguments in
-            // the tmp_map entries.
-            assert(n_args >= 1u); // LCOV_EXCL_LINE
-
-            for (decltype(vv[0].size()) i = 0; i < n_args; ++i) {
-                // Build the vector of values corresponding
-                // to the current argument index.
-                std::vector<std::variant<std::uint32_t, number>> tmp_c_vec;
-                for (decltype(vv.size()) j = 0; j < n_calls; ++j) {
-                    tmp_c_vec.push_back(vv[j][i]);
-                }
+        // Add the driver declaration to the main state.
+        main_driver_decls.push_back(taylor_cm_make_driver_proto(main_state, cur_state_idx));
 
-                // Turn tmp_c_vec (a vector of variants) into a variant
-                // of vectors, and insert the result.
-                it->second.push_back(vv_transpose(tmp_c_vec));
-            }
-        }
+        // Add the driver declaration to the current state,
+        // and start insertion into the driver.
+        cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create(
+            cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx)));
+    };
 
-        // Create the seg_f_list_t for the current segment.
-        seg_f_list_t seg_map;
-
-        for (const auto &[func, vv] : tmp_map_transpose) {
-            // NOTE: vv.size() is now the number of arguments. We know it cannot
-            // be zero because the functions to compute the Taylor derivatives
-            // in compact mode always have at least 1 argument (i.e., the index
-            // of the u variable whose derivative is being computed).
-            assert(!vv.empty()); // LCOV_EXCL_LINE
-
-            // Add the function.
-            const auto [it, ins_status] = seg_map.try_emplace(func);
-            assert(ins_status); // LCOV_EXCL_LINE
-
-            // Set the number of calls for this function.
-            it->second.first
-                = std::visit([](const auto &x) { return boost::numeric_cast<std::uint32_t>(x.size()); }, vv[0]);
-            assert(it->second.first > 0u); // LCOV_EXCL_LINE
-
-            // Create the g functions for each argument.
-            for (const auto &v : vv) {
-                it->second.second.push_back(std::visit(
-                    [cur_state, fp_t](const auto &x) {
-                        using type = uncvref_t<decltype(x)>;
-
-                        if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
-                            return cm_make_arg_gen_vidx(*cur_state, x);
-                        } else {
-                            return cm_make_arg_gen_vc(*cur_state, fp_t, x);
-                        }
-                    },
-                    v));
+    // Iterate over the segments in s_dc and codegen the code for the
+    // computation of Taylor derivatives.
+    for (const auto &seg : s_dc) {
+        // Cache the number of expressions in the segment.
+        const auto seg_n_ex = static_cast<std::uint32_t>(seg.size());
+
+        // Are we in the segment containing max_svf_idx? We are if:
+        //
+        // - we need to compute the last-order derivatives of the sv funcs,
+        // - max_svf_idx is somewhere within this segment.
+        //
+        // In such a case, we create a driver specifically for this segment, which we will
+        // invoke again at the end of this function to compute the last-order derivatives
+        // of the sv funcs.
+        const auto is_svf_seg = need_svf_lo && max_svf_idx >= start_u_idx && max_svf_idx < (start_u_idx + seg_n_ex);
+
+        if (n_cg_blocks > max_n_cg_blocks || is_svf_seg) {
+            // Either we have codegenned enough blocks for this state, or we are
+            // in the max_svf_idx segment. Finalise the current driver and start a new one.
+            start_new_driver();
+
+            // Assign max_svf_driver if needed.
+            if (is_svf_seg) {
+                assert(max_svf_driver == nullptr);
+                max_svf_driver = main_driver_decls.back();
             }
         }
 
-        // Fetch the arguments from the driver prototype.
-        auto *driver_f = cur_builder.GetInsertBlock()->getParent();
-        auto *tape_ptr = driver_f->args().begin();
-        auto *par_ptr = driver_f->args().begin() + 1;
-        auto *time_ptr = driver_f->args().begin() + 2;
-        auto *cur_order = driver_f->args().begin() + 3;
-
-        // Compute the derivatives for this segment.
-        for (const auto &[func, fpair] : seg_map) {
-            const auto &[ncalls, gens] = fpair;
+        // Fetch the internal fp type for the current state.
+        auto *fp_t = llvm_clone_type(*cur_state, main_fp_t);
 
-            block_diff(*cur_state, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type);
-        }
+        // Codegen the computation of the derivatives for this segment.
+        const auto seg_map
+            = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars, high_accuracy);
 
         // Update the number of codegenned blocks.
         n_cg_blocks += seg_map.size();
 
+        // Update start_u_idx.
+        start_u_idx += seg_n_ex;
+
+        // If we codegenned the max_svf_idx driver, immediately start a new driver.
+        // We want the max_svf_idx driver to contain the codegen for a single segment
+        // and nothing more, otherwise we end up doing unnecessary work when computing
+        // the last-order derivatives of the sv funcs.
+        if (is_svf_seg) {
+            start_new_driver();
+        }
+
         // LCOV_EXCL_START
         // Update segment_bd if needed.
         if (is_tracing) {
@@ -907,40 +990,43 @@ auto taylor_compute_jet_multi(llvm_state &main_state, llvm::Type *main_fp_t, llv
     taylor_c_compute_sv_diffs(main_state, main_fp_t, svd_gl, main_tape_ptr, main_par_ptr, n_uvars,
                               main_bld.getInt32(order), batch_size);
 
-    // Compute the last-order derivatives for the sv_funcs, if any. Because the sv funcs
+    // Finally, we compute the last-order derivatives for the sv_funcs, if needed. Because the sv funcs
     // correspond to u variables somewhere in the decomposition, we will have to compute the
     // last-order derivatives of the u variables until we are sure all sv_funcs derivatives
     // have been properly computed.
-    if (max_svf_idx >= n_eq) {
-        // Monitor the starting index of the current
-        // segment while iterating on the segments.
-        auto cur_start_u_idx = n_eq;
-
-        for (decltype(s_dc.size()) seg_idx = 0; seg_idx < s_dc.size(); ++seg_idx) {
-            if (cur_start_u_idx > max_svf_idx) {
-                // We computed all the necessary derivatives, break out.
-                break;
-            }
+    if (need_svf_lo) {
+        assert(max_svf_driver != nullptr);
 
-            // Invoke the driver for the current segment.
-            main_bld.CreateCall(main_driver_decls[seg_idx],
-                                {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(order)});
+        // What we do here is to iterate over all the drivers, invoke them one by one,
+        // and break out right after max_svf_driver has been invoked.
+        for (auto *cur_driver_f : main_driver_decls) {
+            main_bld.CreateCall(cur_driver_f, {main_tape_ptr, main_par_ptr, main_time_ptr, main_bld.getInt32(order)});
 
-            // Update cur_start_u_idx.
-            cur_start_u_idx += static_cast<std::uint32_t>(s_dc[seg_idx].size());
+            if (cur_driver_f == max_svf_driver) {
+                break;
+            }
         }
     }
+
+    // Return the states.
+    // NOTE: in C++23 we could use std::ranges::views::as_rvalue instead of
+    // the custom transform:
+    //
+    // https://en.cppreference.com/w/cpp/ranges/as_rvalue_view
+    auto sview = states | std::views::transform([](auto &s) -> auto && { return std::move(s); });
+    return std::vector(std::ranges::begin(sview), std::ranges::end(sview));
 }
 
 // Helper for the computation of a jet of derivatives in compact mode,
-// used in taylor_compute_jet(). The return value are the size/alignment
-// requirements for the tape of derivatives. All LLVM values and types
-// passed to this function are defined in the main state.
-std::array<std::size_t, 2> taylor_compute_jet_compact_mode(
+// used in taylor_compute_jet(). The return values are the size/alignment
+// requirements for the tape of derivatives and the list of states in which
+// the drivers are implemented. All LLVM values and types passed to this function are defined in the main state.
+std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>> taylor_compute_jet_compact_mode(
     // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-    llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
-    llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq,
-    std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode)
+    llvm_state &main_state, llvm::Type *main_fp_t, llvm::Value *order0, llvm::Value *main_par_ptr,
+    llvm::Value *main_time_ptr, llvm::Value *main_tape_ptr, const taylor_dc_t &dc,
+    const std::vector<std::uint32_t> &sv_funcs_dc, std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order,
+    std::uint32_t batch_size, bool high_accuracy, bool parallel_mode)
 {
     auto &main_bld = main_state.builder();
     auto &main_md = main_state.module();
@@ -990,9 +1076,9 @@ std::array<std::size_t, 2> taylor_compute_jet_compact_mode(
     // lifetime of tape_ptr begins here and ends at the end of the function,
     // so that LLVM can assume that any value stored in it cannot be possibly
     // used outside this function.
-    main_bld.CreateLifetimeStart(tape_ptr, main_bld.getInt64(tape_sz));
+    main_bld.CreateLifetimeStart(main_tape_ptr, main_bld.getInt64(tape_sz));
 
-    // Copy over the order-0 derivatives of the state variables.
+    // Copy the order-0 derivatives of the state variables into the tape.
     // NOTE: overflow checking is already done in the parent function.
     llvm_loop_u32(main_state, main_bld.getInt32(0), main_bld.getInt32(n_eq), [&](llvm::Value *cur_var_idx) {
         // Fetch the pointer from order0.
@@ -1003,13 +1089,17 @@ std::array<std::size_t, 2> taylor_compute_jet_compact_mode(
         auto *vec = ext_load_vector_from_memory(main_state, main_fp_t, ptr, batch_size);
 
         // Store into tape_ptr.
-        taylor_c_store_diff(main_state, main_fp_vec_t, tape_ptr, n_uvars, main_bld.getInt32(0), cur_var_idx, vec);
+        taylor_c_store_diff(main_state, main_fp_vec_t, main_tape_ptr, n_uvars, main_bld.getInt32(0), cur_var_idx, vec);
     });
 
+    // Codegen the computation of the Taylor derivatives across multiple states.
+    auto states = taylor_compute_jet_multi(main_state, main_fp_t, main_par_ptr, main_time_ptr, main_tape_ptr, dc, s_dc,
+                                           n_eq, n_uvars, order, batch_size, high_accuracy, parallel_mode, max_svf_idx);
+
     get_logger()->trace("Taylor IR creation compact mode runtime: {}", sw);
 
-    // Return the array of derivatives of the u variables and its type.
-    return std::make_pair(diff_arr, static_cast<llvm::Type *>(diff_array_type));
+    // Return the tape size/alignment and the list of states containing the drivers.
+    return std::make_pair(std::array<std::size_t, 2>{tape_sz, tape_al}, std::move(states));
 }
 
 // Given an input pointer 'in', load the first n * batch_size values in it as n vectors
@@ -1054,10 +1144,11 @@ auto taylor_load_values(llvm_state &s, llvm::Type *fp_t, llvm::Value *in, std::u
 // order0, par_ptr and time_ptr are all external pointers.
 //
 // The return value is a variant containing either:
-// - in compact mode, the size/alignment requirements for the tape of derivatives,
-// - otherwise, the jet of derivatives of the state variables and sv_funcs
+// - in compact mode, the size/alignment requirements for the tape of derivatives
+//   and the list of states in which the driver functions are implemented, or
+// - the jet of derivatives of the state variables and sv_funcs
 //   up to order 'order'.
-std::variant<std::array<std::size_t, 2>, std::vector<llvm::Value *>>
+std::variant<std::pair<std::array<std::size_t, 2>, std::vector<llvm_state>>, std::vector<llvm::Value *>>
 taylor_compute_jet(llvm_state &s, llvm::Type *fp_t, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
                    llvm::Value *tape_ptr, const taylor_dc_t &dc, const std::vector<std::uint32_t> &sv_funcs_dc,
                    std::uint32_t n_eq, std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp
index ffaa6b5f2..5416a9278 100644
--- a/src/taylor_adaptive.cpp
+++ b/src/taylor_adaptive.cpp
@@ -43,6 +43,7 @@
 #endif
 
 #include <heyoka/continuous_output.hpp>
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/dfloat.hpp>
 #include <heyoka/detail/ed_data.hpp>
 #include <heyoka/detail/event_detection.hpp>
@@ -56,6 +57,7 @@
 #include <heyoka/exceptions.hpp>
 #include <heyoka/expression.hpp>
 #include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
 #include <heyoka/s11n.hpp>
 #include <heyoka/step_callback.hpp>
 #include <heyoka/taylor.hpp>
@@ -174,13 +176,13 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
                                             std::vector<nt_event_t> ntes, bool parallel_mode,
                                             [[maybe_unused]] std::optional<long long> prec)
 {
-    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_pars);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_time);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_last_h);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tol);
@@ -191,6 +193,8 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_vsys);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tm_data);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape_sa);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape);
 
     // NOTE: this must hold because tol == 0 is interpreted
     // as undefined in finalise_ctor().
@@ -394,7 +398,8 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
     auto *fp_t = detail::internal_llvm_type_like(std::get<0>(m_llvm_state), m_tol);
 
     // The state(s) which will be returned by the construction of the stepper function.
-    std::variant<llvm_state, std::vector<llvm_state>> states;
+    // If we are not in compact mode, this vector will remain empty.
+    std::vector<llvm_state> states;
 
     // Add the stepper function.
     if (with_events) {
@@ -407,11 +412,10 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
             ee.push_back(ev.get_expression());
         }
 
-        std::tie(m_dc, states)
-            = detail::taylor_add_adaptive_step_with_events(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step_e", sys, 1,
-                                                           compact_mode, ee, high_accuracy, parallel_mode, m_order);
+        std::tie(m_dc, m_tape_sa, states) = detail::taylor_add_adaptive_step_with_events(
+            std::get<0>(m_llvm_state), fp_t, "step_e", sys, 1, compact_mode, ee, high_accuracy, parallel_mode, m_order);
     } else {
-        std::tie(m_dc, states)
+        std::tie(m_dc, m_tape_sa, states)
             = detail::taylor_add_adaptive_step(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step", sys, 1, high_accuracy,
                                                compact_mode, parallel_mode, m_order);
     }
@@ -440,27 +444,43 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
     // Log runtimes in trace mode.
     spdlog::stopwatch sw;
 
-    // Add the function for the computation of
-    // the dense output.
-    detail::taylor_add_d_out_function(m_llvm, fp_t, m_dim, m_order, 1, high_accuracy);
+    // Add the function for the computation of the dense output.
+    // NOTE: in compact mode, the dense output function will be added to the main state.
+    detail::taylor_add_d_out_function(std::get<0>(m_llvm_state), fp_t, m_dim, m_order, 1, high_accuracy);
 
     detail::get_logger()->trace("Taylor dense output runtime: {}", sw);
     sw.reset();
 
-    // Run the jit.
-    m_llvm.compile();
+    // Run the jit compilation.
+    if (compact_mode) {
+        // Add the main state to the list of states.
+        states.push_back(std::move(std::get<0>(m_llvm_state)));
 
-    detail::get_logger()->trace("Taylor LLVM compilation runtime: {}", sw);
+        // Reverse the list of states so that compilation starts
+        // with the main state, which may be bigger.
+        std::ranges::reverse(states);
 
-    // Fetch the stepper.
-    if (with_events) {
-        m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(m_llvm.jit_lookup("step_e"));
+        // Create the multi state and assign it.
+        m_llvm_state = llvm_multi_state(std::move(states));
+
+        // Compile.
+        std::get<1>(m_llvm_state).compile();
+
+        // Create the storage for the tape of derivatives.
+        const auto [sz, al] = m_tape_sa;
+        m_tape = detail::make_aligned_buffer(sz, al);
     } else {
-        m_step_f = reinterpret_cast<typename i_data::step_f_t>(m_llvm.jit_lookup("step"));
+        std::get<0>(m_llvm_state).compile();
     }
 
+    detail::get_logger()->trace("Taylor LLVM compilation runtime: {}", sw);
+
+    // Fetch the stepper.
+    assign_stepper(with_events);
+
     // Fetch the function to compute the dense output.
-    m_d_out_f = reinterpret_cast<typename i_data::d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
+    m_d_out_f = std::visit(
+        [](auto &s) { return reinterpret_cast<typename i_data::d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
 
     // Setup the vector for the Taylor coefficients.
     using su32_t = boost::safe_numerics::safe<std::uint32_t>;
@@ -489,27 +509,27 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
 #endif
 
     // Init the event data structure if needed.
-    // NOTE: this can be done in parallel with the rest of the constructor,
+    // NOTE: in principle this can be done in parallel with the rest of the constructor,
     // once we have m_order/m_dim and we are done using tes/ntes.
     if (with_events) {
-        m_ed_data = std::make_unique<ed_data>(m_llvm.make_similar(), std::move(tes), std::move(ntes), m_order, m_dim,
-                                              m_state[0]);
+        m_ed_data = std::make_unique<ed_data>(m_tplt_state.make_similar(), std::move(tes), std::move(ntes), m_order,
+                                              m_dim, m_state[0]);
     }
 
     if (auto_ic_setup) {
         // Finish the automatic setup of the ics for a variational
         // integrator.
-        detail::setup_variational_ics_t0(m_llvm, m_state, m_pars, &m_time.hi, std::get<1>(vsys), 1, m_high_accuracy,
-                                         m_compact_mode);
+        detail::setup_variational_ics_t0(m_tplt_state, m_state, m_pars, &m_time.hi, std::get<1>(vsys), 1,
+                                         m_high_accuracy, m_compact_mode);
     }
 
     if (is_variational) {
 #if defined(HEYOKA_HAVE_REAL)
         if constexpr (std::is_same_v<T, mppp::real>) {
-            m_tm_data.emplace(std::get<1>(vsys), static_cast<long long>(this->get_prec()), m_llvm, 1);
+            m_tm_data.emplace(std::get<1>(vsys), static_cast<long long>(this->get_prec()), m_tplt_state, 1);
         } else {
 #endif
-            m_tm_data.emplace(std::get<1>(vsys), 0, m_llvm, 1);
+            m_tm_data.emplace(std::get<1>(vsys), 0, m_tplt_state, 1);
 #if defined(HEYOKA_HAVE_REAL)
         }
 #endif
@@ -548,11 +568,7 @@ taylor_adaptive<T>::taylor_adaptive(const taylor_adaptive &other)
     : base_t(static_cast<const base_t &>(other)), m_i_data(std::make_unique<i_data>(*other.m_i_data)),
       m_ed_data(other.m_ed_data ? std::make_unique<ed_data>(*other.m_ed_data) : nullptr)
 {
-    if (m_ed_data) {
-        m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(m_i_data->m_llvm.jit_lookup("step_e"));
-    } else {
-        m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_t>(m_i_data->m_llvm.jit_lookup("step"));
-    }
+    assign_stepper(static_cast<bool>(m_ed_data));
 }
 
 template <typename T>
@@ -615,12 +631,8 @@ void taylor_adaptive<T>::load_impl(Archive &ar, unsigned version)
         ar >> m_i_data;
         ar >> m_ed_data;
 
-        // Recover the function pointers.
-        if (m_ed_data) {
-            m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(m_i_data->m_llvm.jit_lookup("step_e"));
-        } else {
-            m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_t>(m_i_data->m_llvm.jit_lookup("step"));
-        }
+        // Recover the stepper.
+        assign_stepper(static_cast<bool>(m_ed_data));
         // LCOV_EXCL_START
     } catch (...) {
         // Reset to def-cted state in case of exceptions.
@@ -706,14 +718,20 @@ std::tuple<taylor_outcome, T> taylor_adaptive<T>::step_impl(T max_delta_t, bool
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape);
 
     auto h = max_delta_t;
 
-    if (m_step_f.index() == 0u) {
+    if (m_step_f.index() == 0u || m_step_f.index() == 2u) {
         assert(!m_ed_data); // LCOV_EXCL_LINE
 
         // Invoke the vanilla stepper.
-        std::get<0>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr);
+        if (m_step_f.index() == 0u) {
+            std::get<0>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr);
+        } else {
+            std::get<2>(m_step_f)(m_state.data(), m_pars.data(), &m_time.hi, &h, wtc ? m_tc.data() : nullptr,
+                                  m_tape.get());
+        }
 
         // Update the time.
         m_time += h;
@@ -737,7 +755,12 @@ std::tuple<taylor_outcome, T> taylor_adaptive<T>::step_impl(T max_delta_t, bool
         // Invoke the stepper for event handling. We will record the norm infinity of the state vector +
         // event equations at the beginning of the timestep for later use.
         auto max_abs_state = detail::num_zero_like(h);
-        std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state);
+        if (m_step_f.index() == 1u) {
+            std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state);
+        } else {
+            std::get<3>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), &m_time.hi, &h, &max_abs_state,
+                                  m_tape.get());
+        }
 
         // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for
         // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that
@@ -1043,7 +1066,7 @@ taylor_adaptive<T>::propagate_until_impl(detail::dfloat<T> t, std::size_t max_st
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order);
-    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy);
 
     // Check the current time.
@@ -1129,7 +1152,7 @@ taylor_adaptive<T>::propagate_until_impl(detail::dfloat<T> t, std::size_t max_st
             }
 
             // Construct the return value.
-            continuous_output<T> ret(m_llvm.make_similar());
+            continuous_output<T> ret(m_tplt_state.make_similar());
 
             // Fill in the data.
             ret.m_tcs = std::move(c_out_tcs);
@@ -1661,9 +1684,9 @@ taylor_adaptive<T>::propagate_grid_impl(std::vector<T> grid, std::size_t max_ste
 }
 
 template <typename T>
-const llvm_state &taylor_adaptive<T>::get_llvm_state() const
+const std::variant<llvm_state, llvm_multi_state> &taylor_adaptive<T>::get_llvm_state() const
 {
-    return m_i_data->m_llvm;
+    return m_i_data->m_llvm_state;
 }
 
 template <typename T>
@@ -1894,6 +1917,29 @@ void taylor_adaptive<T>::check_variational(const char *fname) const
     }
 }
 
+// Helper to look up the stepper function in m_llvm_state and assign it to m_step_f.
+template <typename T>
+void taylor_adaptive<T>::assign_stepper(bool with_events)
+{
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state);
+
+    if (with_events) {
+        if (m_compact_mode) {
+            m_step_f = reinterpret_cast<typename i_data::c_step_f_e_t>(std::get<1>(m_llvm_state).jit_lookup("step_e"));
+        } else {
+            m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(std::get<0>(m_llvm_state).jit_lookup("step_e"));
+        }
+    } else {
+        if (m_compact_mode) {
+            m_step_f = reinterpret_cast<typename i_data::c_step_f_t>(std::get<1>(m_llvm_state).jit_lookup("step"));
+        } else {
+            m_step_f = reinterpret_cast<typename i_data::step_f_t>(std::get<0>(m_llvm_state).jit_lookup("step"));
+        }
+    }
+}
+
 template <typename T>
 const std::vector<expression> &taylor_adaptive<T>::get_vargs() const
 {

From 0744c61387606ab83e9845638aadb122dfa75e11 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 26 Aug 2024 15:16:08 +0200
Subject: [PATCH 04/30] [skip ci]


From 94ef42a76ba06d9fdcce945a5d916373c045cee8 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Mon, 26 Aug 2024 16:53:42 +0200
Subject: [PATCH 05/30] Implement parallel compilation for batch-mode
 integrators too, test adaptations.

---
 include/heyoka/detail/i_data.hpp |  22 ++++-
 include/heyoka/taylor.hpp        |   3 +-
 src/detail/i_data.cpp            |  54 ++++++++++--
 src/taylor_adaptive_batch.cpp    | 143 +++++++++++++++++++++----------
 test/llvm_state.cpp              |   2 +-
 test/logical.cpp                 |  18 ++--
 test/model_nbody.cpp             |   8 +-
 test/opt_checks.cpp              |  25 +++---
 test/rel.cpp                     |   8 +-
 test/select.cpp                  |  12 +--
 test/taylor_adaptive.cpp         |   4 +-
 test/taylor_adaptive_batch.cpp   |   6 +-
 test/taylor_adaptive_mp.cpp      |   2 +-
 test/taylor_atan2.cpp            |   8 +-
 test/taylor_kepE.cpp             |   8 +-
 test/taylor_kepF.cpp             |  16 ++--
 test/taylor_kepF_mp.cpp          |   3 +-
 test/taylor_pow.cpp              |  32 +++----
 test/taylor_prod.cpp             |  12 +--
 test/taylor_relu.cpp             |  20 ++---
 test/taylor_relu_mp.cpp          |  10 +--
 test/taylor_square.cpp           |  16 ++--
 test/test_utils.hpp              |  14 +++
 23 files changed, 283 insertions(+), 163 deletions(-)

diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp
index be49f7f49..f86c49034 100644
--- a/include/heyoka/detail/i_data.hpp
+++ b/include/heyoka/detail/i_data.hpp
@@ -145,8 +145,12 @@ struct taylor_adaptive_batch<T>::i_data {
     std::vector<T> m_state;
     // Times.
     std::vector<T> m_time_hi, m_time_lo;
-    // The LLVM machinery.
-    llvm_state m_llvm;
+    // The LLVM (multi)state.
+    std::variant<llvm_state, llvm_multi_state> m_llvm_state;
+    // A template LLVM state we keep around to create states
+    // similar to m_llvm_state as needed. This is created with the
+    // same settings as m_llvm_state.
+    llvm_state m_tplt_state;
     // Dimension of the system.
     std::uint32_t m_dim{};
     // Taylor decomposition.
@@ -159,10 +163,18 @@ struct taylor_adaptive_batch<T>::i_data {
     bool m_high_accuracy{};
     // Compact mode.
     bool m_compact_mode{};
-    // The steppers.
+    // The stepper types (non-compact mode).
     using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept;
     using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept;
-    std::variant<step_f_t, step_f_e_t> m_step_f;
+    // The stepper types (compact mode). These have an additional argument - the tape pointer.
+    using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept;
+    using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept;
+    // The stepper.
+    std::variant<step_f_t, step_f_e_t, c_step_f_t, c_step_f_e_t> m_step_f;
+    // Size/alignment for the compact mode tape.
+    std::array<std::size_t, 2> m_tape_sa{};
+    // Compact mode tape.
+    detail::aligned_buffer_t m_tape;
     // The vector of parameters.
     std::vector<T> m_pars;
     // The vector for the Taylor coefficients.
@@ -221,6 +233,8 @@ struct taylor_adaptive_batch<T>::i_data {
     i_data &operator=(i_data &&) noexcept = delete;
 
     ~i_data();
+
+    void init_cm_tape();
 };
 
 HEYOKA_END_NAMESPACE
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index 39498c1e1..b3de93017 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -909,6 +909,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch
     explicit taylor_adaptive_batch(private_ctor_t, llvm_state);
 
     HEYOKA_DLL_LOCAL void check_variational(const char *) const;
+    HEYOKA_DLL_LOCAL void assign_stepper(bool);
 
     // Input type for Taylor map computation.
     using tm_input_t = mdspan<const T, dextents<std::uint32_t, 1>>;
@@ -952,7 +953,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch
 
     ~taylor_adaptive_batch();
 
-    [[nodiscard]] const llvm_state &get_llvm_state() const;
+    [[nodiscard]] const std::variant<llvm_state, llvm_multi_state> &get_llvm_state() const;
 
     [[nodiscard]] const taylor_dc_t &get_decomposition() const;
 
diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp
index cbb27b9ac..45e431c9a 100644
--- a/src/detail/i_data.cpp
+++ b/src/detail/i_data.cpp
@@ -217,6 +217,25 @@ HEYOKA_TAYLOR_ADAPTIVE_I_DATA_INST(mppp::real)
 
 #undef HEYOKA_TAYLOR_ADAPTIVE_I_DATA_INST
 
+// Helper to initialise the compact-mode tape (a no-op outside compact mode). Assumes an empty tape.
+template <typename T>
+void taylor_adaptive_batch<T>::i_data::init_cm_tape()
+{
+    assert(!m_tape);
+
+    const auto [sz, al] = m_tape_sa;
+
+    if (m_compact_mode) {
+        assert(sz != 0u);
+        assert(al != 0u);
+
+        m_tape = detail::make_aligned_buffer(sz, al);
+    } else {
+        assert(sz == 0u);
+        assert(al == 0u);
+    }
+}
+
 template <typename T>
 void taylor_adaptive_batch<T>::i_data::save(boost::archive::binary_oarchive &ar, unsigned) const
 {
@@ -224,13 +243,15 @@ void taylor_adaptive_batch<T>::i_data::save(boost::archive::binary_oarchive &ar,
     ar << m_state;
     ar << m_time_hi;
     ar << m_time_lo;
-    ar << m_llvm;
+    ar << m_llvm_state;
+    ar << m_tplt_state;
     ar << m_dim;
     ar << m_dc;
     ar << m_order;
     ar << m_tol;
     ar << m_high_accuracy;
     ar << m_compact_mode;
+    ar << m_tape_sa;
     ar << m_pars;
     ar << m_tc;
     ar << m_last_h;
@@ -262,13 +283,15 @@ void taylor_adaptive_batch<T>::i_data::load(boost::archive::binary_iarchive &ar,
     ar >> m_state;
     ar >> m_time_hi;
     ar >> m_time_lo;
-    ar >> m_llvm;
+    ar >> m_llvm_state;
+    ar >> m_tplt_state;
     ar >> m_dim;
     ar >> m_dc;
     ar >> m_order;
     ar >> m_tol;
     ar >> m_high_accuracy;
     ar >> m_compact_mode;
+    ar >> m_tape_sa;
     ar >> m_pars;
     ar >> m_tc;
     ar >> m_last_h;
@@ -293,20 +316,31 @@ void taylor_adaptive_batch<T>::i_data::load(boost::archive::binary_iarchive &ar,
     ar >> m_tm_data;
 
     // Recover the function pointers.
-    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
+    m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
+
+    // Reconstruct the compact mode tape, if necessary.
+    m_tape.reset();
+    init_cm_tape();
 }
 
+// NOTE: this ctor provides only partial initialisation of the data members.
+// The rest of the initialisation is performed from the integrator ctor.
+// NOTE: m_llvm_state is inited as a single llvm_state regardless of the use
+// of compact mode. It will be converted into a multi state if needed at a
+// later stage.
 template <typename T>
-taylor_adaptive_batch<T>::i_data::i_data(llvm_state s) : m_llvm(std::move(s))
+taylor_adaptive_batch<T>::i_data::i_data(llvm_state s)
+    : m_llvm_state(std::move(s)), m_tplt_state(std::get<0>(m_llvm_state).make_similar())
 {
 }
 
 template <typename T>
 taylor_adaptive_batch<T>::i_data::i_data(const i_data &other)
     : m_batch_size(other.m_batch_size), m_state(other.m_state), m_time_hi(other.m_time_hi), m_time_lo(other.m_time_lo),
-      m_llvm(other.m_llvm), m_dim(other.m_dim), m_dc(other.m_dc), m_order(other.m_order), m_tol(other.m_tol),
-      m_high_accuracy(other.m_high_accuracy), m_compact_mode(other.m_compact_mode), m_pars(other.m_pars),
-      m_tc(other.m_tc), m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_pinf(other.m_pinf), m_minf(other.m_minf),
+      m_llvm_state(other.m_llvm_state), m_tplt_state(other.m_tplt_state), m_dim(other.m_dim), m_dc(other.m_dc),
+      m_order(other.m_order), m_tol(other.m_tol), m_high_accuracy(other.m_high_accuracy),
+      m_compact_mode(other.m_compact_mode), m_tape_sa(other.m_tape_sa), m_pars(other.m_pars), m_tc(other.m_tc),
+      m_last_h(other.m_last_h), m_d_out(other.m_d_out), m_pinf(other.m_pinf), m_minf(other.m_minf),
       m_delta_ts(other.m_delta_ts), m_step_res(other.m_step_res), m_prop_res(other.m_prop_res),
       m_ts_count(other.m_ts_count), m_min_abs_h(other.m_min_abs_h), m_max_abs_h(other.m_max_abs_h),
       m_cur_max_delta_ts(other.m_cur_max_delta_ts), m_pfor_ts(other.m_pfor_ts), m_t_dir(other.m_t_dir),
@@ -314,7 +348,11 @@ taylor_adaptive_batch<T>::i_data::i_data(const i_data &other)
       m_nf_detected(other.m_nf_detected), m_d_out_time(other.m_d_out_time), m_vsys(other.m_vsys),
       m_tm_data(other.m_tm_data)
 {
-    m_d_out_f = reinterpret_cast<d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
+    // Recover the function pointers.
+    m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
+
+    // Init the compact mode tape, if necessary.
+    init_cm_tape();
 }
 
 template <typename T>
diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp
index e97b4d0df..43e2f9788 100644
--- a/src/taylor_adaptive_batch.cpp
+++ b/src/taylor_adaptive_batch.cpp
@@ -41,6 +41,7 @@
 #endif
 
 #include <heyoka/continuous_output.hpp>
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/dfloat.hpp>
 #include <heyoka/detail/ed_data.hpp>
 #include <heyoka/detail/event_detection.hpp>
@@ -53,6 +54,7 @@
 #include <heyoka/exceptions.hpp>
 #include <heyoka/expression.hpp>
 #include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
 #include <heyoka/s11n.hpp>
 #include <heyoka/step_callback.hpp>
 #include <heyoka/taylor.hpp>
@@ -113,9 +115,9 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tol);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order);
-    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dc);
-    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_last_h);
@@ -138,6 +140,8 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_time);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_vsys);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tm_data);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape_sa);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape);
 
     // Init the data members.
     m_batch_size = batch_size;
@@ -243,10 +247,14 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
     m_order = detail::taylor_order_from_tol(m_tol);
 
     // Determine the external fp type.
-    auto *ext_fp_t = detail::to_external_llvm_type<T>(m_llvm.context());
+    auto *ext_fp_t = detail::to_external_llvm_type<T>(std::get<0>(m_llvm_state).context());
 
     // Determine the internal fp type.
-    auto *fp_t = detail::internal_llvm_type_like(m_llvm, m_tol);
+    auto *fp_t = detail::internal_llvm_type_like(std::get<0>(m_llvm_state), m_tol);
+
+    // The state(s) which will be returned by the construction of the stepper function.
+    // If we are not in compact mode, this vector will remain empty.
+    std::vector<llvm_state> states;
 
     // Add the stepper function.
     if (with_events) {
@@ -259,11 +267,13 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
             ee.push_back(ev.get_expression());
         }
 
-        m_dc = detail::taylor_add_adaptive_step_with_events(m_llvm, ext_fp_t, fp_t, "step_e", sys, batch_size,
-                                                            compact_mode, ee, high_accuracy, parallel_mode, m_order);
+        std::tie(m_dc, m_tape_sa, states)
+            = detail::taylor_add_adaptive_step_with_events(std::get<0>(m_llvm_state), fp_t, "step_e", sys, batch_size,
+                                                           compact_mode, ee, high_accuracy, parallel_mode, m_order);
     } else {
-        m_dc = detail::taylor_add_adaptive_step(m_llvm, ext_fp_t, fp_t, "step", sys, batch_size, high_accuracy,
-                                                compact_mode, parallel_mode, m_order);
+        std::tie(m_dc, m_tape_sa, states)
+            = detail::taylor_add_adaptive_step(std::get<0>(m_llvm_state), ext_fp_t, fp_t, "step", sys, batch_size,
+                                               high_accuracy, compact_mode, parallel_mode, m_order);
     }
 
     // Fix m_pars' size, if necessary.
@@ -282,27 +292,43 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
     // Log runtimes in trace mode.
     spdlog::stopwatch sw;
 
-    // Add the function for the computation of
-    // the dense output.
-    detail::taylor_add_d_out_function(m_llvm, ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy);
+    // Add the function for the computation of the dense output.
+    // NOTE: in compact mode, the dense output function will be added to the main state.
+    detail::taylor_add_d_out_function(std::get<0>(m_llvm_state), ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy);
 
     detail::get_logger()->trace("Taylor batch dense output runtime: {}", sw);
     sw.reset();
 
-    // Run the jit.
-    m_llvm.compile();
+    // Run the jit compilation.
+    if (compact_mode) {
+        // Add the main state to the list of states.
+        states.push_back(std::move(std::get<0>(m_llvm_state)));
 
-    detail::get_logger()->trace("Taylor batch LLVM compilation runtime: {}", sw);
+        // Reverse the list of states so that we start with the
+        // compilation of the main state first, which may be bigger.
+        std::ranges::reverse(states);
 
-    // Fetch the stepper.
-    if (with_events) {
-        m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(m_llvm.jit_lookup("step_e"));
+        // Create the multi state and assign it.
+        m_llvm_state = llvm_multi_state(std::move(states));
+
+        // Compile.
+        std::get<1>(m_llvm_state).compile();
+
+        // Create the storage for the tape of derivatives.
+        const auto [sz, al] = m_tape_sa;
+        m_tape = detail::make_aligned_buffer(sz, al);
     } else {
-        m_step_f = reinterpret_cast<typename i_data::step_f_t>(m_llvm.jit_lookup("step"));
+        std::get<0>(m_llvm_state).compile();
     }
 
+    detail::get_logger()->trace("Taylor batch LLVM compilation runtime: {}", sw);
+
+    // Fetch the stepper.
+    assign_stepper(with_events);
+
     // Fetch the function to compute the dense output.
-    m_d_out_f = reinterpret_cast<typename i_data::d_out_f_t>(m_llvm.jit_lookup("d_out_f"));
+    m_d_out_f = std::visit(
+        [](auto &s) { return reinterpret_cast<typename i_data::d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
 
     // Setup the vector for the Taylor coefficients.
     // NOTE: the size of m_state.size() already takes
@@ -351,22 +377,22 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
     m_d_out_time.resize(m_batch_size);
 
     // Init the event data structure if needed.
-    // NOTE: this can be done in parallel with the rest of the constructor,
+    // NOTE: in principle this can be done in parallel with the rest of the constructor,
     // once we have m_order/m_dim/m_batch_size and we are done using tes/ntes.
     if (with_events) {
-        m_ed_data = std::make_unique<ed_data>(m_llvm.make_similar(), std::move(tes), std::move(ntes), m_order, m_dim,
-                                              m_batch_size);
+        m_ed_data = std::make_unique<ed_data>(m_tplt_state.make_similar(), std::move(tes), std::move(ntes), m_order,
+                                              m_dim, m_batch_size);
     }
 
     if (auto_ic_setup) {
         // Finish the automatic setup of the ics for a variational
         // integrator.
-        detail::setup_variational_ics_t0(m_llvm, m_state, m_pars, m_time_hi.data(), std::get<1>(vsys), m_batch_size,
-                                         m_high_accuracy, m_compact_mode);
+        detail::setup_variational_ics_t0(m_tplt_state, m_state, m_pars, m_time_hi.data(), std::get<1>(vsys),
+                                         m_batch_size, m_high_accuracy, m_compact_mode);
     }
 
     if (is_variational) {
-        m_tm_data.emplace(std::get<1>(vsys), 0, m_llvm, m_batch_size);
+        m_tm_data.emplace(std::get<1>(vsys), 0, m_tplt_state, m_batch_size);
     }
 
     // Move vsys in.
@@ -384,11 +410,7 @@ taylor_adaptive_batch<T>::taylor_adaptive_batch(const taylor_adaptive_batch &oth
     : m_i_data(std::make_unique<i_data>(*other.m_i_data)),
       m_ed_data(other.m_ed_data ? std::make_unique<ed_data>(*other.m_ed_data) : nullptr)
 {
-    if (m_ed_data) {
-        m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(m_i_data->m_llvm.jit_lookup("step_e"));
-    } else {
-        m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_t>(m_i_data->m_llvm.jit_lookup("step"));
-    }
+    assign_stepper(static_cast<bool>(m_ed_data));
 }
 
 template <typename T>
@@ -447,12 +469,8 @@ void taylor_adaptive_batch<T>::load_impl(Archive &ar, unsigned version)
         ar >> m_i_data;
         ar >> m_ed_data;
 
-        // Recover the function pointers.
-        if (m_ed_data) {
-            m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(m_i_data->m_llvm.jit_lookup("step_e"));
-        } else {
-            m_i_data->m_step_f = reinterpret_cast<typename i_data::step_f_t>(m_i_data->m_llvm.jit_lookup("step"));
-        }
+        // Recover the stepper.
+        assign_stepper(static_cast<bool>(m_ed_data));
         // LCOV_EXCL_START
     } catch (...) {
         // Reset to def-cted state in case of exceptions.
@@ -602,6 +620,7 @@ void taylor_adaptive_batch<T>::step_impl(const std::vector<T> &max_delta_ts, boo
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_time_copy_lo);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_nf_detected);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_d_out_f);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tape);
 
     using std::abs;
     using std::isfinite;
@@ -633,11 +652,17 @@ void taylor_adaptive_batch<T>::step_impl(const std::vector<T> &max_delta_ts, boo
         return false;
     };
 
-    if (m_step_f.index() == 0u) {
+    if (m_step_f.index() == 0u || m_step_f.index() == 2u) {
         assert(!m_ed_data); // LCOV_EXCL_LINE
 
-        std::get<0>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(),
-                              wtc ? m_tc.data() : nullptr);
+        // Invoke the vanilla stepper.
+        if (m_step_f.index() == 0u) {
+            std::get<0>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(),
+                                  wtc ? m_tc.data() : nullptr);
+        } else {
+            std::get<2>(m_step_f)(m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(),
+                                  wtc ? m_tc.data() : nullptr, m_tape.get());
+        }
 
         // Update the times and the last timesteps, and write out the result.
         for (std::uint32_t i = 0; i < m_batch_size; ++i) {
@@ -673,8 +698,13 @@ void taylor_adaptive_batch<T>::step_impl(const std::vector<T> &max_delta_ts, boo
 
         // Invoke the stepper for event handling. We will record the norm infinity of the state vector +
         // event equations at the beginning of the timestep for later use.
-        std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(), m_delta_ts.data(),
-                              edd.m_max_abs_state.data());
+        if (m_step_f.index() == 1u) {
+            std::get<1>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(),
+                                  m_delta_ts.data(), edd.m_max_abs_state.data());
+        } else {
+            std::get<3>(m_step_f)(edd.m_ev_jet.data(), m_state.data(), m_pars.data(), m_time_hi.data(),
+                                  m_delta_ts.data(), edd.m_max_abs_state.data(), m_tape.get());
+        }
 
         // Compute the maximum absolute error on the Taylor series of the event equations, which we will use for
         // automatic cooldown deduction. If max_abs_state is not finite, set it to inf so that
@@ -1081,7 +1111,7 @@ taylor_adaptive_batch<T>::propagate_until_impl(const puntil_arg_t &ts_, std::siz
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_high_accuracy);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_dim);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_order);
-    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tplt_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_tc);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_pinf);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_res);
@@ -1215,7 +1245,7 @@ taylor_adaptive_batch<T>::propagate_until_impl(const puntil_arg_t &ts_, std::siz
             }
 
             // Construct the return value.
-            continuous_output_batch<T> ret(m_llvm.make_similar());
+            continuous_output_batch<T> ret(m_tplt_state.make_similar());
 
             // Fill in the data.
             ret.m_batch_size = m_batch_size;
@@ -1986,9 +2016,9 @@ taylor_adaptive_batch<T>::propagate_grid_impl(const std::vector<T> &grid, std::s
 }
 
 template <typename T>
-const llvm_state &taylor_adaptive_batch<T>::get_llvm_state() const
+const std::variant<llvm_state, llvm_multi_state> &taylor_adaptive_batch<T>::get_llvm_state() const
 {
-    return m_i_data->m_llvm;
+    return m_i_data->m_llvm_state;
 }
 
 template <typename T>
@@ -2291,6 +2321,29 @@ void taylor_adaptive_batch<T>::check_variational(const char *fname) const
     }
 }
 
+// Helper to fetch the stepper ("step_e" if with_events, "step" otherwise) from m_llvm_state and store it in m_step_f.
+template <typename T>
+void taylor_adaptive_batch<T>::assign_stepper(bool with_events)
+{
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_compact_mode);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_step_f);
+    HEYOKA_TAYLOR_REF_FROM_I_DATA(m_llvm_state);
+
+    if (with_events) {
+        if (m_compact_mode) {
+            m_step_f = reinterpret_cast<typename i_data::c_step_f_e_t>(std::get<1>(m_llvm_state).jit_lookup("step_e"));
+        } else {
+            m_step_f = reinterpret_cast<typename i_data::step_f_e_t>(std::get<0>(m_llvm_state).jit_lookup("step_e"));
+        }
+    } else {
+        if (m_compact_mode) {
+            m_step_f = reinterpret_cast<typename i_data::c_step_f_t>(std::get<1>(m_llvm_state).jit_lookup("step"));
+        } else {
+            m_step_f = reinterpret_cast<typename i_data::step_f_t>(std::get<0>(m_llvm_state).jit_lookup("step"));
+        }
+    }
+}
+
 template <typename T>
 const std::vector<expression> &taylor_adaptive_batch<T>::get_vargs() const
 {
diff --git a/test/llvm_state.cpp b/test/llvm_state.cpp
index 57d61782f..821524df3 100644
--- a/test/llvm_state.cpp
+++ b/test/llvm_state.cpp
@@ -108,7 +108,7 @@ TEST_CASE("copy semantics")
                                   kw::fast_math = true,
                                   kw::mname = "sample state"};
 
-        const auto &s = ta.get_llvm_state();
+        const auto &s = std::get<0>(ta.get_llvm_state());
 
         REQUIRE(s.module_name() == "sample state");
         REQUIRE(s.get_opt_level() == 2u);
diff --git a/test/logical.cpp b/test/logical.cpp
index bf7a39ce9..a45329e8e 100644
--- a/test/logical.cpp
+++ b/test/logical.cpp
@@ -460,9 +460,8 @@ TEST_CASE("taylor_adaptive")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(
-                    boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and.var_var_num."));
-                REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or"));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_and.var_var_num."));
+                REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_or"));
             }
 
             ta1.propagate_until(5.);
@@ -481,8 +480,8 @@ TEST_CASE("taylor_adaptive")
                 kw::pars = {1.24}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or.var_var_par."));
-                REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and"));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_or.var_var_par."));
+                REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_and"));
             }
 
             ta1.propagate_until(5.);
@@ -514,9 +513,8 @@ TEST_CASE("taylor_adaptive_batch")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(
-                    boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and.var_var_num."));
-                REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or"));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_and.var_var_num."));
+                REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_or"));
             }
 
             ta1.propagate_until(5.);
@@ -541,8 +539,8 @@ TEST_CASE("taylor_adaptive_batch")
                 kw::pars = {1.24, 1.25}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_or.var_var_par."));
-                REQUIRE(!boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.logical_and"));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.logical_or.var_var_par."));
+                REQUIRE(!ir_contains(ta2, "heyoka.taylor_c_diff.logical_and"));
             }
 
             ta1.propagate_until(5.);
diff --git a/test/model_nbody.cpp b/test/model_nbody.cpp
index 7b89d0485..1911c0123 100644
--- a/test/model_nbody.cpp
+++ b/test/model_nbody.cpp
@@ -67,9 +67,11 @@ TEST_CASE("nbody")
 
         // Check that llvm.pow appears only maximum 3 times: its declaration plus 2 uses
         // for determining the timestep size. Vectorisation may further reduce this number.
-        std::vector<boost::iterator_range<std::string::const_iterator>> pow_matches;
-        boost::find_all(pow_matches, ta.get_llvm_state().get_ir(), "@llvm.pow");
-        REQUIRE(pow_matches.size() <= 3u);
+        for (const auto &cur_ir : std::get<1>(ta.get_llvm_state()).get_ir()) {
+            std::vector<boost::iterator_range<std::string::const_iterator>> pow_matches;
+            boost::find_all(pow_matches, cur_ir, "@llvm.pow");
+            REQUIRE(pow_matches.size() <= 3u);
+        }
 
         llvm_state s;
         std::vector<expression> vars;
diff --git a/test/opt_checks.cpp b/test/opt_checks.cpp
index df46a5049..1a60ef906 100644
--- a/test/opt_checks.cpp
+++ b/test/opt_checks.cpp
@@ -6,6 +6,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#include <variant>
 #include <vector>
 
 #include <boost/algorithm/string/find_iterator.hpp>
@@ -30,20 +31,20 @@ TEST_CASE("function inlining")
 
     auto ta = taylor_adaptive<double>{sys, std::vector<double>(36u, 0.), kw::compact_mode = true};
 
-    auto md_ir = ta.get_llvm_state().get_ir();
+    for (auto md_ir : std::get<1>(ta.get_llvm_state()).get_ir()) {
+        using string_find_iterator = boost::find_iterator<std::string::iterator>;
 
-    using string_find_iterator = boost::find_iterator<std::string::iterator>;
+        auto count = 0u;
+        for (auto it = boost::make_find_iterator(md_ir, boost::first_finder("define ", boost::is_iequal()));
+             it != string_find_iterator(); ++it) {
+            ++count;
+        }
 
-    auto count = 0u;
-    for (auto it = boost::make_find_iterator(md_ir, boost::first_finder("define ", boost::is_iequal()));
-         it != string_find_iterator(); ++it) {
-        ++count;
+        // NOTE: in general we expect 3 function definitions, but auto-vectorization
+        // could bump up this number. I think 6 is the maximum right now (3 possible
+        // vector widths on x86 - 2, 4, 8).
+        REQUIRE(count <= 6u);
     }
-
-    // NOTE: in general we expect 3 functions definitions, but auto-vectorization
-    // could bump up this number. I think 6 is the maximum right now (3 possible
-    // vector width on x86 - 2, 4, 8).
-    REQUIRE(count <= 6u);
 }
 
 // Vectorization of the pow() function when determining
@@ -54,7 +55,7 @@ TEST_CASE("pow vect")
 
 #if defined(HEYOKA_WITH_SLEEF)
 
-    auto md_ir = ta.get_llvm_state().get_ir();
+    auto md_ir = std::get<0>(ta.get_llvm_state()).get_ir();
 
     const auto &tf = detail::get_target_features();
 
diff --git a/test/rel.cpp b/test/rel.cpp
index 3b9123113..0b74b77eb 100644
--- a/test/rel.cpp
+++ b/test/rel.cpp
@@ -308,7 +308,7 @@ TEST_CASE("taylor_adaptive")
                                        kw::opt_level = opt_level};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_gt.var_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_gt.var_num."));
             }
 
             ta1.propagate_until(5.);
@@ -326,7 +326,7 @@ TEST_CASE("taylor_adaptive")
                                   kw::pars = {1.24}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_lt.var_par."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_lt.var_par."));
             }
 
             ta1.propagate_until(5.);
@@ -358,7 +358,7 @@ TEST_CASE("taylor_adaptive_batch")
                                         kw::opt_level = opt_level};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_gt.var_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_gt.var_num."));
             }
 
             ta1.propagate_until(5.);
@@ -382,7 +382,7 @@ TEST_CASE("taylor_adaptive_batch")
                                         kw::pars = {1.24, 1.25}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.rel_lt.var_par."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.rel_lt.var_par."));
             }
 
             ta1.propagate_until(5.);
diff --git a/test/select.cpp b/test/select.cpp
index df2afa337..be9c2c068 100644
--- a/test/select.cpp
+++ b/test/select.cpp
@@ -269,7 +269,7 @@ TEST_CASE("taylor_adaptive")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_var_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_var_num."));
             }
 
             ta1.propagate_until(5.);
@@ -287,7 +287,7 @@ TEST_CASE("taylor_adaptive")
                                   kw::pars = {1.}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_par_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_par_num."));
             }
 
             ta1.propagate_until(5.);
@@ -305,7 +305,7 @@ TEST_CASE("taylor_adaptive")
                                   kw::pars = {1.}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.par_par_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.par_par_num."));
             }
 
             ta1.propagate_until(5.);
@@ -337,7 +337,7 @@ TEST_CASE("taylor_adaptive_batch")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_var_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_var_num."));
             }
 
             ta1.propagate_until(5.);
@@ -362,7 +362,7 @@ TEST_CASE("taylor_adaptive_batch")
                 kw::pars = {1., 1.}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.var_par_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.var_par_num."));
             }
 
             ta1.propagate_until(5.);
@@ -386,7 +386,7 @@ TEST_CASE("taylor_adaptive_batch")
                                         kw::pars = {1., 1.}};
 
             if (opt_level == 0u && cm) {
-                REQUIRE(boost::contains(ta2.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.select.par_par_num."));
+                REQUIRE(ir_contains(ta2, "heyoka.taylor_c_diff.select.par_par_num."));
             }
 
             ta1.propagate_until(5.);
diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp
index 7c2f9406c..9229f4c44 100644
--- a/test/taylor_adaptive.cpp
+++ b/test/taylor_adaptive.cpp
@@ -1688,7 +1688,7 @@ void s11n_test_impl()
             ia >> ta;
         }
 
-        REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir());
+        REQUIRE(std::get<1>(ta.get_llvm_state()).get_ir() == std::get<1>(ta_copy.get_llvm_state()).get_ir());
         REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition());
         REQUIRE(ta.get_order() == ta_copy.get_order());
         REQUIRE(ta.get_tol() == ta_copy.get_tol());
@@ -1753,7 +1753,7 @@ void s11n_test_impl()
             ia >> ta;
         }
 
-        REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir());
+        REQUIRE(std::get<0>(ta.get_llvm_state()).get_ir() == std::get<0>(ta_copy.get_llvm_state()).get_ir());
         REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition());
         REQUIRE(ta.get_order() == ta_copy.get_order());
         REQUIRE(ta.get_dim() == ta_copy.get_dim());
diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp
index b1bd279b9..acb4ab50a 100644
--- a/test/taylor_adaptive_batch.cpp
+++ b/test/taylor_adaptive_batch.cpp
@@ -1070,7 +1070,7 @@ void s11n_test_impl()
             ia >> ta;
         }
 
-        REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir());
+        REQUIRE(std::get<1>(ta.get_llvm_state()).get_ir() == std::get<1>(ta_copy.get_llvm_state()).get_ir());
         REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition());
         REQUIRE(ta.get_order() == ta_copy.get_order());
         REQUIRE(ta.get_tol() == ta_copy.get_tol());
@@ -1143,7 +1143,7 @@ void s11n_test_impl()
             ia >> ta;
         }
 
-        REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir());
+        REQUIRE(std::get<0>(ta.get_llvm_state()).get_ir() == std::get<0>(ta_copy.get_llvm_state()).get_ir());
         REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition());
         REQUIRE(ta.get_order() == ta_copy.get_order());
         REQUIRE(ta.get_tol() == ta_copy.get_tol());
@@ -2130,7 +2130,7 @@ TEST_CASE("pow rho sleef")
     auto ta = taylor_adaptive_batch<double>{
         {prime(x) = rhs_x, prime(v) = rhs_v}, std::vector<double>(8u, 0.), 4u, kw::tol = 1e-6};
 
-    const auto ir = ta.get_llvm_state().get_ir();
+    const auto ir = std::get<0>(ta.get_llvm_state()).get_ir();
 
     // NOTE: run the check only if avx2 is available.
     if (!boost::algorithm::contains(ir, "+avx2")) {
diff --git a/test/taylor_adaptive_mp.cpp b/test/taylor_adaptive_mp.cpp
index 5d8c5c78d..aadf9990a 100644
--- a/test/taylor_adaptive_mp.cpp
+++ b/test/taylor_adaptive_mp.cpp
@@ -1113,7 +1113,7 @@ TEST_CASE("s11n")
                 ia >> ta;
             }
 
-            REQUIRE(ta.get_llvm_state().get_ir() == ta_copy.get_llvm_state().get_ir());
+            REQUIRE(std::get<1>(ta.get_llvm_state()).get_ir() == std::get<1>(ta_copy.get_llvm_state()).get_ir());
             REQUIRE(ta.get_decomposition() == ta_copy.get_decomposition());
             REQUIRE(ta.get_order() == ta_copy.get_order());
             REQUIRE(ta.get_tol() == ta_copy.get_tol());
diff --git a/test/taylor_atan2.cpp b/test/taylor_atan2.cpp
index 5eaf92aa2..e7fa2b102 100644
--- a/test/taylor_atan2.cpp
+++ b/test/taylor_atan2.cpp
@@ -116,7 +116,7 @@ TEST_CASE("taylor atan2")
                                             kw::pars = {b}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.num_par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.num_par"));
             }
 
             ta.step(true);
@@ -321,7 +321,7 @@ TEST_CASE("taylor atan2")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.var_num"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.var_num"));
             }
 
             ta.step(true);
@@ -545,7 +545,7 @@ TEST_CASE("taylor atan2")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.num_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.num_var"));
             }
 
             ta.step(true);
@@ -770,7 +770,7 @@ TEST_CASE("taylor atan2")
                                             kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.atan2.var_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.atan2.var_var"));
             }
 
             ta.step(true);
diff --git a/test/taylor_kepE.cpp b/test/taylor_kepE.cpp
index 6b7ec9ea7..9f9b15c4f 100644
--- a/test/taylor_kepE.cpp
+++ b/test/taylor_kepE.cpp
@@ -104,7 +104,7 @@ TEST_CASE("taylor kepE")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.num_num"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.num_num"));
             }
 
             ta.step(true);
@@ -306,7 +306,7 @@ TEST_CASE("taylor kepE")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.var_num"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.var_num"));
             }
 
             ta.step(true);
@@ -552,7 +552,7 @@ TEST_CASE("taylor kepE")
                 kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.num_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.num_var"));
             }
 
             ta.step(true);
@@ -773,7 +773,7 @@ TEST_CASE("taylor kepE")
                                             kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepE.var_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepE.var_var"));
             }
 
             ta.step(true);
diff --git a/test/taylor_kepF.cpp b/test/taylor_kepF.cpp
index a951ce2be..dad9899d4 100644
--- a/test/taylor_kepF.cpp
+++ b/test/taylor_kepF.cpp
@@ -93,7 +93,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.1), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.num_par_num"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.num_par_num"));
             }
 
             ta.step(true);
@@ -133,7 +133,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.1), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.num_par_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.num_par_var"));
             }
 
             ta.step(true);
@@ -185,7 +185,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.2), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.num_var_par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.num_var_par"));
             }
 
             ta.step(true);
@@ -246,7 +246,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.2), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_num_par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_num_par"));
             }
 
             ta.step(true);
@@ -307,7 +307,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.2), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.par_var_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.par_var_var"));
             }
 
             ta.step(true);
@@ -368,7 +368,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.2), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_par_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_par_var"));
             }
 
             ta.step(true);
@@ -429,7 +429,7 @@ TEST_CASE("taylor kepF")
                                                   kw::pars = {fp_t(.2), fp_t(.2)}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_var_par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_var_par"));
             }
 
             ta.step(true);
@@ -496,7 +496,7 @@ TEST_CASE("taylor kepF")
                                               kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.kepF.var_var_var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.kepF.var_var_var"));
             }
 
             ta.step(true);
diff --git a/test/taylor_kepF_mp.cpp b/test/taylor_kepF_mp.cpp
index e775377fe..57e7b37ba 100644
--- a/test/taylor_kepF_mp.cpp
+++ b/test/taylor_kepF_mp.cpp
@@ -62,8 +62,7 @@ TEST_CASE("kepF")
                                                         kw::pars = {fp_t(.1, prec)}};
 
                         if (opt_level == 0u && cm) {
-                            REQUIRE(
-                                boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.kepF.num_par_var"));
+                            REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.kepF.num_par_var"));
                         }
 
                         ta.step(true);
diff --git a/test/taylor_pow.cpp b/test/taylor_pow.cpp
index 3b2de6300..be2e6f00b 100644
--- a/test/taylor_pow.cpp
+++ b/test/taylor_pow.cpp
@@ -59,35 +59,35 @@ TEST_CASE("taylor pow approx")
     {
         auto ta = taylor_adaptive{{prime(x) = pow(x, -1.5) + pow(x, 1 / 3.)}, {2.}, kw::tol = .1, kw::opt_level = 0};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.pow"));
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt"));
+        REQUIRE(ir_contains(ta, "@llvm.pow"));
+        REQUIRE(ir_contains(ta, "@llvm.sqrt"));
     }
 
     {
         auto ta = taylor_adaptive{std::vector{std::pair{x, pow(par[0], -1.5)}}, {2.}, kw::tol = .1, kw::opt_level = 0};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt"));
+        REQUIRE(ir_contains(ta, "@llvm.sqrt"));
     }
 
     {
         auto ta
             = taylor_adaptive{std::vector{std::pair{x, pow(-1.5_dbl, par[0])}}, {2.}, kw::tol = .1, kw::opt_level = 0};
 
-        REQUIRE(!boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt"));
+        REQUIRE(!ir_contains(ta, "@llvm.sqrt"));
     }
 
     {
         auto ta = taylor_adaptive{
             std::vector{std::pair{x, pow(x, -1.5) + pow(x, 1 / 3.)}}, {2.}, kw::tol = .1, kw::opt_level = 0};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.pow"));
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt"));
+        REQUIRE(ir_contains(ta, "@llvm.pow"));
+        REQUIRE(ir_contains(ta, "@llvm.sqrt"));
     }
 
     {
         auto ta = taylor_adaptive{std::vector{std::pair{x, pow(par[0], -1.5)}}, {2.}, kw::tol = .1, kw::opt_level = 0};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@llvm.sqrt"));
+        REQUIRE(ir_contains(ta, "@llvm.sqrt"));
     }
 
     {
@@ -97,28 +97,28 @@ TEST_CASE("taylor pow approx")
                                   kw::opt_level = 0,
                                   kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow."));
     }
 
     {
         auto ta = taylor_adaptive{
             std::vector{std::pair{x, pow(x, 2_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_square."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow_square."));
     }
 
     {
         auto ta = taylor_adaptive{
             std::vector{std::pair{x, pow(x, .5_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_sqrt."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow_sqrt."));
     }
 
     {
         auto ta = taylor_adaptive{
             std::vector{std::pair{x, pow(x, 1.5_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_pos_small_half_3."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow_pos_small_half_3."));
     }
 
     {
@@ -128,21 +128,21 @@ TEST_CASE("taylor pow approx")
                                   kw::opt_level = 0,
                                   kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_neg_small_half_3."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow_neg_small_half_3."));
     }
 
     {
         auto ta = taylor_adaptive{
             std::vector{std::pair{x, pow(x, 4_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_pos_small_int_4."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow_pos_small_int_4."));
     }
 
     {
         auto ta = taylor_adaptive{
             std::vector{std::pair{x, pow(x, -4_dbl)}}, {2.}, kw::tol = .1, kw::opt_level = 0, kw::compact_mode = true};
 
-        REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.pow_neg_small_int_4."));
+        REQUIRE(ir_contains(ta, "taylor_c_diff.pow_neg_small_int_4."));
     }
 }
 
@@ -168,7 +168,7 @@ TEST_CASE("taylor pow")
                                             kw::pars = {fp_t{1} / fp_t{3}}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.pow.num_par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.pow.num_par"));
             }
 
             ta.step(true);
@@ -422,7 +422,7 @@ TEST_CASE("taylor pow")
                                         kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.pow.var_num"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.pow.var_num"));
             }
 
             ta.step(true);
diff --git a/test/taylor_prod.cpp b/test/taylor_prod.cpp
index 271edfac5..0d301aa39 100644
--- a/test/taylor_prod.cpp
+++ b/test/taylor_prod.cpp
@@ -158,7 +158,7 @@ TEST_CASE("taylor mul")
                                                   kw::pars = {fp_t{0}, fp_t{0}, fp_t{3}, fp_t{3}}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg."));
+                REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg."));
             }
 
             ta.step(true);
@@ -205,7 +205,7 @@ TEST_CASE("taylor mul")
                                             kw::pars = {fp_t{-2}}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg."));
+                REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg."));
             }
 
             ta.step(true);
@@ -603,8 +603,8 @@ TEST_CASE("taylor mul")
                                             kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg."));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod."));
+                REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg."));
+                REQUIRE(ir_contains(ta, "taylor_c_diff.prod."));
             }
 
             ta.step(true);
@@ -675,8 +675,8 @@ TEST_CASE("taylor mul")
                                                   kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod_neg."));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "taylor_c_diff.prod."));
+                REQUIRE(ir_contains(ta, "taylor_c_diff.prod_neg."));
+                REQUIRE(ir_contains(ta, "taylor_c_diff.prod."));
             }
 
             ta.step(true);
diff --git a/test/taylor_relu.cpp b/test/taylor_relu.cpp
index 43ac00302..5da516713 100644
--- a/test/taylor_relu.cpp
+++ b/test/taylor_relu.cpp
@@ -72,8 +72,8 @@ TEST_CASE("taylor relu relup")
                                                   kw::pars = {fp_t{-1}, fp_t{2}, fp_t{4}, fp_t{-3}}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu.par"));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup.par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu.par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup.par"));
             }
 
             ta.step(true);
@@ -118,8 +118,8 @@ TEST_CASE("taylor relu relup")
                                                   kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu.var"));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup.var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu.var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup.var"));
             }
 
             ta.step(true);
@@ -182,9 +182,9 @@ TEST_CASE("taylor relu relup leaky")
                                               kw::pars = {fp_t{-1}, fp_t{2}, fp_t{4}, fp_t{-3}}};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu_0x"));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup_0x"));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), ".par"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu_0x"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup_0x"));
+                REQUIRE(ir_contains(ta, ".par"));
             }
 
             ta.step(true);
@@ -229,9 +229,9 @@ TEST_CASE("taylor relu relup leaky")
                                                   kw::opt_level = opt_level};
 
             if (opt_level == 0u && compact_mode) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relu_0x"));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "@heyoka.taylor_c_diff.relup_0x"));
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), ".var"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relu_0x"));
+                REQUIRE(ir_contains(ta, "@heyoka.taylor_c_diff.relup_0x"));
+                REQUIRE(ir_contains(ta, ".var"));
             }
 
             ta.step(true);
diff --git a/test/taylor_relu_mp.cpp b/test/taylor_relu_mp.cpp
index be5a6872b..f07638a8e 100644
--- a/test/taylor_relu_mp.cpp
+++ b/test/taylor_relu_mp.cpp
@@ -54,8 +54,8 @@ TEST_CASE("relu")
                                                         kw::opt_level = opt_level};
 
                         if (opt_level == 0u && cm) {
-                            REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relu.var"));
-                            REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relup.var"));
+                            REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relu.var"));
+                            REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relup.var"));
                         }
 
                         ta.step(true);
@@ -95,9 +95,9 @@ TEST_CASE("relu leaky")
                                                         kw::opt_level = opt_level};
 
                         if (opt_level == 0u && cm) {
-                            REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relu_0x"));
-                            REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "heyoka.taylor_c_diff.relup_0x"));
-                            REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), ".var"));
+                            REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relu_0x"));
+                            REQUIRE(ir_contains(ta, "heyoka.taylor_c_diff.relup_0x"));
+                            REQUIRE(ir_contains(ta, ".var"));
                         }
 
                         ta.step(true);
diff --git a/test/taylor_square.cpp b/test/taylor_square.cpp
index f39b6b822..871142bf3 100644
--- a/test/taylor_square.cpp
+++ b/test/taylor_square.cpp
@@ -90,7 +90,7 @@ TEST_CASE("taylor square")
                                             kw::pars = {fp_t{2}}};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_pos_small_int_2.par_num"));
+                REQUIRE(ir_contains(ta, "pow_pos_small_int_2.par_num"));
             }
 
             ta.step(true);
@@ -141,7 +141,7 @@ TEST_CASE("taylor square")
                                                   kw::pars = {fp_t{0}, fp_t{0}, fp_t{2}, fp_t{2}}};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_pos_small_int_2.par_num"));
+                REQUIRE(ir_contains(ta, "pow_pos_small_int_2.par_num"));
             }
 
             ta.step(true);
@@ -272,7 +272,7 @@ TEST_CASE("taylor square")
                                                   kw::pars = {fp_t{2}, fp_t{2}, fp_t{2}}};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_pos_small_int_2.par_num"));
+                REQUIRE(ir_contains(ta, "pow_pos_small_int_2.par_num"));
             }
 
             ta.step(true);
@@ -326,7 +326,7 @@ TEST_CASE("taylor square")
                                             kw::opt_level = opt_level};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num"));
+                REQUIRE(ir_contains(ta, "pow_square.var_num"));
             }
 
             ta.step(true);
@@ -349,7 +349,7 @@ TEST_CASE("taylor square")
                                                   kw::opt_level = opt_level};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num"));
+                REQUIRE(ir_contains(ta, "pow_square.var_num"));
             }
 
             ta.step(true);
@@ -378,7 +378,7 @@ TEST_CASE("taylor square")
                                             kw::opt_level = opt_level};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num"));
+                REQUIRE(ir_contains(ta, "pow_square.var_num"));
             }
 
             ta.step(true);
@@ -403,7 +403,7 @@ TEST_CASE("taylor square")
                                                   kw::opt_level = opt_level};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num"));
+                REQUIRE(ir_contains(ta, "pow_square.var_num"));
             }
 
             ta.step(true);
@@ -439,7 +439,7 @@ TEST_CASE("taylor square")
                                                   kw::opt_level = opt_level};
 
             if (compact_mode && opt_level == 0u) {
-                REQUIRE(boost::contains(ta.get_llvm_state().get_ir(), "pow_square.var_num"));
+                REQUIRE(ir_contains(ta, "pow_square.var_num"));
             }
 
             ta.step(true);
diff --git a/test/test_utils.hpp b/test/test_utils.hpp
index 52df16fb6..c4a10b132 100644
--- a/test/test_utils.hpp
+++ b/test/test_utils.hpp
@@ -11,6 +11,7 @@
 
 #include <heyoka/config.hpp>
 
+#include <algorithm>
 #include <array>
 #include <cmath>
 #include <cstddef>
@@ -21,8 +22,11 @@
 #include <tuple>
 #include <type_traits>
 #include <utility>
+#include <variant>
 #include <vector>
 
+#include <boost/algorithm/string/predicate.hpp>
+
 #include <xtensor-blas/xlinalg.hpp>
 #include <xtensor/xfixed.hpp>
 #include <xtensor/xshape.hpp>
@@ -285,6 +289,16 @@ template <typename T>
 void compare_batch_scalar(const std::vector<std::pair<heyoka::expression, heyoka::expression>> &, unsigned, bool, bool,
                           std::mt19937 &, float, float, T = T(1000.));
 
+bool ir_contains(const auto &ta, const char *str)
+{
+    if (ta.get_compact_mode()) {
+        return std::ranges::any_of(std::get<1>(ta.get_llvm_state()).get_ir(),
+                                   [&](const auto &ir) { return boost::contains(ir, str); });
+    } else {
+        return boost::contains(std::get<0>(ta.get_llvm_state()).get_ir(), str);
+    }
+}
+
 } // namespace heyoka_test
 
 #endif

From 4d6f0133721f1f0e13dc6e5047cd83a84021b1bc Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 27 Aug 2024 09:28:27 +0200
Subject: [PATCH 06/30] Update the known issues page.

---
 doc/known_issues.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/known_issues.rst b/doc/known_issues.rst
index a822d395b..599f17127 100644
--- a/doc/known_issues.rst
+++ b/doc/known_issues.rst
@@ -22,6 +22,12 @@ Unsolved
 Solved
 ======
 
+* Due to an `upstream bug <https://github.com/llvm/llvm-project/issues/88115>`__,
+  the option for selecting the code model used for JIT compilation
+  (added in heyoka 6.0.0) is ignored by LLVM and the default code model
+  is always used. This issue affects all LLVM versions up to and including LLVM 18.
+  A patch for LLVM 18 that rectifies the issue is available
+  `here <https://github.com/llvm/llvm-project/pull/90599>`__.
 * Certain LLVM versions fail to correctly free memory when objects used to
   implement just-in-time compilation are destroyed. In practice this may result
   in exhausting the available RAM if many integrators and/or compiled functions

From 0900a07c4ff5710bc613425e6094c4acd035aed0 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Tue, 27 Aug 2024 18:35:45 +0200
Subject: [PATCH 07/30] Initial work on re-enabling parallel mode for the
 integrators.

---
 src/detail/llvm_helpers.cpp |   4 +
 src/taylor_01.cpp           |  92 +++++++--------
 src/taylor_02.cpp           | 218 ++++++++++++++++++++++++++++++++----
 3 files changed, 241 insertions(+), 73 deletions(-)

diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index c5a4afc2c..bf81c3e93 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -1358,6 +1358,8 @@ llvm::CallInst *llvm_invoke_external(llvm_state &s, const std::string &name, llv
         // Add the function attributes.
         callee_f->setAttributes(attrs);
     } else {
+        // LCOV_EXCL_START
+
         // The function declaration exists already. Check that it is only a
         // declaration and not a definition.
         if (!callee_f->isDeclaration()) {
@@ -1374,6 +1376,8 @@ llvm::CallInst *llvm_invoke_external(llvm_state &s, const std::string &name, llv
         }
         // NOTE: in the future we should consider adding more checks here
         // (e.g., argument types, return type, attributes, etc.).
+
+        // LCOV_EXCL_STOP
     }
 
     // Create the function call.
diff --git a/src/taylor_01.cpp b/src/taylor_01.cpp
index 3ebec981d..3d1498396 100644
--- a/src/taylor_01.cpp
+++ b/src/taylor_01.cpp
@@ -1106,64 +1106,58 @@ void taylor_add_d_out_function(llvm_state &s, llvm::Type *fp_scal_t, std::uint32
     builder.SetInsertPoint(orig_bb);
 }
 
+namespace
+{
+
+// NOTE: this is the function which computes the
+// Taylor derivatives for a subrange in a block.
+// A block consists of ncalls invocations of the same
+// Taylor derivative function with different arguments.
+// [begin, end) is a subrange of [0, ncalls). tape_ptr
+// is a pointer to the tape of derivatives, par_ptr and
+// time_ptr are pointers to the arrays of parameter value(s)
+// and time value(s). cur_order is the current Taylor order.
+using block_subrange_f = void (*)(std::uint32_t begin, std::uint32_t end, void *tape_ptr, const void *par_ptr,
+                                  const void *time_ptr, std::uint32_t cur_order) noexcept;
+
+} // namespace
+
 } // namespace detail
 
 HEYOKA_END_NAMESPACE
 
-// NOTE: this is the worker function that is invoked to compute
-// in parallel all the derivatives of a block in parallel mode.
-extern "C" HEYOKA_DLL_PUBLIC void heyoka_cm_par_looper(std::uint32_t ncalls,
-                                                       void (*fptr)(std::uint32_t, std::uint32_t) noexcept) noexcept
+// This function computes the Taylor derivatives for a segment in parallel mode.
+//
+// f_arr is the array of functions for the computations of the derivatives in the block
+// subranges, ncalls_ptr is an array containing the number of times each function in
+// f_arr must be called. Both f_arr and ncalls_ptr are arrays of size nfuncs.
+// tape/par/time_ptr are pointers to the tape/parameter/time values. cur_order is the Taylor
+// order at which the computation of the derivatives must be performed.
+extern "C" HEYOKA_DLL_PUBLIC void heyoka_taylor_cm_par_segment(const heyoka::detail::block_subrange_f *f_arr,
+                                                               const std::uint32_t *ncalls_ptr, std::uint32_t nfuncs,
+                                                               void *tape_ptr, const void *par_ptr,
+                                                               const void *time_ptr, std::uint32_t cur_order) noexcept
 {
     try {
-        oneapi::tbb::parallel_for(oneapi::tbb::blocked_range<std::uint32_t>(0, ncalls),
-                                  [fptr](const auto &range) { fptr(range.begin(), range.end()); });
+        oneapi::tbb::parallel_for(oneapi::tbb::blocked_range<std::uint32_t>(0, nfuncs),
+                                  [ncalls_ptr, f_arr, tape_ptr, par_ptr, time_ptr, cur_order](const auto &func_range) {
+                                      for (auto f_idx = func_range.begin(); f_idx != func_range.end(); ++f_idx) {
+                                          const auto cur_ncalls = ncalls_ptr[f_idx];
+                                          auto *cur_f = f_arr[f_idx];
+
+                                          oneapi::tbb::parallel_for(
+                                              oneapi::tbb::blocked_range<std::uint32_t>(0, cur_ncalls),
+                                              [cur_f, tape_ptr, par_ptr, time_ptr, cur_order](const auto &call_range) {
+                                                  cur_f(call_range.begin(), call_range.end(), tape_ptr, par_ptr,
+                                                        time_ptr, cur_order);
+                                              });
+                                      }
+                                  });
         // LCOV_EXCL_START
     } catch (const std::exception &ex) {
-        heyoka::detail::get_logger()->critical("Exception caught in the parallel mode looper: {}", ex.what());
+        heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker: {}", ex.what());
     } catch (...) {
-        heyoka::detail::get_logger()->critical("Exception caught in the parallel mode looper");
+        heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker");
     }
     // LCOV_EXCL_STOP
 }
-
-HEYOKA_BEGIN_NAMESPACE
-
-namespace detail
-{
-
-namespace
-{
-
-// NOTE: use typedef to minimise issues
-// when mucking around with the preprocessor.
-using par_f_ptr = void (*)() noexcept;
-
-} // namespace
-
-} // namespace detail
-
-HEYOKA_END_NAMESPACE
-
-// NOTE: this is the parallel invoker that gets called from LLVM
-// to run multiple parallel workers within a segment at the same time, i.e.,
-// to process multiple blocks within a segment concurrently.
-// We need to generate multiple instantiatiation of this function
-// up to the limit HEYOKA_CM_PAR_MAX_INVOKE_N defined in config.hpp.
-
-#define HEYOKA_CM_PAR_INVOKE(_0, N, _1)                                                                                \
-    extern "C" HEYOKA_DLL_PUBLIC void heyoka_cm_par_invoke_##N(                                                        \
-        BOOST_PP_ENUM_PARAMS(N, heyoka::detail::par_f_ptr f)) noexcept                                                 \
-    {                                                                                                                  \
-        try {                                                                                                          \
-            BOOST_PP_IF(BOOST_PP_SUB(N, 1), oneapi::tbb::parallel_invoke(BOOST_PP_ENUM_PARAMS(N, f)), f0());           \
-        } catch (const std::exception &ex) {                                                                           \
-            heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker: {}", ex.what());    \
-        } catch (...) {                                                                                                \
-            heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker");                   \
-        }                                                                                                              \
-    }
-
-BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_ADD(HEYOKA_CM_PAR_MAX_INVOKE_N, 1), HEYOKA_CM_PAR_INVOKE, _0)
-
-#undef HEYOKA_CM_PAR_INVOKE
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 0a3c03a95..0adc738a0 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -634,6 +634,193 @@ using taylor_cm_seg_f_list_t
     = std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
                llvm_func_name_compare>;
 
+// Helper to codegen the computation of the Taylor derivatives for a segment
+// from a taylor_cm_seg_f_list_t in sequential mode: one taylor_cm_codegen_block_diff()
+// call per entry of seg_map, using the arguments of the function containing the insertion point.
+// s is the llvm state in which we are operating, fp_vec_type the internal vector type we are using
+// for computations, seg_map is the taylor_cm_seg_f_list_t containing the list of functions for the computation
+// of Taylor derivatives within a segment, n_uvars the total number of u variables in the decomposition.
+void taylor_cm_codegen_segment_diff_sequential(llvm_state &s, llvm::Type *fp_vec_type,
+                                               const taylor_cm_seg_f_list_t &seg_map, std::uint32_t n_uvars)
+{
+    // Fetch the current builder.
+    auto &bld = s.builder();
+
+    // Fetch the arguments from the driver prototype.
+    auto *driver_f = bld.GetInsertBlock()->getParent();
+    assert(driver_f != nullptr);
+    assert(driver_f->arg_size() == 4u);
+    auto *tape_ptr = driver_f->args().begin();
+    auto *par_ptr = driver_f->args().begin() + 1;
+    auto *time_ptr = driver_f->args().begin() + 2;
+    auto *cur_order = driver_f->args().begin() + 3;
+
+    // Compute the derivatives for this segment.
+    for (const auto &[func, fpair] : seg_map) {
+        const auto &[ncalls, gens] = fpair;
+
+        taylor_cm_codegen_block_diff(s, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type,
+                                     n_uvars);
+    }
+}
+
+// Helper to codegen the computation of the Taylor derivatives for a segment
+// from a taylor_cm_seg_f_list_t in parallel mode.
+//
+// s is the llvm state in which we are operating, fp_vec_type the internal vector type we are using
+// for computations, seg_map is the taylor_cm_seg_f_list_t containing the list of functions for the computation
+// of Taylor derivatives within a segment, n_uvars the total number of u variables in the decomposition.
+void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_type,
+                                             const taylor_cm_seg_f_list_t &seg_map, std::uint32_t n_uvars)
+{
+    // NOTE: in parallel mode, we introduce worker functions that operate similarly to
+    // taylor_cm_codegen_block_diff(), except that they do not process an entire block
+    // but only a subrange of a block. These worker functions are then invoked in parallel
+    // by heyoka_taylor_cm_par_segment(). In order to pass the worker functions to
+    // heyoka_taylor_cm_par_segment(), we need to store pointers to them in global arrays,
+    // together with the information on how many times each function must be called.
+
+    auto &bld = s.builder();
+    auto &ctx = s.context();
+    auto &md = s.module();
+
+    // Fetch the current insertion block, so that we can restore it later.
+    auto *orig_bb = bld.GetInsertBlock();
+
+    // Fetch several types for the current context.
+    auto *ptr_tp = llvm::PointerType::getUnqual(ctx);
+    auto *i32_tp = bld.getInt32Ty();
+    auto *void_tp = bld.getVoidTy();
+
+    // Init the vectors with the constant initializers for the workers/ncalls arrays.
+    std::vector<llvm::Constant *> workers_arr, ncalls_arr;
+
+    // Generate the workers for each block.
+    for (const auto &[func, fpair] : seg_map) {
+        const auto &[ncalls, gens] = fpair;
+
+        // Create the prototype for the current worker. The arguments are:
+        //
+        // - int32 begin/end call indices,
+        // - tape pointer (read & write),
+        // - par pointer (read-only),
+        // - time pointer (read-only),
+        // - int32 current Taylor order.
+        //
+        // The pointer arguments cannot overlap.
+        std::vector<llvm::Type *> worker_args{i32_tp, i32_tp, ptr_tp, ptr_tp, ptr_tp, i32_tp};
+
+        // The worker does not return anything.
+        auto *worker_proto = llvm::FunctionType::get(void_tp, worker_args, false);
+        assert(worker_proto != nullptr); // LCOV_EXCL_LINE
+
+        // Create the worker.
+        auto *worker = llvm::Function::Create(worker_proto, llvm::Function::InternalLinkage, "", &md);
+
+        // NOTE: the worker cannot call itself recursively.
+        worker->addFnAttr(llvm::Attribute::NoRecurse);
+
+        // Add the arguments' attributes.
+        auto *begin_arg = worker->args().begin();
+        begin_arg->setName("begin");
+
+        auto *end_arg = worker->args().begin() + 1;
+        end_arg->setName("end");
+
+        auto *tape_ptr_arg = worker->args().begin() + 2;
+        tape_ptr_arg->setName("tape_ptr");
+        tape_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+        tape_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+
+        auto *par_ptr_arg = worker->args().begin() + 3;
+        par_ptr_arg->setName("par_ptr");
+        par_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+        par_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+        par_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
+
+        auto *time_ptr_arg = worker->args().begin() + 4;
+        time_ptr_arg->setName("time_ptr");
+        time_ptr_arg->addAttr(llvm::Attribute::NoCapture);
+        time_ptr_arg->addAttr(llvm::Attribute::NoAlias);
+        time_ptr_arg->addAttr(llvm::Attribute::ReadOnly);
+
+        auto *order_arg = worker->args().begin() + 5;
+        order_arg->setName("order");
+
+        // Create a new basic block to start insertion into.
+        auto *bb = llvm::BasicBlock::Create(ctx, "entry", worker);
+        assert(bb != nullptr); // LCOV_EXCL_LINE
+        bld.SetInsertPoint(bb);
+
+        // Loop over the begin/end range.
+        llvm_loop_u32(s, begin_arg, end_arg, [&](llvm::Value *cur_call_idx) {
+            // Create the u variable index from the first generator.
+            auto u_idx = gens[0](cur_call_idx);
+
+            // Initialise the vector of arguments with which func must be called. The following
+            // initial arguments are always present:
+            // - current Taylor order,
+            // - u index of the variable,
+            // - tape of derivatives,
+            // - pointer to the param values,
+            // - pointer to the time value(s).
+            std::vector<llvm::Value *> args{order_arg, u_idx, tape_ptr_arg, par_ptr_arg, time_ptr_arg};
+
+            // Create the other arguments via the generators.
+            for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                args.push_back(gens[i](cur_call_idx));
+            }
+
+            // Calculate the derivative and store the result.
+            taylor_c_store_diff(s, fp_vec_type, tape_ptr_arg, n_uvars, order_arg, u_idx, bld.CreateCall(func, args));
+        });
+
+        // Return.
+        bld.CreateRetVoid();
+
+        // Add a pointer to the current worker to workers_arr.
+        workers_arr.push_back(worker);
+
+        // Add ncalls to ncalls_arr.
+        ncalls_arr.push_back(bld.getInt32(boost::numeric_cast<std::uint32_t>(ncalls)));
+    }
+
+    // Restore the original insertion block in the driver.
+    bld.SetInsertPoint(orig_bb);
+
+    // Generate the global variables for workers_arr (array of pointers) and ncalls_arr (array of
+    // int32, the element type must match the getInt32() initializers), and fetch pointers to their first elements.
+    auto *workers_arr_tp = llvm::ArrayType::get(ptr_tp, boost::numeric_cast<std::uint64_t>(workers_arr.size()));
+    auto *workers_arr_carr = llvm::ConstantArray::get(workers_arr_tp, workers_arr);
+    auto *workers_arr_gv = new llvm::GlobalVariable(md, workers_arr_carr->getType(), true,
+                                                    llvm::GlobalVariable::InternalLinkage, workers_arr_carr);
+    auto *workers_ptr
+        = bld.CreateInBoundsGEP(workers_arr_carr->getType(), workers_arr_gv, {bld.getInt32(0), bld.getInt32(0)});
+
+    auto *ncalls_arr_tp = llvm::ArrayType::get(i32_tp, boost::numeric_cast<std::uint64_t>(ncalls_arr.size()));
+    auto *ncalls_arr_carr = llvm::ConstantArray::get(ncalls_arr_tp, ncalls_arr);
+    auto *ncalls_arr_gv = new llvm::GlobalVariable(md, ncalls_arr_carr->getType(), true,
+                                                   llvm::GlobalVariable::InternalLinkage, ncalls_arr_carr);
+    auto *ncalls_ptr
+        = bld.CreateInBoundsGEP(ncalls_arr_carr->getType(), ncalls_arr_gv, {bld.getInt32(0), bld.getInt32(0)});
+
+    // Fetch the arguments for heyoka_taylor_cm_par_segment() from the driver prototype.
+    auto *driver_f = bld.GetInsertBlock()->getParent();
+    assert(driver_f != nullptr);
+    assert(driver_f->arg_size() == 4u);
+    auto *tape_ptr = driver_f->args().begin();
+    auto *par_ptr = driver_f->args().begin() + 1;
+    auto *time_ptr = driver_f->args().begin() + 2;
+    auto *cur_order = driver_f->args().begin() + 3;
+
+    // Invoke heyoka_taylor_cm_par_segment().
+    llvm_invoke_external(s, "heyoka_taylor_cm_par_segment", void_tp,
+                         {workers_ptr, ncalls_ptr, bld.getInt32(boost::numeric_cast<std::uint32_t>(seg_map.size())),
+                          tape_ptr, par_ptr, time_ptr, cur_order},
+                         llvm::AttributeList::get(ctx, llvm::AttributeList::FunctionIndex,
+                                                  {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn}));
+}
+
 // Helper to codegen the computation of the Taylor derivatives for a segment.
 //
 // seg is the segment, start_u_idx the index of the first u variable in the segment, s the llvm state
@@ -641,14 +828,11 @@ using taylor_cm_seg_f_list_t
 // the total number of u variables, high_accuracy the high accuracy flag.
 taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint32_t start_u_idx, llvm_state &s,
                                                       llvm::Type *fp_t, std::uint32_t batch_size, std::uint32_t n_uvars,
-                                                      bool high_accuracy)
+                                                      bool high_accuracy, bool parallel_mode)
 {
     // Fetch the internal vector type.
     auto *fp_vec_type = make_vector_type(fp_t, batch_size);
 
-    // Fetch the current builder.
-    auto &bld = s.builder();
-
     // This structure maps a function to sets of arguments
     // with which the function is to be called. For instance, if function
     // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
@@ -766,21 +950,10 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint
         }
     }
 
-    // Fetch the arguments from the driver prototype.
-    auto *driver_f = bld.GetInsertBlock()->getParent();
-    assert(driver_f != nullptr);
-    assert(driver_f->arg_size() == 4u);
-    auto *tape_ptr = driver_f->args().begin();
-    auto *par_ptr = driver_f->args().begin() + 1;
-    auto *time_ptr = driver_f->args().begin() + 2;
-    auto *cur_order = driver_f->args().begin() + 3;
-
-    // Compute the derivatives for this segment.
-    for (const auto &[func, fpair] : seg_map) {
-        const auto &[ncalls, gens] = fpair;
-
-        taylor_cm_codegen_block_diff(s, func, ncalls, gens, tape_ptr, par_ptr, time_ptr, cur_order, fp_vec_type,
-                                     n_uvars);
+    if (parallel_mode) {
+        taylor_cm_codegen_segment_diff_parallel(s, fp_vec_type, seg_map, n_uvars);
+    } else {
+        taylor_cm_codegen_segment_diff_sequential(s, fp_vec_type, seg_map, n_uvars);
     }
 
     return seg_map;
@@ -803,9 +976,6 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
                                                  std::uint32_t n_uvars, std::uint32_t order, std::uint32_t batch_size,
                                                  bool high_accuracy, bool parallel_mode, std::uint32_t max_svf_idx)
 {
-    // TODO implement.
-    (void)parallel_mode;
-
     // Init the list of states.
     // NOTE: we use lists here because it is convenient to have
     // pointer/reference stability when iteratively constructing
@@ -928,8 +1098,8 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
         auto *fp_t = llvm_clone_type(*cur_state, main_fp_t);
 
         // Codegen the computation of the derivatives for this segment.
-        const auto seg_map
-            = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars, high_accuracy);
+        const auto seg_map = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars,
+                                                            high_accuracy, parallel_mode);
 
         // Update the number of codegenned blocks.
         n_cg_blocks += seg_map.size();

From b9e1a161fc60247214494af0463d2e6fd2f932db Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 09:13:37 +0200
Subject: [PATCH 08/30] Small cleanups, internal renames and docs.

---
 src/taylor_01.cpp | 51 ++++++++++++++++++++++++-----------------------
 src/taylor_02.cpp | 14 +++++++------
 2 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/src/taylor_01.cpp b/src/taylor_01.cpp
index 3d1498396..679a4a33c 100644
--- a/src/taylor_01.cpp
+++ b/src/taylor_01.cpp
@@ -25,11 +25,6 @@
 #include <boost/algorithm/string/predicate.hpp>
 #include <boost/graph/adjacency_list.hpp>
 #include <boost/numeric/conversion/cast.hpp>
-#include <boost/preprocessor/arithmetic/add.hpp>
-#include <boost/preprocessor/arithmetic/sub.hpp>
-#include <boost/preprocessor/control/if.hpp>
-#include <boost/preprocessor/repetition/enum_params.hpp>
-#include <boost/preprocessor/repetition/repeat_from_to.hpp>
 
 #include <oneapi/tbb/blocked_range.h>
 #include <oneapi/tbb/parallel_for.h>
@@ -1109,16 +1104,20 @@ void taylor_add_d_out_function(llvm_state &s, llvm::Type *fp_scal_t, std::uint32
 namespace
 {
 
-// NOTE: this is the function which computes the
+// NOTE: this is the worker function type which computes the
 // Taylor derivatives for a subrange in a block.
 // A block consists of ncalls invocations of the same
 // Taylor derivative function with different arguments.
+// Workers are created on the LLVM side when parallel mode is
+// active.
+//
 // [begin, end) is a subrange of [0, ncalls). tape_ptr
 // is a pointer to the tape of derivatives, par_ptr and
 // time_ptr are pointers to the arrays of parameter value(s)
-// and time value(s). cur_order is the current Taylor order.
-using block_subrange_f = void (*)(std::uint32_t begin, std::uint32_t end, void *tape_ptr, const void *par_ptr,
-                                  const void *time_ptr, std::uint32_t cur_order) noexcept;
+// and time value(s). order is the desired Taylor order for
+// the computation of the derivatives.
+using block_worker_f = void (*)(std::uint32_t begin, std::uint32_t end, void *tape_ptr, const void *par_ptr,
+                                const void *time_ptr, std::uint32_t order) noexcept;
 
 } // namespace
 
@@ -1126,38 +1125,40 @@ using block_subrange_f = void (*)(std::uint32_t begin, std::uint32_t end, void *
 
 HEYOKA_END_NAMESPACE
 
-// This function computes the Taylor derivatives for a segment in parallel mode.
+// This function computes the Taylor derivatives for a segment in parallel mode. It is invoked
+// from LLVM after the creation of the worker functions that compute the Taylor derivatives
+// for a subrange in a block.
 //
-// f_arr is the array of functions for the computations of the derivatives in the block
-// subranges, ncalls_ptr is an array containing the number of times each function in
-// f_arr must be called. Both f_arr and ncalls_ptr are arrays of size nfuncs.
-// tape/par/time_ptr are pointers to the tape/parameter/time values. cur_order is the Taylor
-// order at which the computation of the derivatives must be performed.
-extern "C" HEYOKA_DLL_PUBLIC void heyoka_taylor_cm_par_segment(const heyoka::detail::block_subrange_f *f_arr,
-                                                               const std::uint32_t *ncalls_ptr, std::uint32_t nfuncs,
+// worker_arr is the array of worker functions for the computations of the derivatives in the block
+// subranges, ncalls_arr is an array containing the number of times each function in
+// worker_arr must be called. Both worker_arr and ncalls_arr are arrays of size nfuncs.
+// tape/par/time_ptr are pointers to the tape/parameter/time values. order is the desired Taylor order for
+// the computation of the derivatives.
+extern "C" HEYOKA_DLL_PUBLIC void heyoka_taylor_cm_par_segment(const heyoka::detail::block_worker_f *worker_arr,
+                                                               const std::uint32_t *ncalls_arr, std::uint32_t nfuncs,
                                                                void *tape_ptr, const void *par_ptr,
-                                                               const void *time_ptr, std::uint32_t cur_order) noexcept
+                                                               const void *time_ptr, std::uint32_t order) noexcept
 {
     try {
         oneapi::tbb::parallel_for(oneapi::tbb::blocked_range<std::uint32_t>(0, nfuncs),
-                                  [ncalls_ptr, f_arr, tape_ptr, par_ptr, time_ptr, cur_order](const auto &func_range) {
+                                  [ncalls_arr, worker_arr, tape_ptr, par_ptr, time_ptr, order](const auto &func_range) {
                                       for (auto f_idx = func_range.begin(); f_idx != func_range.end(); ++f_idx) {
-                                          const auto cur_ncalls = ncalls_ptr[f_idx];
-                                          auto *cur_f = f_arr[f_idx];
+                                          const auto cur_ncalls = ncalls_arr[f_idx];
+                                          auto *cur_f = worker_arr[f_idx];
 
                                           oneapi::tbb::parallel_for(
                                               oneapi::tbb::blocked_range<std::uint32_t>(0, cur_ncalls),
-                                              [cur_f, tape_ptr, par_ptr, time_ptr, cur_order](const auto &call_range) {
+                                              [cur_f, tape_ptr, par_ptr, time_ptr, order](const auto &call_range) {
                                                   cur_f(call_range.begin(), call_range.end(), tape_ptr, par_ptr,
-                                                        time_ptr, cur_order);
+                                                        time_ptr, order);
                                               });
                                       }
                                   });
         // LCOV_EXCL_START
     } catch (const std::exception &ex) {
-        heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker: {}", ex.what());
+        heyoka::detail::get_logger()->critical("Exception caught in heyoka_taylor_cm_par_segment(): {}", ex.what());
     } catch (...) {
-        heyoka::detail::get_logger()->critical("Exception caught in the parallel mode invoker");
+        heyoka::detail::get_logger()->critical("Exception caught in heyoka_taylor_cm_par_segment()");
     }
     // LCOV_EXCL_STOP
 }
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 0adc738a0..4c311d726 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -655,7 +655,7 @@ void taylor_cm_codegen_segment_diff_sequential(llvm_state &s, llvm::Type *fp_vec
     auto *time_ptr = driver_f->args().begin() + 2;
     auto *cur_order = driver_f->args().begin() + 3;
 
-    // Compute the derivatives for this segment.
+    // Generate the code for the computation of the derivatives for this segment.
     for (const auto &[func, fpair] : seg_map) {
         const auto &[ncalls, gens] = fpair;
 
@@ -680,6 +680,7 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t
     // heyoka_taylor_cm_par_segment(), we need to store pointers to them in global arrays,
     // together with the information on how many times each function must be called.
 
+    // Fetch builder/context/module.
     auto &bld = s.builder();
     auto &ctx = s.context();
     auto &md = s.module();
@@ -687,7 +688,7 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t
     // Fetch the current insertion block, so that we can restore it later.
     auto *orig_bb = bld.GetInsertBlock();
 
-    // Fetch several types for the current context.
+    // Fetch several types from the current context.
     auto *ptr_tp = llvm::PointerType::getUnqual(ctx);
     auto *i32_tp = bld.getInt32Ty();
     auto *void_tp = bld.getVoidTy();
@@ -701,11 +702,11 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t
 
         // Create the prototype for the current worker. The arguments are:
         //
-        // - int32 begin/end call indices,
+        // - int32 begin/end call range,
         // - tape pointer (read & write),
         // - par pointer (read-only),
         // - time pointer (read-only),
-        // - int32 current Taylor order.
+        // - int32 Taylor order.
         //
         // The pointer arguments cannot overlap.
         std::vector<llvm::Type *> worker_args{i32_tp, i32_tp, ptr_tp, ptr_tp, ptr_tp, i32_tp};
@@ -811,12 +812,12 @@ void taylor_cm_codegen_segment_diff_parallel(llvm_state &s, llvm::Type *fp_vec_t
     auto *tape_ptr = driver_f->args().begin();
     auto *par_ptr = driver_f->args().begin() + 1;
     auto *time_ptr = driver_f->args().begin() + 2;
-    auto *cur_order = driver_f->args().begin() + 3;
+    auto *order = driver_f->args().begin() + 3;
 
     // Invoke heyoka_taylor_cm_par_segment().
     llvm_invoke_external(s, "heyoka_taylor_cm_par_segment", void_tp,
                          {workers_ptr, ncalls_ptr, bld.getInt32(boost::numeric_cast<std::uint32_t>(seg_map.size())),
-                          tape_ptr, par_ptr, time_ptr, cur_order},
+                          tape_ptr, par_ptr, time_ptr, order},
                          llvm::AttributeList::get(ctx, llvm::AttributeList::FunctionIndex,
                                                   {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn}));
 }
@@ -950,6 +951,7 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint
         }
     }
 
+    // Generate the code for the computation of the Taylor derivatives.
     if (parallel_mode) {
         taylor_cm_codegen_segment_diff_parallel(s, fp_vec_type, seg_map, n_uvars);
     } else {

From 148b2fbecfc5d44a09d7deb731b1acf6c80d4433 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 10:07:05 +0200
Subject: [PATCH 09/30] Add a flag to the llvm_multi_state constructor to
 enable parallel JIT.

---
 include/heyoka/llvm_state.hpp | 16 +++++++-
 src/llvm_state.cpp            | 74 +++++++++++++++++++++++------------
 test/llvm_multi_state.cpp     | 13 ++++--
 3 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 23ebb8536..086e0c631 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -347,6 +347,19 @@ struct llvm_mc_value {
 std::optional<llvm_mc_value> llvm_state_mem_cache_lookup(const std::vector<std::string> &, unsigned);
 void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc_value);
 
+// The default setting for the parjit flag for llvm_multi_state (false on Linux ARM, true elsewhere).
+// There is evidence of an LLVM thread scheduling bug when parallel compilation
+// is active, that rarely results in multiply-defined symbols for external C
+// functions, which leads to compilation failure. So far, we have been able to
+// trigger this issue only on Linux aarch64.
+inline constexpr bool default_parjit =
+#if defined(HEYOKA_ARCH_ARM) && defined(__linux__)
+    false
+#else
+    true
+#endif
+    ;
+
 } // namespace detail
 
 class HEYOKA_DLL_PUBLIC llvm_multi_state
@@ -371,7 +384,7 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
 
 public:
     llvm_multi_state();
-    explicit llvm_multi_state(std::vector<llvm_state>);
+    explicit llvm_multi_state(std::vector<llvm_state>, bool = detail::default_parjit);
     template <typename R>
         requires std::ranges::input_range<R>
                  && std::same_as<llvm_state, std::remove_cvref_t<std::ranges::range_reference_t<R>>>
@@ -393,6 +406,7 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     [[nodiscard]] unsigned get_opt_level() const noexcept;
     [[nodiscard]] bool get_slp_vectorize() const noexcept;
     [[nodiscard]] code_model get_code_model() const noexcept;
+    [[nodiscard]] bool get_parjit() const noexcept;
 
     [[nodiscard]] std::vector<std::string> get_ir() const;
     [[nodiscard]] std::vector<std::string> get_bc() const;
diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp
index 8cf7139f3..da7908e14 100644
--- a/src/llvm_state.cpp
+++ b/src/llvm_state.cpp
@@ -1551,6 +1551,8 @@ struct multi_jit {
     // NOTE: this is the total number of modules, including
     // the master module.
     const unsigned m_n_modules = 0;
+    // Flag to signal that we are enabling parallel compilation.
+    const bool m_parjit;
     // NOTE: enumerate the LLVM members here in the same order
     // as llvm_state, as this is important to ensure proper
     // destruction order.
@@ -1570,7 +1572,7 @@ struct multi_jit {
     std::vector<std::string> m_ir_snapshots;
     std::vector<std::string> m_bc_snapshots;
 
-    explicit multi_jit(unsigned, unsigned, code_model, bool, bool);
+    explicit multi_jit(unsigned, unsigned, code_model, bool, bool, bool);
     multi_jit(const multi_jit &) = delete;
     multi_jit(multi_jit &&) noexcept = delete;
     llvm_multi_state &operator=(const multi_jit &) = delete;
@@ -1613,8 +1615,9 @@ constexpr auto master_module_name = "heyoka.master";
 
 // NOTE: this largely replicates the logic from the constructors of llvm_state and llvm_state::jit.
 // NOTE: make sure to coordinate changes in this constructor with llvm_state::jit.
-multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize)
-    : m_n_modules(n_modules)
+multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, bool force_avx512, bool slp_vectorize,
+                     bool parjit)
+    : m_n_modules(n_modules), m_parjit(parjit)
 {
     assert(n_modules >= 2u);
 
@@ -1637,31 +1640,37 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model,
     lljit_builder.setJITTargetMachineBuilder(jtmb);
 
 #if 0
-    // Create a task dispatcher.
-    auto tdisp = std::make_unique<tbb_task_dispatcher>();
 
-    // Create an ExecutorProcessControl.
-    auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp));
-    // LCOV_EXCL_START
-    if (!epc) {
-        auto err = epc.takeError();
+    if (m_parjit) {
+        // Create a task dispatcher.
+        auto tdisp = std::make_unique<tbb_task_dispatcher>();
 
-        std::string err_report;
-        llvm::raw_string_ostream ostr(err_report);
+        // Create an ExecutorProcessControl.
+        auto epc = llvm::orc::SelfExecutorProcessControl::Create(nullptr, std::move(tdisp));
+        // LCOV_EXCL_START
+        if (!epc) {
+            auto err = epc.takeError();
 
-        ostr << err;
+            std::string err_report;
+            llvm::raw_string_ostream ostr(err_report);
 
-        throw std::invalid_argument(
-            fmt::format("Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str()));
+            ostr << err;
+
+            throw std::invalid_argument(fmt::format(
+                "Could not create a SelfExecutorProcessControl. The full error message is:\n{}", ostr.str()));
+        }
+        // LCOV_EXCL_STOP
+
+        // Set it in the lljit builder.
+        lljit_builder.setExecutorProcessControl(std::move(*epc));
     }
-    // LCOV_EXCL_STOP
 
-    // Set it in the lljit builder.
-    lljit_builder.setExecutorProcessControl(std::move(*epc));
 #else
 
-    // Set the number of compilation threads.
-    lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency());
+    if (m_parjit) {
+        // Set the number of compilation threads.
+        lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency());
+    }
 
 #endif
 
@@ -1815,6 +1824,9 @@ struct llvm_multi_state::impl {
         // Store the states.
         ar << m_states;
 
+        // Store the parjit flag.
+        ar << m_jit->m_parjit;
+
         // Store the object files and the snapshots. These may be empty.
         ar << m_jit->m_object_files;
         ar << m_jit->m_ir_snapshots;
@@ -1837,10 +1849,14 @@ struct llvm_multi_state::impl {
 
         assert(!m_states.empty());
 
+        // Load the parjit flag.
+        bool parjit{};
+        ar >> parjit;
+
         // Reset the jit with a new one.
-        m_jit = std::make_unique<detail::multi_jit>(boost::safe_numerics::safe<unsigned>(m_states.size()) + 1,
-                                                    m_states[0].get_opt_level(), m_states[0].get_code_model(),
-                                                    m_states[0].force_avx512(), m_states[0].get_slp_vectorize());
+        m_jit = std::make_unique<detail::multi_jit>(
+            boost::safe_numerics::safe<unsigned>(m_states.size()) + 1, m_states[0].get_opt_level(),
+            m_states[0].get_code_model(), m_states[0].force_avx512(), m_states[0].get_slp_vectorize(), parjit);
 
         // Load the object files and the snapshots.
         ar >> m_jit->m_object_files;
@@ -1871,7 +1887,7 @@ struct llvm_multi_state::impl {
 
 llvm_multi_state::llvm_multi_state() = default;
 
-llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_)
+llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_, bool parjit)
 {
     // Fetch a const ref, as we want to make extra sure we do not modify
     // states_ until we move it to construct the impl.
@@ -1940,7 +1956,7 @@ llvm_multi_state::llvm_multi_state(std::vector<llvm_state> states_)
 
     // Create the multi_jit.
     auto jit = std::make_unique<detail::multi_jit>(boost::safe_numerics::safe<unsigned>(states.size()) + 1, opt_level,
-                                                   c_model, force_avx512, slp_vectorize);
+                                                   c_model, force_avx512, slp_vectorize, parjit);
 
     // Build and assign the implementation.
     impl imp{.m_states = std::move(states_), .m_jit = std::move(jit)};
@@ -1956,7 +1972,7 @@ llvm_multi_state::llvm_multi_state(const llvm_multi_state &other)
     impl imp{.m_states = other.m_impl->m_states,
              .m_jit = std::make_unique<detail::multi_jit>(other.m_impl->m_jit->m_n_modules, other.get_opt_level(),
                                                           other.get_code_model(), other.force_avx512(),
-                                                          other.get_slp_vectorize())};
+                                                          other.get_slp_vectorize(), other.get_parjit())};
     m_impl = std::make_unique<impl>(std::move(imp));
 
     if (other.is_compiled()) {
@@ -2131,6 +2147,11 @@ code_model llvm_multi_state::get_code_model() const noexcept
     return m_impl->m_states[0].get_code_model();
 }
 
+bool llvm_multi_state::get_parjit() const noexcept
+{
+    return m_impl->m_jit->m_parjit; // Fixed at construction: m_parjit is a const member of multi_jit.
+}
+
 bool llvm_multi_state::is_compiled() const noexcept
 {
     return !m_impl->m_jit->m_module;
@@ -2346,6 +2367,7 @@ std::ostream &operator<<(std::ostream &os, const llvm_multi_state &s)
     oss << "SLP vectorization : " << s.get_slp_vectorize() << '\n';
     oss << "Code model        : " << s.get_code_model() << '\n';
     oss << "Optimisation level: " << s.get_opt_level() << '\n';
+    oss << "Parallel JIT      : " << s.get_parjit() << '\n';
     oss << "Data layout       : " << s.m_impl->m_states[0].m_jitter->m_lljit->getDataLayout().getStringRepresentation()
         << '\n';
     oss << "Target triple     : " << s.m_impl->m_states[0].m_jitter->get_target_triple().str() << '\n';
diff --git a/test/llvm_multi_state.cpp b/test/llvm_multi_state.cpp
index bda7b476c..f8589b046 100644
--- a/test/llvm_multi_state.cpp
+++ b/test/llvm_multi_state.cpp
@@ -100,6 +100,7 @@ TEST_CASE("basic")
         REQUIRE(ms.get_code_model() == code_model::large);
         REQUIRE(ms.get_n_modules() == 5u);
         REQUIRE(!ms.is_compiled());
+        REQUIRE(ms.get_parjit() == detail::default_parjit);
 
         ms.compile();
 
@@ -121,7 +122,7 @@ TEST_CASE("basic")
         llvm_state s{kw::opt_level = 1u, kw::fast_math = true, kw::force_avx512 = true, kw::slp_vectorize = true,
                      kw::code_model = code_model::large};
 
-        llvm_multi_state ms{{s, s, s, s}};
+        llvm_multi_state ms{{s, s, s, s}, false};
 
         auto ms2 = std::move(ms);
 
@@ -132,6 +133,7 @@ TEST_CASE("basic")
         REQUIRE(ms2.get_code_model() == code_model::large);
         REQUIRE(ms2.get_n_modules() == 5u);
         REQUIRE(!ms2.is_compiled());
+        REQUIRE(!ms2.get_parjit());
 
         ms2.compile();
 
@@ -163,7 +165,7 @@ TEST_CASE("copy semantics")
     add_cfunc<double>(s1, "f1", {x * y}, {x, y}, kw::compact_mode = true);
     add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
 
-    llvm_multi_state ms{{s1, s2}};
+    llvm_multi_state ms{{s1, s2}, false};
 
     auto ms_copy = ms;
 
@@ -175,6 +177,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
     REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
     REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE(!ms_copy.get_parjit());
     REQUIRE_THROWS_MATCHES(
         ms_copy.get_object_code(), std::invalid_argument,
         Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
@@ -227,6 +230,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ms_copy2.get_opt_level() == ms.get_opt_level());
     REQUIRE(ms_copy2.get_slp_vectorize() == ms.get_slp_vectorize());
     REQUIRE(ms_copy2.get_code_model() == ms.get_code_model());
+    REQUIRE(!ms_copy2.get_parjit());
     REQUIRE_NOTHROW(ms_copy2.jit_lookup("f1"));
     REQUIRE_NOTHROW(ms_copy2.jit_lookup("f2"));
 
@@ -259,6 +263,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ms_copy3.get_opt_level() == ms.get_opt_level());
     REQUIRE(ms_copy3.get_slp_vectorize() == ms.get_slp_vectorize());
     REQUIRE(ms_copy3.get_code_model() == ms.get_code_model());
+    REQUIRE(!ms_copy3.get_parjit());
     REQUIRE_NOTHROW(ms_copy3.jit_lookup("f1"));
     REQUIRE_NOTHROW(ms_copy3.jit_lookup("f2"));
 
@@ -298,7 +303,7 @@ TEST_CASE("s11n")
     add_cfunc<double>(s2, "f2", {x / y}, {x, y}, kw::compact_mode = true);
 
     // Uncompiled.
-    llvm_multi_state ms{{s1, s2}};
+    llvm_multi_state ms{{s1, s2}, false};
 
     std::stringstream ss;
 
@@ -322,6 +327,7 @@ TEST_CASE("s11n")
     REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
     REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
     REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE(!ms_copy.get_parjit());
     REQUIRE_THROWS_MATCHES(
         ms_copy.get_object_code(), std::invalid_argument,
         Message("The function 'get_object_code' can be invoked only after the llvm_multi_state has been compiled"));
@@ -353,6 +359,7 @@ TEST_CASE("s11n")
     REQUIRE(ms_copy.get_opt_level() == ms.get_opt_level());
     REQUIRE(ms_copy.get_slp_vectorize() == ms.get_slp_vectorize());
     REQUIRE(ms_copy.get_code_model() == ms.get_code_model());
+    REQUIRE(!ms_copy.get_parjit());
     REQUIRE_NOTHROW(ms_copy.jit_lookup("f1"));
     REQUIRE_NOTHROW(ms_copy.jit_lookup("f2"));
 

From 5668e21f63ae48dec0e0faaabd430892f7b09435 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 10:09:32 +0200
Subject: [PATCH 10/30] Bump up the class s11n version numbers for the adaptive
 integrators.

---
 include/heyoka/taylor.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index b3de93017..be7d7f77c 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -1193,7 +1193,8 @@ namespace detail
 //      which resulted also in changes in the event detection data structure.
 // - 4: switched to pimpl implementation for i_data.
 // - 5: removed m_state_vars/m_rhs, variational ODE data.
-inline constexpr int taylor_adaptive_s11n_version = 5;
+// - 6: added parallel JIT compilation for compact mode.
+inline constexpr int taylor_adaptive_s11n_version = 6;
 
 // Boost s11n class version history for taylor_adaptive_batch:
 // - 1: added the m_state_vars and m_rhs members.
@@ -1201,7 +1202,8 @@ inline constexpr int taylor_adaptive_s11n_version = 5;
 //      which resulted also in changes in the event detection data structure.
 // - 3: switched to pimpl implementation for i_data.
 // - 4: removed m_state_vars/m_rhs, variational ODE data.
-inline constexpr int taylor_adaptive_batch_s11n_version = 4;
+// - 5: added parallel JIT compilation for compact mode.
+inline constexpr int taylor_adaptive_batch_s11n_version = 5;
 
 } // namespace detail
 

From 58071994525d3ddaed8faea9b6d2e7b4db721836 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 11:14:13 +0200
Subject: [PATCH 11/30] Internal doc bits, coverage fixes.

---
 src/detail/i_data.cpp         | 28 ++++++++++++++++++++--------
 src/taylor_adaptive_batch.cpp |  2 ++
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/detail/i_data.cpp b/src/detail/i_data.cpp
index 45e431c9a..3eaf66e31 100644
--- a/src/detail/i_data.cpp
+++ b/src/detail/i_data.cpp
@@ -103,13 +103,13 @@ void taylor_adaptive<T>::i_data::init_cm_tape()
     const auto [sz, al] = m_tape_sa;
 
     if (m_compact_mode) {
-        assert(sz != 0u);
-        assert(al != 0u);
+        assert(sz != 0u); // LCOV_EXCL_LINE
+        assert(al != 0u); // LCOV_EXCL_LINE
 
         m_tape = detail::make_aligned_buffer(sz, al);
     } else {
-        assert(sz == 0u);
-        assert(al == 0u);
+        assert(sz == 0u); // LCOV_EXCL_LINE
+        assert(al == 0u); // LCOV_EXCL_LINE
     }
 }
 
@@ -157,6 +157,9 @@ void taylor_adaptive<T>::i_data::load(boost::archive::binary_iarchive &ar, unsig
     ar >> m_tm_data;
 
     // Recover the function pointers.
+    // NOTE: here we are recovering only the dense output function pointer because recovering
+    // the correct stepper requires information which is available only from the integrator
+    // class (hence, we do it from there).
     m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
 
     // Reconstruct the compact mode tape, if necessary.
@@ -184,6 +187,9 @@ taylor_adaptive<T>::i_data::i_data(const i_data &other)
       m_tm_data(other.m_tm_data)
 {
     // Recover the function pointers.
+    // NOTE: here we are recovering only the dense output function pointer because recovering
+    // the correct stepper requires information which is available only from the integrator
+    // class (hence, we do it from there).
     m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
 
     // Init the compact mode tape, if necessary.
@@ -226,13 +232,13 @@ void taylor_adaptive_batch<T>::i_data::init_cm_tape()
     const auto [sz, al] = m_tape_sa;
 
     if (m_compact_mode) {
-        assert(sz != 0u);
-        assert(al != 0u);
+        assert(sz != 0u); // LCOV_EXCL_LINE
+        assert(al != 0u); // LCOV_EXCL_LINE
 
         m_tape = detail::make_aligned_buffer(sz, al);
     } else {
-        assert(sz == 0u);
-        assert(al == 0u);
+        assert(sz == 0u); // LCOV_EXCL_LINE
+        assert(al == 0u); // LCOV_EXCL_LINE
     }
 }
 
@@ -316,6 +322,9 @@ void taylor_adaptive_batch<T>::i_data::load(boost::archive::binary_iarchive &ar,
     ar >> m_tm_data;
 
     // Recover the function pointers.
+    // NOTE: here we are recovering only the dense output function pointer because recovering
+    // the correct stepper requires information which is available only from the integrator
+    // class (hence, we do it from there).
     m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
 
     // Reconstruct the compact mode tape, if necessary.
@@ -349,6 +358,9 @@ taylor_adaptive_batch<T>::i_data::i_data(const i_data &other)
       m_tm_data(other.m_tm_data)
 {
     // Recover the function pointers.
+    // NOTE: here we are recovering only the dense output function pointer because recovering
+    // the correct stepper requires information which is available only from the integrator
+    // class (hence, we do it from there).
     m_d_out_f = std::visit([](auto &s) { return reinterpret_cast<d_out_f_t>(s.jit_lookup("d_out_f")); }, m_llvm_state);
 
     // Init the compact mode tape, if necessary.
diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp
index 43e2f9788..1c9c6a042 100644
--- a/src/taylor_adaptive_batch.cpp
+++ b/src/taylor_adaptive_batch.cpp
@@ -2322,6 +2322,8 @@ void taylor_adaptive_batch<T>::check_variational(const char *fname) const
 }
 
 // Helper to fetch the stepper function from m_llvm_state.
+// NOTE: this is identical to the scalar integrator code.
+// Should we write a separate common helper for this at some point?
 template <typename T>
 void taylor_adaptive_batch<T>::assign_stepper(bool with_events)
 {

From ddccd2069b925ffae0ae270e55c0b22b4ea3cc2a Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 12:14:28 +0200
Subject: [PATCH 12/30] A couple of test additions.

---
 test/taylor_adaptive.cpp       | 20 ++++++++++++++++++++
 test/taylor_adaptive_batch.cpp | 20 ++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp
index 9229f4c44..150a2296f 100644
--- a/test/taylor_adaptive.cpp
+++ b/test/taylor_adaptive.cpp
@@ -1701,6 +1701,8 @@ void s11n_test_impl()
         REQUIRE(ta.get_tc() == ta_copy.get_tc());
         REQUIRE(ta.get_last_h() == ta_copy.get_last_h());
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
+        REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
+        REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
 
         REQUIRE(value_type_index(ta.get_t_events()[0].get_callback())
                 == value_type_index(ta_copy.get_t_events()[0].get_callback()));
@@ -1763,6 +1765,8 @@ void s11n_test_impl()
         REQUIRE(ta.get_tc() == ta_copy.get_tc());
         REQUIRE(ta.get_last_h() == ta_copy.get_last_h());
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
+        REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_ir() == std::get<0>(ta.get_llvm_state()).get_ir());
+        REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_bc() == std::get<0>(ta.get_llvm_state()).get_bc());
 
         // Take a step in ta and in ta_copy.
         ta.step(true);
@@ -1826,6 +1830,14 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_tol() == ta.get_tol());
     REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy());
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+
+    ta.step();
+    ta_copy.step();
+
+    REQUIRE(ta.get_state() == ta_copy.get_state());
+    REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
 
     ta_copy = taylor_adaptive<fp_t>{};
     ta_copy = ta;
@@ -1835,6 +1847,14 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_tol() == ta.get_tol());
     REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy());
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+
+    ta.step();
+    ta_copy.step();
+
+    REQUIRE(ta.get_state() == ta_copy.get_state());
+    REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
 }
 
 #if defined(HEYOKA_ARCH_PPC)
diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp
index acb4ab50a..047191b87 100644
--- a/test/taylor_adaptive_batch.cpp
+++ b/test/taylor_adaptive_batch.cpp
@@ -1083,6 +1083,8 @@ void s11n_test_impl()
         REQUIRE(ta.get_tc() == ta_copy.get_tc());
         REQUIRE(ta.get_last_h() == ta_copy.get_last_h());
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
+        REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
+        REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
 
         REQUIRE(ta.get_step_res() == ta_copy.get_step_res());
         REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res());
@@ -1154,6 +1156,8 @@ void s11n_test_impl()
         REQUIRE(ta.get_tc() == ta_copy.get_tc());
         REQUIRE(ta.get_last_h() == ta_copy.get_last_h());
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
+        REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_ir() == std::get<0>(ta.get_llvm_state()).get_ir());
+        REQUIRE(std::get<0>(ta_copy.get_llvm_state()).get_bc() == std::get<0>(ta.get_llvm_state()).get_bc());
 
         REQUIRE(value_type_index(ta.get_t_events()[0].get_callback())
                 == value_type_index(ta_copy.get_t_events()[0].get_callback()));
@@ -1717,6 +1721,14 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_tol() == ta.get_tol());
     REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy());
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+
+    ta.step();
+    ta_copy.step();
+
+    REQUIRE(ta.get_state() == ta_copy.get_state());
+    REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
 
     ta_copy = taylor_adaptive_batch<fp_t>{};
     ta_copy = ta;
@@ -1726,6 +1738,14 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_tol() == ta.get_tol());
     REQUIRE(ta_copy.get_high_accuracy() == ta.get_high_accuracy());
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+
+    ta.step();
+    ta_copy.step();
+
+    REQUIRE(ta.get_state() == ta_copy.get_state());
+    REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
 }
 
 // Test case for the propagate_*() functions not considering

From eec949abb7587268255f484c8481d0ba405e2f64 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 12:37:36 +0200
Subject: [PATCH 13/30] More test additions.

---
 test/taylor_adaptive.cpp       | 16 ++++++++++++++++
 test/taylor_adaptive_batch.cpp | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp
index 150a2296f..ee77593f5 100644
--- a/test/taylor_adaptive.cpp
+++ b/test/taylor_adaptive.cpp
@@ -1727,6 +1727,12 @@ void s11n_test_impl()
         ta_copy.update_d_output(-.1, true);
 
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
+
+        // Also run a propagation with continuous output to test that
+        // the m_tplt_state member is correctly copied.
+        auto prop_res = ta.propagate_for(10., kw::c_output = true);
+        auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true);
+        REQUIRE((*std::get<4>(prop_res))(4.1) == (*std::get<4>(prop_copy_res))(4.1));
     }
 
     // Test without events.
@@ -1839,6 +1845,12 @@ TEST_CASE("copy semantics")
     REQUIRE(ta.get_state() == ta_copy.get_state());
     REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
 
+    // Also run a propagation with continuous output to test that
+    // the m_tplt_state member is correctly copied.
+    auto prop_res = ta.propagate_for(10., kw::c_output = true);
+    auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true);
+    REQUIRE((*std::get<4>(prop_res))(4.1) == (*std::get<4>(prop_copy_res))(4.1));
+
     ta_copy = taylor_adaptive<fp_t>{};
     ta_copy = ta;
 
@@ -1855,6 +1867,10 @@ TEST_CASE("copy semantics")
 
     REQUIRE(ta.get_state() == ta_copy.get_state());
     REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
+
+    prop_res = ta.propagate_for(10., kw::c_output = true);
+    prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true);
+    REQUIRE((*std::get<4>(prop_res))(14.1) == (*std::get<4>(prop_copy_res))(14.1));
 }
 
 #if defined(HEYOKA_ARCH_PPC)
diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp
index 047191b87..a05788c6b 100644
--- a/test/taylor_adaptive_batch.cpp
+++ b/test/taylor_adaptive_batch.cpp
@@ -1105,6 +1105,12 @@ void s11n_test_impl()
         ta_copy.update_d_output({-.1, -.11}, true);
 
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
+
+        // Also run a propagation with continuous output to test that
+        // the m_tplt_state member is correctly copied.
+        auto prop_res = ta.propagate_for(10., kw::c_output = true);
+        auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true);
+        REQUIRE((*std::get<0>(prop_res))(4.1) == (*std::get<0>(prop_copy_res))(4.1));
     }
 
     // A test with events.
@@ -1730,6 +1736,12 @@ TEST_CASE("copy semantics")
     REQUIRE(ta.get_state() == ta_copy.get_state());
     REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
 
+    // Also run a propagation with continuous output to test that
+    // the m_tplt_state member is correctly copied.
+    auto prop_res = ta.propagate_for(10., kw::c_output = true);
+    auto prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true);
+    REQUIRE((*std::get<0>(prop_res))(4.1) == (*std::get<0>(prop_copy_res))(4.1));
+
     ta_copy = taylor_adaptive_batch<fp_t>{};
     ta_copy = ta;
 
@@ -1746,6 +1758,10 @@ TEST_CASE("copy semantics")
 
     REQUIRE(ta.get_state() == ta_copy.get_state());
     REQUIRE(ta.get_dtime() == ta_copy.get_dtime());
+
+    prop_res = ta.propagate_for(10., kw::c_output = true);
+    prop_copy_res = ta_copy.propagate_for(10., kw::c_output = true);
+    REQUIRE((*std::get<0>(prop_res))(14.1) == (*std::get<0>(prop_copy_res))(14.1));
 }
 
 // Test case for the propagate_*() functions not considering

From 1c0f1493ab2e0301657dbb9798488d68626ec01b Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 14:58:06 +0200
Subject: [PATCH 14/30] Make sure to test compact mode propagation too in the
 batch integrator.

---
 test/taylor_adaptive_batch.cpp | 820 +++++++++++++++++----------------
 1 file changed, 419 insertions(+), 401 deletions(-)

diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp
index a05788c6b..ec6cb755c 100644
--- a/test/taylor_adaptive_batch.cpp
+++ b/test/taylor_adaptive_batch.cpp
@@ -162,270 +162,282 @@ TEST_CASE("propagate grid")
 {
     using Catch::Matchers::Message;
 
-    auto [x, v] = make_vars("x", "v");
-
-    auto ta = taylor_adaptive_batch<double>{
-        {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, 4u};
-
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({}), std::invalid_argument,
-        Message(
-            "Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if the time grid is empty"));
-
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({1.}), std::invalid_argument,
-        Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
-                "the grid has a size of 1, which is not a multiple of the batch size (4)"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({1., 2.}), std::invalid_argument,
-        Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
-                "the grid has a size of 2, which is not a multiple of the batch size (4)"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({1., 2., 3., 4., 5.}), std::invalid_argument,
-        Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
-                "the grid has a size of 5, which is not a multiple of the batch size (4)"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({0., 0., 1., 4.}), std::invalid_argument,
-        Message("When invoking propagate_grid(), the first element of the time grid "
-                "must match the current time coordinate - however, the first element of the time grid at "
-                "batch index 2 has a "
-                "value of 1, while the current time coordinate is 0"));
-
-    ta.set_time({0., 0., std::numeric_limits<double>::infinity(), 0.});
-
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0.}), std::invalid_argument,
-                           Message("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if "
-                                   "the current time is not finite"));
-
-    ta.set_time({0., 0., 0., 0.});
+    for (auto cm : {true, false}) {
+        auto [x, v] = make_vars("x", "v");
 
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({0., 0., std::numeric_limits<double>::infinity(), 0.}), std::invalid_argument,
-        Message(
-            "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({0., 0., 0., 0., 0., std::numeric_limits<double>::infinity(), 0., 0.}), std::invalid_argument,
-        Message(
-            "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., -1., 1.}), std::invalid_argument,
-                           Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
-                                   "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., std::numeric_limits<double>::infinity()}),
-        std::invalid_argument,
-        Message(
-            "A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 0., 0., 2.}), std::invalid_argument,
-                           Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
-                                   "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 2.}), std::invalid_argument,
-                           Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
-                                   "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 0., 1., 1., 2., 2., 2., 2.}), std::invalid_argument,
-                           Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
-                                   "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 0., 2., 2., 2., 2.}), std::invalid_argument,
-                           Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
-                                   "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 2., 1., 2.}), std::invalid_argument,
-                           Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
-                                   "Taylor integrator in batch mode"));
-
-    // Set an infinity in the state.
-    ta.get_state_data()[0] = std::numeric_limits<double>::infinity();
-
-    auto [cb, ret] = ta.propagate_grid({.0, .0, .0, .0});
-    REQUIRE(!cb);
-    REQUIRE(ret.size() == 8u);
-    REQUIRE(std::get<0>(ta.get_propagate_res()[0]) == taylor_outcome::err_nf_state);
-    REQUIRE(std::get<0>(ta.get_propagate_res()[1]) == taylor_outcome::time_limit);
-    REQUIRE(std::get<0>(ta.get_propagate_res()[2]) == taylor_outcome::time_limit);
-    REQUIRE(std::get<0>(ta.get_propagate_res()[3]) == taylor_outcome::time_limit);
+        auto ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -9.8 * sin(x)},
+                                                {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253},
+                                                4u,
+                                                kw::compact_mode = cm};
 
-    // Reset the integrator.
-    ta = taylor_adaptive_batch<double>{
-        {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, 4u};
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({}), std::invalid_argument,
+                               Message("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode "
+                                       "if the time grid is empty"));
 
-    // Propagate to the initial time.
-    std::tie(cb, ret) = ta.propagate_grid({0., 0., 0., 0.});
-    REQUIRE(!cb);
-    REQUIRE(ret.size() == 8u);
-    REQUIRE(ret == std::vector{0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253});
-    for (auto i = 0u; i < 4u; ++i) {
-        auto [oc, min_h, max_h, nsteps] = ta.get_propagate_res()[i];
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid({1.}), std::invalid_argument,
+            Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
+                    "the grid has a size of 1, which is not a multiple of the batch size (4)"));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid({1., 2.}), std::invalid_argument,
+            Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
+                    "the grid has a size of 2, which is not a multiple of the batch size (4)"));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid({1., 2., 3., 4., 5.}), std::invalid_argument,
+            Message("Invalid grid size detected in propagate_grid() for an adaptive Taylor integrator in batch mode: "
+                    "the grid has a size of 5, which is not a multiple of the batch size (4)"));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid({0., 0., 1., 4.}), std::invalid_argument,
+            Message("When invoking propagate_grid(), the first element of the time grid "
+                    "must match the current time coordinate - however, the first element of the time grid at "
+                    "batch index 2 has a "
+                    "value of 1, while the current time coordinate is 0"));
 
-        REQUIRE(oc == taylor_outcome::time_limit);
-        REQUIRE(min_h == std::numeric_limits<double>::infinity());
-        REQUIRE(max_h == 0);
-        REQUIRE(nsteps == 0u);
-    }
+        ta.set_time({0., 0., std::numeric_limits<double>::infinity(), 0.});
 
-    // Switch to the harmonic oscillator.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid({0., 0., 0., 0.}), std::invalid_argument,
+            Message("Cannot invoke propagate_grid() in an adaptive Taylor integrator in batch mode if "
+                    "the current time is not finite"));
+
+        ta.set_time({0., 0., 0., 0.});
+
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., std::numeric_limits<double>::infinity(), 0.}),
+                               std::invalid_argument,
+                               Message("A non-finite time value was passed to propagate_grid() in an adaptive Taylor "
+                                       "integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 0., std::numeric_limits<double>::infinity(), 0., 0.}),
+                               std::invalid_argument,
+                               Message("A non-finite time value was passed to propagate_grid() in an adaptive Taylor "
+                                       "integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., -1., 1.}), std::invalid_argument,
+                               Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
+                                       "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., std::numeric_limits<double>::infinity()}),
+            std::invalid_argument,
+            Message("A non-finite time value was passed to propagate_grid() in an adaptive Taylor integrator in batch "
+                    "mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 0., 0., 2.}),
+                               std::invalid_argument,
+                               Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
+                                       "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 2.}),
+                               std::invalid_argument,
+                               Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
+                                       "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 0., 1., 1., 2., 2., 2., 2.}),
+                               std::invalid_argument,
+                               Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
+                                       "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 0., 2., 2., 2., 2.}),
+                               std::invalid_argument,
+                               Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
+                                       "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_grid({0., 0., 0., 0., 1., 1., 1., 1., 2., 2., 1., 2.}),
+                               std::invalid_argument,
+                               Message("A non-monotonic time grid was passed to propagate_grid() in an adaptive "
+                                       "Taylor integrator in batch mode"));
+
+        // Set an infinity in the state.
+        ta.get_state_data()[0] = std::numeric_limits<double>::infinity();
+
+        auto [cb, ret] = ta.propagate_grid({.0, .0, .0, .0});
+        REQUIRE(!cb);
+        REQUIRE(ret.size() == 8u);
+        REQUIRE(std::get<0>(ta.get_propagate_res()[0]) == taylor_outcome::err_nf_state);
+        REQUIRE(std::get<0>(ta.get_propagate_res()[1]) == taylor_outcome::time_limit);
+        REQUIRE(std::get<0>(ta.get_propagate_res()[2]) == taylor_outcome::time_limit);
+        REQUIRE(std::get<0>(ta.get_propagate_res()[3]) == taylor_outcome::time_limit);
+
+        // Reset the integrator.
+        ta = taylor_adaptive_batch<double>{
+            {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253}, 4u};
+
+        // Propagate to the initial time.
+        std::tie(cb, ret) = ta.propagate_grid({0., 0., 0., 0.});
+        REQUIRE(!cb);
+        REQUIRE(ret.size() == 8u);
+        REQUIRE(ret == std::vector{0.05, 0.025, 0.051, 0.0251, 0.052, 0.0252, 0.053, 0.0253});
+        for (auto i = 0u; i < 4u; ++i) {
+            auto [oc, min_h, max_h, nsteps] = ta.get_propagate_res()[i];
+
+            REQUIRE(oc == taylor_outcome::time_limit);
+            REQUIRE(min_h == std::numeric_limits<double>::infinity());
+            REQUIRE(max_h == 0);
+            REQUIRE(nsteps == 0u);
+        }
 
-    // Integrate forward over a dense grid from ~0 to ~10.
-    std::vector<double> grid;
-    for (auto i = 0u; i < 1000u; ++i) {
-        for (auto j = 0; j < 4; ++j) {
-            grid.push_back(i / 100.);
-            if (i != 0u) {
-                grid.back() += j / 10.;
+        // Switch to the harmonic oscillator.
+        ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
+
+        // Integrate forward over a dense grid from ~0 to ~10.
+        std::vector<double> grid;
+        for (auto i = 0u; i < 1000u; ++i) {
+            for (auto j = 0; j < 4; ++j) {
+                grid.push_back(i / 100.);
+                if (i != 0u) {
+                    grid.back() += j / 10.;
+                }
             }
         }
-    }
 
-    std::tie(cb, ret) = ta.propagate_grid(grid);
+        std::tie(cb, ret) = ta.propagate_grid(grid);
 
-    REQUIRE(!cb);
-    REQUIRE(ret.size() == 8000ull);
+        REQUIRE(!cb);
+        REQUIRE(ret.size() == 8000ull);
 
-    for (auto i = 0u; i < 4u; ++i) {
-        REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
-        REQUIRE(ta.get_time()[i] == grid[3996u + i]);
-    }
+        for (auto i = 0u; i < 4u; ++i) {
+            REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
+            REQUIRE(ta.get_time()[i] == grid[3996u + i]);
+        }
 
-    for (auto i = 0u; i < 1000u; ++i) {
-        for (auto j = 0u; j < 4u; ++j) {
-            REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.));
-            REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.));
+        for (auto i = 0u; i < 1000u; ++i) {
+            for (auto j = 0u; j < 4u; ++j) {
+                REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.));
+                REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.));
+            }
         }
-    }
 
-    // Do the same backwards.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
-    grid.clear();
-    for (auto i = 0u; i < 1000u; ++i) {
-        for (auto j = 0; j < 4; ++j) {
-            grid.push_back(i / -100.);
-            if (i != 0u) {
-                grid.back() += j / -10.;
+        // Do the same backwards.
+        ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
+        grid.clear();
+        for (auto i = 0u; i < 1000u; ++i) {
+            for (auto j = 0; j < 4; ++j) {
+                grid.push_back(i / -100.);
+                if (i != 0u) {
+                    grid.back() += j / -10.;
+                }
             }
         }
-    }
 
-    std::tie(cb, ret) = ta.propagate_grid(grid);
+        std::tie(cb, ret) = ta.propagate_grid(grid);
 
-    REQUIRE(!cb);
-    REQUIRE(ret.size() == 8000ull);
+        REQUIRE(!cb);
+        REQUIRE(ret.size() == 8000ull);
 
-    for (auto i = 0u; i < 4u; ++i) {
-        REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
-        REQUIRE(ta.get_time()[i] == grid[3996u + i]);
-    }
+        for (auto i = 0u; i < 4u; ++i) {
+            REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
+            REQUIRE(ta.get_time()[i] == grid[3996u + i]);
+        }
 
-    for (auto i = 0u; i < 1000u; ++i) {
-        for (auto j = 0u; j < 4u; ++j) {
-            REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.));
-            REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.));
+        for (auto i = 0u; i < 1000u; ++i) {
+            for (auto j = 0u; j < 4u; ++j) {
+                REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 10000.));
+                REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 10000.));
+            }
         }
-    }
 
-    // Random testing.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
-    std::fill(grid.begin(), grid.begin() + 4, 0.);
-    std::uniform_real_distribution<double> rdist(0., .1);
-    for (auto i = 1u; i < 1000u; ++i) {
-        for (auto j = 0u; j < 4u; ++j) {
-            grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng);
+        // Random testing.
+        ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
+        std::fill(grid.begin(), grid.begin() + 4, 0.);
+        std::uniform_real_distribution<double> rdist(0., .1);
+        for (auto i = 1u; i < 1000u; ++i) {
+            for (auto j = 0u; j < 4u; ++j) {
+                grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng);
+            }
         }
-    }
 
-    std::tie(cb, ret) = ta.propagate_grid(grid);
+        std::tie(cb, ret) = ta.propagate_grid(grid);
 
-    REQUIRE(!cb);
-    REQUIRE(ret.size() == 8000ull);
+        REQUIRE(!cb);
+        REQUIRE(ret.size() == 8000ull);
 
-    for (auto i = 0u; i < 4u; ++i) {
-        REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
-        REQUIRE(ta.get_time()[i] == grid[3996u + i]);
-    }
+        for (auto i = 0u; i < 4u; ++i) {
+            REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
+            REQUIRE(ta.get_time()[i] == grid[3996u + i]);
+        }
 
-    for (auto i = 0u; i < 1000u; ++i) {
-        for (auto j = 0u; j < 4u; ++j) {
-            REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 400000.));
-            REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 400000.));
+        for (auto i = 0u; i < 1000u; ++i) {
+            for (auto j = 0u; j < 4u; ++j) {
+                REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 400000.));
+                REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 400000.));
+            }
         }
-    }
 
-    // Do it backwards too.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
-    std::fill(grid.begin(), grid.begin() + 4, 0.);
-    rdist = std::uniform_real_distribution<double>(-.1, 0.);
-    for (auto i = 1u; i < 1000u; ++i) {
-        for (auto j = 0u; j < 4u; ++j) {
-            grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng);
+        // Do it backwards too.
+        ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0., 0., 0., 1., 1.1, 1.2, 1.3}, 4u};
+        std::fill(grid.begin(), grid.begin() + 4, 0.);
+        rdist = std::uniform_real_distribution<double>(-.1, 0.);
+        for (auto i = 1u; i < 1000u; ++i) {
+            for (auto j = 0u; j < 4u; ++j) {
+                grid[i * 4u + j] = grid[(i - 1u) * 4u + j] + rdist(rng);
+            }
         }
-    }
 
-    std::tie(cb, ret) = ta.propagate_grid(grid);
+        std::tie(cb, ret) = ta.propagate_grid(grid);
 
-    REQUIRE(!cb);
-    REQUIRE(ret.size() == 8000ull);
+        REQUIRE(!cb);
+        REQUIRE(ret.size() == 8000ull);
 
-    for (auto i = 0u; i < 4u; ++i) {
-        REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
-        REQUIRE(ta.get_time()[i] == grid[3996u + i]);
-    }
+        for (auto i = 0u; i < 4u; ++i) {
+            REQUIRE(std::get<0>(ta.get_propagate_res()[i]) == taylor_outcome::time_limit);
+            REQUIRE(ta.get_time()[i] == grid[3996u + i]);
+        }
 
-    for (auto i = 0u; i < 1000u; ++i) {
-        for (auto j = 0u; j < 4u; ++j) {
-            REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 800000.));
-            REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 800000.));
+        for (auto i = 0u; i < 1000u; ++i) {
+            for (auto j = 0u; j < 4u; ++j) {
+                REQUIRE(ret[8u * i + j] == approximately((1 + j / 10.) * std::sin(grid[i * 4u + j]), 800000.));
+                REQUIRE(ret[8u * i + j + 4u] == approximately((1 + j / 10.) * std::cos(grid[i * 4u + j]), 800000.));
+            }
         }
-    }
 
-    // Test the callback is moved.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4};
-    step_callback_batch<double> f_cb_grid(cb_functor_grid{});
-    value_ptr<cb_functor_grid>(f_cb_grid)->n_copies_after = value_ptr<cb_functor_grid>(f_cb_grid)->n_copies;
-    auto [out_cb, _] = ta.propagate_grid({0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.},
-                                         kw::callback = std::move(f_cb_grid));
-    // Invoke again the callback to ensure no copies have been made.
-    out_cb(ta);
-    REQUIRE(value_isa<cb_functor_grid>(out_cb));
-
-    // Do the same test with the range overload, moving in the callbacks initially stored
-    // in a range. This will check that the logic that converts the input range into
-    // a step callback does proper forwarding.
-    std::vector cf_vec = {cb_functor_grid{}, cb_functor_grid{}};
-    cf_vec[0].n_copies_after = cf_vec[0].n_copies;
-    cf_vec[1].n_copies_after = cf_vec[1].n_copies;
-    std::tie(out_cb, _) = ta.propagate_grid(
-        {100., 100., 100., 100., 101., 101., 101., 101., 102., 102., 102., 102.},
-        kw::callback
-        = cf_vec | std::views::transform([](cb_functor_grid &c) -> cb_functor_grid && { return std::move(c); }));
-    out_cb(ta);
-    REQUIRE(value_isa<step_callback_batch_set<double>>(out_cb));
-    REQUIRE(value_isa<cb_functor_grid>(value_ref<step_callback_batch_set<double>>(out_cb)[0]));
-
-    // Callback attempts to change the time coordinate.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4};
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid(
-            {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, kw::callback =
-                                                                              [](auto &tint) {
-                                                                                  tint.set_time(-100.);
+        // Test the callback is moved.
+        ta = taylor_adaptive_batch<double>{
+            {prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4};
+        step_callback_batch<double> f_cb_grid(cb_functor_grid{});
+        value_ptr<cb_functor_grid>(f_cb_grid)->n_copies_after = value_ptr<cb_functor_grid>(f_cb_grid)->n_copies;
+        auto [out_cb, _] = ta.propagate_grid({0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.},
+                                             kw::callback = std::move(f_cb_grid));
+        // Invoke the callback again to ensure that no copies have been made.
+        out_cb(ta);
+        REQUIRE(value_isa<cb_functor_grid>(out_cb));
 
-                                                                                  return true;
-                                                                              }),
-        std::runtime_error,
-        Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the "
-                "time coordinate of the integrator - this is not supported"));
+        // Do the same test with the range overload, moving in the callbacks initially stored
+        // in a range. This will check that the logic that converts the input range into
+        // a step callback does proper forwarding.
+        std::vector cf_vec = {cb_functor_grid{}, cb_functor_grid{}};
+        cf_vec[0].n_copies_after = cf_vec[0].n_copies;
+        cf_vec[1].n_copies_after = cf_vec[1].n_copies;
+        std::tie(out_cb, _) = ta.propagate_grid(
+            {100., 100., 100., 100., 101., 101., 101., 101., 102., 102., 102., 102.},
+            kw::callback
+            = cf_vec | std::views::transform([](cb_functor_grid &c) -> cb_functor_grid && { return std::move(c); }));
+        out_cb(ta);
+        REQUIRE(value_isa<step_callback_batch_set<double>>(out_cb));
+        REQUIRE(value_isa<cb_functor_grid>(value_ref<step_callback_batch_set<double>>(out_cb)[0]));
 
-    // Try also with a single time coord.
-    ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4};
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_grid(
-            {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.},
-            kw::callback =
-                [](auto &tint) {
-                    tint.set_time({tint.get_time()[0], -100., tint.get_time()[2], tint.get_time()[3]});
-
-                    return true;
-                }),
-        std::runtime_error,
-        Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the "
-                "time coordinate of the integrator - this is not supported"));
+        // Callback attempts to change the time coordinate.
+        ta = taylor_adaptive_batch<double>{
+            {prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4};
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid(
+                {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.}, kw::callback =
+                                                                                  [](auto &tint) {
+                                                                                      tint.set_time(-100.);
+
+                                                                                      return true;
+                                                                                  }),
+            std::runtime_error,
+            Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the "
+                    "time coordinate of the integrator - this is not supported"));
+
+        // Try also with a single time coord.
+        ta = taylor_adaptive_batch<double>{
+            {prime(x) = v, prime(v) = -x}, {0., 0.01, 0.02, 0.03, 1., 1.01, 1.02, 1.03}, 4};
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_grid(
+                {0., 0., 0., 0., 10., 10., 10., 10., 100., 100., 100., 100.},
+                kw::callback =
+                    [](auto &tint) {
+                        tint.set_time({tint.get_time()[0], -100., tint.get_time()[2], tint.get_time()[3]});
+
+                        return true;
+                    }),
+            std::runtime_error,
+            Message("The invocation of the callback passed to propagate_grid() resulted in the alteration of the "
+                    "time coordinate of the integrator - this is not supported"));
+    }
 }
 
 // A test to make sure the propagate functions deal correctly
@@ -518,196 +530,202 @@ TEST_CASE("propagate for_until")
 
     auto [x, v] = make_vars("x", "v");
 
-    auto ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.06, 0.025, 0.026}, 2u};
-    auto ta_copy = ta;
+    for (auto cm : {true, false}) {
+        auto ta = taylor_adaptive_batch<double>{
+            {prime(x) = v, prime(v) = -9.8 * sin(x)}, {0.05, 0.06, 0.025, 0.026}, 2u, kw::compact_mode = cm};
+        auto ta_copy = ta;
 
-    // Error modes.
-    REQUIRE_THROWS_MATCHES(ta.propagate_until({0., std::numeric_limits<double>::infinity()}), std::invalid_argument,
-                           Message("A non-finite time was passed to the propagate_until() function of an adaptive "
-                                   "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_until({10., 11.}, kw::max_delta_t = std::vector<double>{1}), std::invalid_argument,
-        Message("Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, "
+        // Error modes.
+        REQUIRE_THROWS_MATCHES(ta.propagate_until({0., std::numeric_limits<double>::infinity()}), std::invalid_argument,
+                               Message("A non-finite time was passed to the propagate_until() function of an adaptive "
+                                       "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_until({10., 11.}, kw::max_delta_t = std::vector<double>{1}), std::invalid_argument,
+            Message(
+                "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, "
                 "but the number of specified timesteps is 1"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_until({10., 11.}, kw::max_delta_t = {1., 2., 3.}), std::invalid_argument,
-        Message("Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, "
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_until({10., 11.}, kw::max_delta_t = {1., 2., 3.}), std::invalid_argument,
+            Message(
+                "Invalid number of max timesteps specified in a Taylor integrator in batch mode: the batch size is 2, "
                 "but the number of specified timesteps is 3"));
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_until({10., 11.}, kw::max_delta_t = {1., std::numeric_limits<double>::quiet_NaN()}),
-        std::invalid_argument,
-        Message("A nan max_delta_t was passed to the propagate_until() function of an adaptive "
-                "Taylor integrator in batch mode"));
-    REQUIRE_THROWS_MATCHES(ta.propagate_until({10., 11.}, kw::max_delta_t = {1., -1.}), std::invalid_argument,
-                           Message("A non-positive max_delta_t was passed to the propagate_until() function of an "
-                                   "adaptive Taylor integrator in batch mode"));
-
-    ta.set_time({0., std::numeric_limits<double>::lowest()});
-
-    REQUIRE_THROWS_MATCHES(
-        ta.propagate_until({10., std::numeric_limits<double>::max()}, kw::max_delta_t = std::vector<double>{}),
-        std::invalid_argument,
-        Message("The final time passed to the propagate_until() function of an adaptive Taylor "
-                "integrator in batch mode results in an overflow condition"));
-
-    ta.set_time({0., 0.});
-
-    // Propagate forward in time limiting the timestep size and passing in a callback.
-    auto counter0 = 0ul, counter1 = counter0;
-
-    auto cb = [&counter0, &counter1](taylor_adaptive_batch<double> &t) {
-        if (t.get_last_h()[0] != 0) {
-            ++counter0;
-        }
-        if (t.get_last_h()[1] != 0) {
-            ++counter1;
-        }
-
-        return true;
-    };
-
-    ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb);
-    ta_copy.propagate_until({10., 11.});
-
-    REQUIRE(ta.get_time() == std::vector{10., 11.});
-    REQUIRE(counter0 == 100000ul);
-    REQUIRE(counter1 == 220000ul);
-    REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
-
-    REQUIRE(ta_copy.get_time() == std::vector{10., 11.});
-    REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
-
-    REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
-    REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
-    REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
-    REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
-
-    // Scalar input time.
-    auto ta_copy2 = ta, ta_copy3 = ta;
-    ta_copy2.propagate_until(20.);
-    ta_copy3.propagate_until({20., 20.});
-    REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
-
-    // Try also with max_delta_t.
-    ta_copy2.propagate_until(30., kw::max_delta_t = std::vector{1e-4, 5e-5});
-    ta_copy3.propagate_until({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5});
-    REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
-
-    // Do propagate_for() too.
-    ta.propagate_for({10., 11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb);
-    ta_copy.propagate_for({10., 11.});
-
-    // Scalar input time.
-    ta_copy2.propagate_for(20.);
-    ta_copy3.propagate_for({20., 20.});
-    REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
-
-    // Try also with max_delta_t.
-    ta_copy2.propagate_for(30., kw::max_delta_t = std::vector{1e-4, 5e-5});
-    ta_copy3.propagate_for({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5});
-    REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
-
-    REQUIRE(ta.get_time() == std::vector{20., 22.});
-    REQUIRE(counter0 == 200000ul);
-    REQUIRE(counter1 == 440000ul);
-    REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
-
-    REQUIRE(ta_copy.get_time() == std::vector{20., 22.});
-    REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
-
-    REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
-    REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
-    REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
-    REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_until({10., 11.}, kw::max_delta_t = {1., std::numeric_limits<double>::quiet_NaN()}),
+            std::invalid_argument,
+            Message("A nan max_delta_t was passed to the propagate_until() function of an adaptive "
+                    "Taylor integrator in batch mode"));
+        REQUIRE_THROWS_MATCHES(ta.propagate_until({10., 11.}, kw::max_delta_t = {1., -1.}), std::invalid_argument,
+                               Message("A non-positive max_delta_t was passed to the propagate_until() function of an "
+                                       "adaptive Taylor integrator in batch mode"));
 
-    // Do backwards in time too.
-    ta.propagate_for({-10., -11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb);
-    ta_copy.propagate_for({-10., -11.});
+        ta.set_time({0., std::numeric_limits<double>::lowest()});
 
-    REQUIRE(ta.get_time() == std::vector{10., 11.});
-    REQUIRE(counter0 == 300000ul);
-    REQUIRE(counter1 == 660000ul);
-    REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+        REQUIRE_THROWS_MATCHES(
+            ta.propagate_until({10., std::numeric_limits<double>::max()}, kw::max_delta_t = std::vector<double>{}),
+            std::invalid_argument,
+            Message("The final time passed to the propagate_until() function of an adaptive Taylor "
+                    "integrator in batch mode results in an overflow condition"));
 
-    REQUIRE(ta_copy.get_time() == std::vector{10., 11.});
-    REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+        ta.set_time({0., 0.});
 
-    REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
-    REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
-    REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
-    REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+        // Propagate forward in time limiting the timestep size and passing in a callback.
+        auto counter0 = 0ul, counter1 = counter0;
 
-    ta.propagate_until({0., 0.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb);
-    ta_copy.propagate_until({0., 0.});
+        auto cb = [&counter0, &counter1](taylor_adaptive_batch<double> &t) {
+            if (t.get_last_h()[0] != 0) {
+                ++counter0;
+            }
+            if (t.get_last_h()[1] != 0) {
+                ++counter1;
+            }
 
-    REQUIRE(ta.get_time() == std::vector{0., 0.});
-    REQUIRE(counter0 == 400000ul);
-    REQUIRE(counter1 == 880000ul);
-    REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+            return true;
+        };
 
-    REQUIRE(ta_copy.get_time() == std::vector{0., 0.});
-    REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
-                        [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+        ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb);
+        ta_copy.propagate_until({10., 11.});
+
+        REQUIRE(ta.get_time() == std::vector{10., 11.});
+        REQUIRE(counter0 == 100000ul);
+        REQUIRE(counter1 == 220000ul);
+        REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta_copy.get_time() == std::vector{10., 11.});
+        REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
+        REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
+        REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
+        REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+
+        // Scalar input time.
+        auto ta_copy2 = ta, ta_copy3 = ta;
+        ta_copy2.propagate_until(20.);
+        ta_copy3.propagate_until({20., 20.});
+        REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
+
+        // Try also with max_delta_t.
+        ta_copy2.propagate_until(30., kw::max_delta_t = std::vector{1e-4, 5e-5});
+        ta_copy3.propagate_until({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5});
+        REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
+
+        // Do propagate_for() too.
+        ta.propagate_for({10., 11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb);
+        ta_copy.propagate_for({10., 11.});
+
+        // Scalar input time.
+        ta_copy2.propagate_for(20.);
+        ta_copy3.propagate_for({20., 20.});
+        REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
+
+        // Try also with max_delta_t.
+        ta_copy2.propagate_for(30., kw::max_delta_t = std::vector{1e-4, 5e-5});
+        ta_copy3.propagate_for({30., 30.}, kw::max_delta_t = std::vector{1e-4, 5e-5});
+        REQUIRE(ta_copy2.get_state() == ta_copy3.get_state());
+
+        REQUIRE(ta.get_time() == std::vector{20., 22.});
+        REQUIRE(counter0 == 200000ul);
+        REQUIRE(counter1 == 440000ul);
+        REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta_copy.get_time() == std::vector{20., 22.});
+        REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
+        REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
+        REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
+        REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+
+        // Do backwards in time too.
+        ta.propagate_for({-10., -11.}, kw::max_delta_t = std::vector{1e-4, 5e-5}, kw::callback = cb);
+        ta_copy.propagate_for({-10., -11.});
+
+        REQUIRE(ta.get_time() == std::vector{10., 11.});
+        REQUIRE(counter0 == 300000ul);
+        REQUIRE(counter1 == 660000ul);
+        REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta_copy.get_time() == std::vector{10., 11.});
+        REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
+        REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
+        REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
+        REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+
+        ta.propagate_until({0., 0.}, kw::max_delta_t = {1e-4, 5e-5}, kw::callback = cb);
+        ta_copy.propagate_until({0., 0.});
+
+        REQUIRE(ta.get_time() == std::vector{0., 0.});
+        REQUIRE(counter0 == 400000ul);
+        REQUIRE(counter1 == 880000ul);
+        REQUIRE(std::all_of(ta.get_propagate_res().begin(), ta.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta_copy.get_time() == std::vector{0., 0.});
+        REQUIRE(std::all_of(ta_copy.get_propagate_res().begin(), ta_copy.get_propagate_res().end(),
+                            [](const auto &t) { return std::get<0>(t) == taylor_outcome::time_limit; }));
+
+        REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
+        REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
+        REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
+        REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+
+        // Try with scalar max_delta_t.
+        ta_copy = ta;
+        ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 1e-4});
+        ta_copy.propagate_until({10., 11.}, kw::max_delta_t = 1e-4);
+        REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res());
 
-    REQUIRE(ta.get_state()[0] == approximately(ta_copy.get_state()[0], 1000.));
-    REQUIRE(ta.get_state()[1] == approximately(ta_copy.get_state()[1], 1000.));
-    REQUIRE(ta.get_state()[2] == approximately(ta_copy.get_state()[2], 1000.));
-    REQUIRE(ta.get_state()[3] == approximately(ta_copy.get_state()[3], 1000.));
+        ta.propagate_for({10., 11.}, kw::max_delta_t = {1e-4, 1e-4});
+        ta_copy.propagate_for({10., 11.}, kw::max_delta_t = 1e-4);
+        REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res());
 
-    // Try with scalar max_delta_t.
-    ta_copy = ta;
-    ta.propagate_until({10., 11.}, kw::max_delta_t = {1e-4, 1e-4});
-    ta_copy.propagate_until({10., 11.}, kw::max_delta_t = 1e-4);
-    REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res());
-
-    ta.propagate_for({10., 11.}, kw::max_delta_t = {1e-4, 1e-4});
-    ta_copy.propagate_for({10., 11.}, kw::max_delta_t = 1e-4);
-    REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res());
-
-    // Test the callback is moved.
-    step_callback_batch<double> f_cb_until(cb_functor_until{});
-    value_ptr<cb_functor_until>(f_cb_until)->n_copies_after = value_ptr<cb_functor_until>(f_cb_until)->n_copies;
-    auto [_, out_cb] = ta.propagate_until(20., kw::callback = std::move(f_cb_until));
-    // Invoke again the callback to ensure no copies have been made.
-    out_cb(ta);
-
-    step_callback_batch<double> f_cb_for(cb_functor_for{});
-    value_ptr<cb_functor_for>(f_cb_for)->n_copies_after = value_ptr<cb_functor_for>(f_cb_for)->n_copies;
-    std::tie(_, out_cb) = ta.propagate_for(10., kw::callback = std::move(f_cb_for));
-    out_cb(ta);
-    REQUIRE(value_isa<cb_functor_for>(out_cb));
-
-    // Do the same test with the range overload, moving in the callbacks initially stored
-    // in a range. This will check that the logic that converts the input range into
-    // a step callback does proper forwarding.
-    {
-        std::vector cf_vec = {cb_functor_for{}, cb_functor_for{}};
-        cf_vec[0].n_copies_after = cf_vec[0].n_copies;
-        cf_vec[1].n_copies_after = cf_vec[1].n_copies;
-        std::tie(_, out_cb) = ta.propagate_for(
-            10., kw::callback
-                 = cf_vec | std::views::transform([](cb_functor_for &c) -> cb_functor_for && { return std::move(c); }));
+        // Test the callback is moved.
+        step_callback_batch<double> f_cb_until(cb_functor_until{});
+        value_ptr<cb_functor_until>(f_cb_until)->n_copies_after = value_ptr<cb_functor_until>(f_cb_until)->n_copies;
+        auto [_, out_cb] = ta.propagate_until(20., kw::callback = std::move(f_cb_until));
+        // Invoke again the callback to ensure no copies have been made.
         out_cb(ta);
-        REQUIRE(value_isa<step_callback_batch_set<double>>(out_cb));
-    }
 
-    {
-        std::vector cf_vec = {cb_functor_until{}, cb_functor_until{}};
-        cf_vec[0].n_copies_after = cf_vec[0].n_copies;
-        cf_vec[1].n_copies_after = cf_vec[1].n_copies;
-        std::tie(_, out_cb) = ta.propagate_until(
-            50., kw::callback = cf_vec | std::views::transform([](cb_functor_until &c) -> cb_functor_until && {
-                                    return std::move(c);
-                                }));
+        step_callback_batch<double> f_cb_for(cb_functor_for{});
+        value_ptr<cb_functor_for>(f_cb_for)->n_copies_after = value_ptr<cb_functor_for>(f_cb_for)->n_copies;
+        std::tie(_, out_cb) = ta.propagate_for(10., kw::callback = std::move(f_cb_for));
         out_cb(ta);
-        REQUIRE(value_isa<step_callback_batch_set<double>>(out_cb));
+        REQUIRE(value_isa<cb_functor_for>(out_cb));
+
+        // Do the same test with the range overload, moving in the callbacks initially stored
+        // in a range. This will check that the logic that converts the input range into
+        // a step callback does proper forwarding.
+        {
+            std::vector cf_vec = {cb_functor_for{}, cb_functor_for{}};
+            cf_vec[0].n_copies_after = cf_vec[0].n_copies;
+            cf_vec[1].n_copies_after = cf_vec[1].n_copies;
+            std::tie(_, out_cb) = ta.propagate_for(
+                10., kw::callback = cf_vec | std::views::transform([](cb_functor_for &c) -> cb_functor_for && {
+                                        return std::move(c);
+                                    }));
+            out_cb(ta);
+            REQUIRE(value_isa<step_callback_batch_set<double>>(out_cb));
+        }
+
+        {
+            std::vector cf_vec = {cb_functor_until{}, cb_functor_until{}};
+            cf_vec[0].n_copies_after = cf_vec[0].n_copies;
+            cf_vec[1].n_copies_after = cf_vec[1].n_copies;
+            std::tie(_, out_cb) = ta.propagate_until(
+                50., kw::callback = cf_vec | std::views::transform([](cb_functor_until &c) -> cb_functor_until && {
+                                        return std::move(c);
+                                    }));
+            out_cb(ta);
+            REQUIRE(value_isa<step_callback_batch_set<double>>(out_cb));
+        }
     }
 }
 

From dc64f66ec06697211161dd747175f9cb0eee9029 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 15:08:02 +0200
Subject: [PATCH 15/30] Fix CMake warning when building the benchmarks.

---
 benchmark/CMakeLists.txt | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 8f3af7bc2..c01f48243 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,6 +1,16 @@
+# NOTE: we look for Boost in CONFIG mode first, as that has become the officially supported way
+# of locating Boost in recent Boost/CMake versions. If we fail, we try again in
+# MODULE mode as a last resort.
 # NOTE: don't find a specific version as we already checked
 # outside that the Boost version is appropriate.
-find_package(Boost REQUIRED COMPONENTS program_options)
+find_package(Boost QUIET COMPONENTS program_options CONFIG)
+if(NOT Boost_FOUND)
+    message(STATUS "Boost not found in CONFIG mode, retrying in MODULE mode.")
+    find_package(Boost QUIET MODULE COMPONENTS program_options)
+endif()
+if(NOT Boost_FOUND)
+    message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.")
+endif()
 if(NOT TARGET Boost::program_options)
     message(STATUS "The 'Boost::program_options' imported target is missing, creating it.")
     add_library(Boost::program_options UNKNOWN IMPORTED)

From 9a2827f72c58cd216b1eb67ed6ce2fcf8f95f345 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Wed, 28 Aug 2024 15:17:41 +0200
Subject: [PATCH 16/30] Fix n body creation benchmark.

---
 benchmark/n_body_creation.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmark/n_body_creation.cpp b/benchmark/n_body_creation.cpp
index 5da4e508c..9b166f0b0 100644
--- a/benchmark/n_body_creation.cpp
+++ b/benchmark/n_body_creation.cpp
@@ -10,6 +10,7 @@
 #include <cstdint>
 #include <iostream>
 #include <numeric>
+#include <variant>
 #include <vector>
 
 #include <boost/program_options.hpp>
@@ -56,7 +57,13 @@ int main(int argc, char *argv[])
         std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)
             .count());
 
-    std::cout << ta.get_llvm_state().get_ir() << '\n';
+    if (compact_mode) {
+        for (const auto &ir : std::get<1>(ta.get_llvm_state()).get_ir()) { // By reference: no per-element copy.
+            std::cout << ir << '\n';
+        }
+    } else {
+        std::cout << std::get<0>(ta.get_llvm_state()).get_ir() << '\n';
+    }
 
     auto counter = 0u;
     for (const auto &ex : ta.get_decomposition()) {

From 70a93bd0e1ad6f02901789fb7d29606b61e0fbce Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 09:29:20 +0200
Subject: [PATCH 17/30] Doc tweaks.

---
 doc/install.rst                | 2 +-
 doc/tut_extended_precision.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/install.rst b/doc/install.rst
index 7beed7bf4..cad4aeac9 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -70,7 +70,7 @@ heyoka (and all its dependencies) have been compiled with a compiler supporting
 128-bit precision
 ^^^^^^^^^^^^^^^^^
 
-On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit ARM),
+On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit Linux ARM),
 quadruple-precision integrations are always supported via ``long double``. Otherwise,
 on platforms such as x86-64, quadruple-precision computations are supported if:
 
diff --git a/doc/tut_extended_precision.rst b/doc/tut_extended_precision.rst
index 00e0cb77c..a01d9acef 100644
--- a/doc/tut_extended_precision.rst
+++ b/doc/tut_extended_precision.rst
@@ -11,7 +11,7 @@ not only in single and double precision, but also in extended precision. Specifi
 
 How these extended precision floating-point types can be accessed and used from C++ varies depending on the platform. The 80-bit
 extended-precision format is available as the C++ ``long double`` type on most platforms based on Intel x86 processors. Quadruple-precision
-computations are supported either via the ``long double`` type (e.g., on 64-bit ARM processors) or via the the :cpp:class:`mppp::real128` type
+computations are supported either via the ``long double`` type (e.g., on 64-bit Linux ARM) or via the :cpp:class:`mppp::real128` type
 (provided that the platform supports the nonstandard ``__float128`` floating-point type and that heyoka was compiled with support
 for the mp++ library - see the :ref:`installation instructions <installation>`).
 

From 49509ce22d94624c6f4342c50aaa3bccc0123d2a Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 10:05:53 +0200
Subject: [PATCH 18/30] Add the option of specifying whether or not to enable
 parallel JIT compilation to the integrators and to cfunc.

---
 include/heyoka/expression.hpp  | 20 +++++++++--
 include/heyoka/kw.hpp          |  3 +-
 include/heyoka/llvm_state.hpp  |  3 +-
 include/heyoka/taylor.hpp      | 27 +++++++++++----
 src/cfunc_class.cpp            | 10 +++---
 src/expression_cfunc.cpp       | 11 +++---
 src/taylor_adaptive.cpp        |  4 +--
 src/taylor_adaptive_batch.cpp  |  4 +--
 test/cfunc.cpp                 | 15 ++++-----
 test/make_multi_cfunc.cpp      | 61 +++++++++++++++++++---------------
 test/taylor_adaptive.cpp       | 10 ++++--
 test/taylor_adaptive_batch.cpp | 16 ++++++---
 12 files changed, 118 insertions(+), 66 deletions(-)

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index a5c099c12..72b01683b 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -699,7 +699,7 @@ auto cfunc_common_opts(const KwArgs &...kw_args)
 template <typename>
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
 make_multi_cfunc(llvm_state, const std::string &, const std::vector<expression> &, const std::vector<expression> &,
-                 std::uint32_t, bool, bool, long long);
+                 std::uint32_t, bool, bool, long long, bool);
 
 } // namespace detail
 
@@ -818,13 +818,27 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS cfunc
             }
         }();
 
+        // Parallel JIT compilation.
+        auto parjit = [&p]() -> bool {
+            if constexpr (p.has(kw::parjit)) {
+                if constexpr (std::integral<std::remove_cvref_t<decltype(p(kw::parjit))>>) {
+                    return static_cast<bool>(p(kw::parjit));
+                } else {
+                    static_assert(detail::always_false_v<T>, "Invalid type for the 'parjit' keyword argument.");
+                }
+            } else {
+                return detail::default_parjit;
+            }
+        }();
+
         // Build the template llvm_state from the keyword arguments.
         llvm_state s(kw_args...);
 
-        return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec);
+        return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec,
+                               parjit);
     }
     explicit cfunc(std::vector<expression>, std::vector<expression>,
-                   std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool>);
+                   std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool, bool>);
 
     HEYOKA_DLL_LOCAL void check_valid(const char *) const;
 
diff --git a/include/heyoka/kw.hpp b/include/heyoka/kw.hpp
index 8c8276e21..b7c8c163c 100644
--- a/include/heyoka/kw.hpp
+++ b/include/heyoka/kw.hpp
@@ -21,7 +21,7 @@ HEYOKA_BEGIN_NAMESPACE
 namespace kw
 {
 
-// llvm_state.
+// llvm_state/llvm_multi_state.
 IGOR_MAKE_NAMED_ARGUMENT(mname);
 IGOR_MAKE_NAMED_ARGUMENT(opt_level);
 IGOR_MAKE_NAMED_ARGUMENT(fast_math);
@@ -34,6 +34,7 @@ IGOR_MAKE_NAMED_ARGUMENT(fast_math);
 IGOR_MAKE_NAMED_ARGUMENT(force_avx512);
 IGOR_MAKE_NAMED_ARGUMENT(slp_vectorize);
 IGOR_MAKE_NAMED_ARGUMENT(code_model);
+IGOR_MAKE_NAMED_ARGUMENT(parjit);
 
 // cfunc API.
 IGOR_MAKE_NAMED_ARGUMENT(batch_size);
diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 086e0c631..759f624e4 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -388,7 +388,8 @@ class HEYOKA_DLL_PUBLIC llvm_multi_state
     template <typename R>
         requires std::ranges::input_range<R>
                  && std::same_as<llvm_state, std::remove_cvref_t<std::ranges::range_reference_t<R>>>
-    explicit llvm_multi_state(R &&rng) : llvm_multi_state(std::vector(std::ranges::begin(rng), std::ranges::end(rng)))
+    explicit llvm_multi_state(R &&rng, bool parjit = detail::default_parjit)
+        : llvm_multi_state(std::vector(std::ranges::begin(rng), std::ranges::end(rng)), parjit)
     {
     }
     llvm_multi_state(const llvm_multi_state &);
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index be7d7f77c..fd7f65003 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -235,7 +235,20 @@ auto taylor_adaptive_common_ops(const KwArgs &...kw_args)
         }
     }();
 
-    return std::tuple{high_accuracy, std::move(tol), compact_mode, std::move(pars), parallel_mode};
+    // Parallel JIT compilation.
+    auto parjit = [&p]() -> bool {
+        if constexpr (p.has(kw::parjit)) {
+            if constexpr (std::integral<std::remove_cvref_t<decltype(p(kw::parjit))>>) {
+                return static_cast<bool>(p(kw::parjit));
+            } else {
+                static_assert(always_false_v<T>, "Invalid type for the 'parjit' keyword argument.");
+            }
+        } else {
+            return default_parjit;
+        }
+    }();
+
+    return std::tuple{high_accuracy, std::move(tol), compact_mode, std::move(pars), parallel_mode, parjit};
 }
 
 // Small helper to construct a default value for the max_delta_t
@@ -443,7 +456,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada
     // Private implementation-detail constructor machinery.
     using sys_t = std::variant<std::vector<std::pair<expression, expression>>, var_ode_sys>;
     void finalise_ctor_impl(sys_t, std::vector<T>, std::optional<T>, std::optional<T>, bool, bool, std::vector<T>,
-                            std::vector<t_event_t>, std::vector<nt_event_t>, bool, std::optional<long long>);
+                            std::vector<t_event_t>, std::vector<nt_event_t>, bool, std::optional<long long>, bool);
     template <typename... KwArgs>
     void finalise_ctor(sys_t sys, std::vector<T> state, const KwArgs &...kw_args)
     {
@@ -463,7 +476,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada
                 }
             }();
 
-            auto [high_accuracy, tol, compact_mode, pars, parallel_mode]
+            auto [high_accuracy, tol, compact_mode, pars, parallel_mode, parjit]
                 = detail::taylor_adaptive_common_ops<T>(kw_args...);
 
             // Extract the terminal events, if any.
@@ -499,7 +512,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive : public detail::taylor_ada
 
             finalise_ctor_impl(std::move(sys), std::move(state), std::move(tm), std::move(tol), high_accuracy,
                                compact_mode, std::move(pars), std::move(tes), std::move(ntes), parallel_mode,
-                               std::move(prec));
+                               std::move(prec), parjit);
         }
     }
 
@@ -853,7 +866,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch
     // Private implementation-detail constructor machinery.
     using sys_t = std::variant<std::vector<std::pair<expression, expression>>, var_ode_sys>;
     void finalise_ctor_impl(sys_t, std::vector<T>, std::uint32_t, std::vector<T>, std::optional<T>, bool, bool,
-                            std::vector<T>, std::vector<t_event_t>, std::vector<nt_event_t>, bool);
+                            std::vector<T>, std::vector<t_event_t>, std::vector<nt_event_t>, bool, bool);
     template <typename... KwArgs>
     void finalise_ctor(sys_t sys, std::vector<T> state, std::uint32_t batch_size, const KwArgs &...kw_args)
     {
@@ -875,7 +888,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch
                 }
             }();
 
-            auto [high_accuracy, tol, compact_mode, pars, parallel_mode]
+            auto [high_accuracy, tol, compact_mode, pars, parallel_mode, parjit]
                 = detail::taylor_adaptive_common_ops<T>(kw_args...);
 
             // Extract the terminal events, if any.
@@ -898,7 +911,7 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS taylor_adaptive_batch
 
             finalise_ctor_impl(std::move(sys), std::move(state), batch_size, std::move(tm), std::move(tol),
                                high_accuracy, compact_mode, std::move(pars), std::move(tes), std::move(ntes),
-                               parallel_mode);
+                               parallel_mode, parjit);
         }
     }
 
diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp
index 93882b47c..9bcc50b2d 100644
--- a/src/cfunc_class.cpp
+++ b/src/cfunc_class.cpp
@@ -182,7 +182,7 @@ struct cfunc<T>::impl {
     // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
     explicit impl(std::vector<expression> fn, std::vector<expression> vars, llvm_state s,
                   std::optional<std::uint32_t> batch_size, bool high_accuracy, bool compact_mode, bool parallel_mode,
-                  long long prec, bool check_prec)
+                  long long prec, bool check_prec, bool parjit)
         : m_fn(std::move(fn)), m_vars(std::move(vars)), m_states(std::array{s, s, s}), m_prec(prec),
           m_check_prec(check_prec), m_high_accuracy(high_accuracy), m_compact_mode(compact_mode),
           m_parallel_mode(parallel_mode)
@@ -207,7 +207,7 @@ struct cfunc<T>::impl {
         if (compact_mode) {
             // Build the multi cfunc, and assign the internal members.
             std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(
-                std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec);
+                std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec, parjit);
 
             // Compile.
             std::get<1>(m_states).compile();
@@ -308,15 +308,15 @@ template <typename T>
 // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
 cfunc<T>::cfunc(std::vector<expression> fn, std::vector<expression> vars,
                 // NOLINTNEXTLINE(performance-unnecessary-value-param)
-                std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool> tup)
+                std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool, bool> tup)
 {
     // Unpack the tuple.
-    auto &[high_accuracy, compact_mode, parallel_mode, prec, batch_size, s, check_prec] = tup;
+    auto &[high_accuracy, compact_mode, parallel_mode, prec, batch_size, s, check_prec, parjit] = tup;
 
     // Construct the impl.
     // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
     m_impl = std::make_unique<impl>(std::move(fn), std::move(vars), std::move(s), batch_size, high_accuracy,
-                                    compact_mode, parallel_mode, prec, check_prec);
+                                    compact_mode, parallel_mode, prec, check_prec, parjit);
 }
 
 template <typename T>
diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
index 5eeb93538..940586166 100644
--- a/src/expression_cfunc.cpp
+++ b/src/expression_cfunc.cpp
@@ -2035,7 +2035,7 @@ std::array<std::size_t, 2> add_multi_cfunc_impl(llvm::Type *fp_t, std::list<llvm
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
 make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::string &name,
                       const std::vector<expression> &fn, const std::vector<expression> &vars, std::uint32_t batch_size,
-                      bool high_accuracy, bool parallel_mode)
+                      bool high_accuracy, bool parallel_mode, bool parjit)
 {
     if (batch_size == 0u) [[unlikely]] {
         throw std::invalid_argument("The batch size of a compiled function cannot be zero");
@@ -2264,7 +2264,8 @@ make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::strin
     //
     // https://en.cppreference.com/w/cpp/ranges/as_rvalue_view
     return std::make_tuple(
-        llvm_multi_state(states_lists[0] | std::views::transform([](auto &s) -> auto && { return std::move(s); })),
+        llvm_multi_state(states_lists[0] | std::views::transform([](auto &s) -> auto && { return std::move(s); }),
+                         parjit),
         std::move(dc), std::move(tape_size_align));
 }
 
@@ -2293,7 +2294,7 @@ template <typename T>
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
 make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector<expression> &fn,
                  const std::vector<expression> &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode,
-                 long long prec)
+                 long long prec, bool parjit)
 {
 #if defined(HEYOKA_ARCH_PPC)
     if constexpr (std::is_same_v<T, long double>) {
@@ -2320,7 +2321,7 @@ make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector<exp
     // this throughout the rest of the implementation.
     auto *fp_t = to_internal_llvm_type<T>(tplt, prec);
 
-    return make_multi_cfunc_impl(fp_t, tplt, name, fn, vars, batch_size, high_accuracy, parallel_mode);
+    return make_multi_cfunc_impl(fp_t, tplt, name, fn, vars, batch_size, high_accuracy, parallel_mode, parjit);
 }
 
 // Explicit instantiations.
@@ -2328,7 +2329,7 @@ make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector<exp
     template HEYOKA_DLL_PUBLIC                                                                                         \
         std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>                 \
         make_multi_cfunc<T>(llvm_state, const std::string &, const std::vector<expression> &,                          \
-                            const std::vector<expression> &, std::uint32_t, bool, bool, long long);
+                            const std::vector<expression> &, std::uint32_t, bool, bool, long long, bool);
 
 HEYOKA_MAKE_MULTI_CFUNC_INST(float)
 HEYOKA_MAKE_MULTI_CFUNC_INST(double)
diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp
index 5416a9278..d6513d550 100644
--- a/src/taylor_adaptive.cpp
+++ b/src/taylor_adaptive.cpp
@@ -174,7 +174,7 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
                                             std::optional<T> time, std::optional<T> tol, bool high_accuracy,
                                             bool compact_mode, std::vector<T> pars, std::vector<t_event_t> tes,
                                             std::vector<nt_event_t> ntes, bool parallel_mode,
-                                            [[maybe_unused]] std::optional<long long> prec)
+                                            [[maybe_unused]] std::optional<long long> prec, bool parjit)
 {
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_state);
     HEYOKA_TAYLOR_REF_FROM_I_DATA(m_pars);
@@ -461,7 +461,7 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
         std::ranges::reverse(states);
 
         // Create the multi state and assign it.
-        m_llvm_state = llvm_multi_state(std::move(states));
+        m_llvm_state = llvm_multi_state(std::move(states), parjit);
 
         // Compile.
         std::get<1>(m_llvm_state).compile();
diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp
index 1c9c6a042..b29660e53 100644
--- a/src/taylor_adaptive_batch.cpp
+++ b/src/taylor_adaptive_batch.cpp
@@ -76,7 +76,7 @@ template <typename T>
 void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state, std::uint32_t batch_size,
                                                   std::vector<T> time, std::optional<T> tol, bool high_accuracy,
                                                   bool compact_mode, std::vector<T> pars, std::vector<t_event_t> tes,
-                                                  std::vector<nt_event_t> ntes, bool parallel_mode)
+                                                  std::vector<nt_event_t> ntes, bool parallel_mode, bool parjit)
 {
     // NOTE: this must hold because tol == 0 is interpreted
     // as undefined in finalise_ctor().
@@ -309,7 +309,7 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
         std::ranges::reverse(states);
 
         // Create the multi state and assign it.
-        m_llvm_state = llvm_multi_state(std::move(states));
+        m_llvm_state = llvm_multi_state(std::move(states), parjit);
 
         // Compile.
         std::get<1>(m_llvm_state).compile();
diff --git a/test/cfunc.cpp b/test/cfunc.cpp
index 4bd0ff645..8db7096a8 100644
--- a/test/cfunc.cpp
+++ b/test/cfunc.cpp
@@ -37,6 +37,7 @@
 
 #include <heyoka/expression.hpp>
 #include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
 #include <heyoka/math/time.hpp>
 #include <heyoka/s11n.hpp>
 
@@ -135,13 +136,9 @@ TEST_CASE("basic")
         REQUIRE(cf0.get_fn() == std::vector{x + y, x - y});
         REQUIRE(cf0.get_vars() == std::vector{y, x});
         REQUIRE(!cf0.get_dc().empty());
-        if (cf0.get_compact_mode()) {
-            REQUIRE(std::get<1>(cf0.get_llvm_states()).get_opt_level() == 3u);
-        } else {
-            REQUIRE(std::get<0>(cf0.get_llvm_states())[0].get_opt_level() == 3u);
-            REQUIRE(std::get<0>(cf0.get_llvm_states())[1].get_opt_level() == 3u);
-            REQUIRE(std::get<0>(cf0.get_llvm_states())[2].get_opt_level() == 3u);
-        }
+        REQUIRE(std::get<0>(cf0.get_llvm_states())[0].get_opt_level() == 3u);
+        REQUIRE(std::get<0>(cf0.get_llvm_states())[1].get_opt_level() == 3u);
+        REQUIRE(std::get<0>(cf0.get_llvm_states())[2].get_opt_level() == 3u);
         REQUIRE(cf0.get_high_accuracy() == false);
         REQUIRE(cf0.get_compact_mode() == false);
         REQUIRE(cf0.get_parallel_mode() == true);
@@ -165,13 +162,15 @@ TEST_CASE("basic")
                           kw::batch_size = custom_batch_size,
                           kw::opt_level = opt_level,
                           kw::high_accuracy = high_accuracy,
-                          kw::compact_mode = compact_mode};
+                          kw::compact_mode = compact_mode,
+                          kw::parjit = detail::default_parjit};
 
         REQUIRE(cf0.get_fn() == std::vector{x + y, x - y});
         REQUIRE(cf0.get_vars() == std::vector{y, x});
         REQUIRE(!cf0.get_dc().empty());
         if (cf0.get_compact_mode()) {
             REQUIRE(std::get<1>(cf0.get_llvm_states()).get_opt_level() == opt_level);
+            REQUIRE(std::get<1>(cf0.get_llvm_states()).get_parjit() == detail::default_parjit);
         } else {
             REQUIRE(std::get<0>(cf0.get_llvm_states())[0].get_opt_level() == opt_level);
             REQUIRE(std::get<0>(cf0.get_llvm_states())[1].get_opt_level() == opt_level);
diff --git a/test/make_multi_cfunc.cpp b/test/make_multi_cfunc.cpp
index 46c8cfc1d..53b0ff0d9 100644
--- a/test/make_multi_cfunc.cpp
+++ b/test/make_multi_cfunc.cpp
@@ -33,6 +33,7 @@
 #include <heyoka/exceptions.hpp>
 #include <heyoka/expression.hpp>
 #include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
 #include <heyoka/math/time.hpp>
 #include <heyoka/model/nbody.hpp>
 #include <heyoka/model/sgp4.hpp>
@@ -86,7 +87,8 @@ TEST_CASE("basic")
         llvm_state tplt{kw::opt_level = opt_level};
 
         auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", {x + y + heyoka::time, x - y - par[0]},
-                                                             {x, y}, 1, false, false, 0);
+                                                             {x, y}, 1, false, false, 0, detail::default_parjit);
+        REQUIRE(ms.get_parjit() == detail::default_parjit);
 
         REQUIRE(sa.size() == 1u);
 
@@ -135,7 +137,7 @@ TEST_CASE("basic")
         llvm_state tplt{kw::opt_level = opt_level};
 
         auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", {x + y + heyoka::time, x - y - par[0]},
-                                                             {x, y}, 2, false, false, 0);
+                                                             {x, y}, 2, false, false, 0, detail::default_parjit);
 
         REQUIRE(sa.size() == 2u);
 
@@ -213,8 +215,9 @@ TEST_CASE("sgp4")
 
     llvm_state tplt;
 
-    auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", model::sgp4(),
-                                                         std::vector(inputs.begin(), inputs.end()), 1, false, false, 0);
+    auto [ms, dc, sa]
+        = detail::make_multi_cfunc<double>(tplt, "test", model::sgp4(), std::vector(inputs.begin(), inputs.end()), 1,
+                                           false, false, 0, detail::default_parjit);
 
     REQUIRE(sa.size() == 1u);
 
@@ -281,7 +284,8 @@ TEST_CASE("nbody")
             std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; });
             std::ranges::sort(vars, std::less<expression>{});
 
-            auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", exs, vars, batch_size, false, false, 0);
+            auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", exs, vars, batch_size, false, false, 0,
+                                                                 detail::default_parjit);
 
             ms.compile();
 
@@ -402,7 +406,8 @@ TEST_CASE("nbody mp")
         std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; });
         std::ranges::sort(vars, std::less<expression>{});
 
-        auto [ms, dc, sa] = detail::make_multi_cfunc<mppp::real>(tplt, "test", exs, vars, 1, false, false, prec);
+        auto [ms, dc, sa] = detail::make_multi_cfunc<mppp::real>(tplt, "test", exs, vars, 1, false, false, prec,
+                                                                 detail::default_parjit);
 
         ms.compile();
 
@@ -507,7 +512,8 @@ TEST_CASE("nbody par")
             std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; });
             std::ranges::sort(vars, std::less<expression>{});
 
-            auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", exs, vars, batch_size, false, false, 0);
+            auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", exs, vars, batch_size, false, false, 0,
+                                                                 detail::default_parjit);
 
             ms.compile();
 
@@ -701,7 +707,8 @@ TEST_CASE("nbody par mp")
         std::ranges::transform(sys, std::back_inserter(vars), [](const auto &p) { return p.first; });
         std::ranges::sort(vars, std::less<expression>{});
 
-        auto [ms, dc, sa] = detail::make_multi_cfunc<mppp::real>(tplt, "test", exs, vars, 1, false, false, prec);
+        auto [ms, dc, sa] = detail::make_multi_cfunc<mppp::real>(tplt, "test", exs, vars, 1, false, false, prec,
+                                                                 detail::default_parjit);
 
         ms.compile();
 
@@ -858,8 +865,8 @@ TEST_CASE("numparams")
 
             std::generate(pars.begin(), pars.end(), gen);
 
-            auto [ms, dc, sa]
-                = detail::make_multi_cfunc<double>(tplt, "test", {1_dbl, par[0]}, {}, batch_size, false, false, 0);
+            auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", {1_dbl, par[0]}, {}, batch_size, false,
+                                                                 false, 0, detail::default_parjit);
 
             REQUIRE(((batch_size == 1u && sa.size() == 1u) || (batch_size > 1u && sa.size() == 2u)));
 
@@ -936,7 +943,7 @@ TEST_CASE("numparams mp")
         std::generate(outs.begin(), outs.end(), gen);
 
         auto [ms, dc, sa] = detail::make_multi_cfunc<mppp::real>(tplt, "test", {1_dbl, par[0], par[1], -2_dbl}, {}, 1,
-                                                                 false, false, prec);
+                                                                 false, false, prec, detail::default_parjit);
 
         ms.compile();
 
@@ -1006,8 +1013,9 @@ TEST_CASE("bogus stride")
         std::generate(ins.begin(), ins.end(), gen);
         std::generate(pars.begin(), pars.end(), gen);
 
-        auto [ms, dc, sa] = detail::make_multi_cfunc<double>(tplt, "test", {x + 2_dbl * y + par[0] * z, par[1] - x * y},
-                                                             {x, y, z}, batch_size, false, false, 0);
+        auto [ms, dc, sa]
+            = detail::make_multi_cfunc<double>(tplt, "test", {x + 2_dbl * y + par[0] * z, par[1] - x * y}, {x, y, z},
+                                               batch_size, false, false, 0, detail::default_parjit);
 
         ms.compile();
 
@@ -1047,13 +1055,13 @@ TEST_CASE("failure modes")
 {
     using Catch::Matchers::Message;
 
-    REQUIRE_THROWS_MATCHES(
-        detail::make_multi_cfunc<double>(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 0, false, false, 0),
-        std::invalid_argument, Message("The batch size of a compiled function cannot be zero"));
+    REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc<double>(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 0, false, false,
+                                                            0, detail::default_parjit),
+                           std::invalid_argument, Message("The batch size of a compiled function cannot be zero"));
 
-    REQUIRE_THROWS_MATCHES(
-        detail::make_multi_cfunc<double>(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, true, 0),
-        std::invalid_argument, Message("Parallel mode has not been implemented yet"));
+    REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc<double>(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, true,
+                                                            0, detail::default_parjit),
+                           std::invalid_argument, Message("Parallel mode has not been implemented yet"));
 
 #if defined(HEYOKA_ARCH_PPC)
 
@@ -1065,16 +1073,17 @@ TEST_CASE("failure modes")
 
 #if defined(HEYOKA_HAVE_REAL)
 
-    REQUIRE_THROWS_MATCHES(
-        detail::make_multi_cfunc<mppp::real>(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false, false, 0),
-        std::invalid_argument,
-        Message(fmt::format("An invalid precision value of 0 was passed to make_multi_cfunc() (the "
-                            "value must be in the [{}, {}] range)",
-                            mppp::real_prec_min(), mppp::real_prec_max())));
+    REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc<mppp::real>(llvm_state{}, "cfunc", {1_dbl, par[0]}, {}, 1, false,
+                                                                false, 0, detail::default_parjit),
+                           std::invalid_argument,
+                           Message(fmt::format("An invalid precision value of 0 was passed to make_multi_cfunc() (the "
+                                               "value must be in the [{}, {}] range)",
+                                               mppp::real_prec_min(), mppp::real_prec_max())));
 
 #endif
 
-    REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc<double>(llvm_state{}, "", {1_dbl, par[0]}, {}, 1, false, false, 0),
+    REQUIRE_THROWS_MATCHES(detail::make_multi_cfunc<double>(llvm_state{}, "", {1_dbl, par[0]}, {}, 1, false, false, 0,
+                                                            detail::default_parjit),
                            std::invalid_argument,
                            Message("A non-empty function name is required when invoking make_multi_cfunc()"));
 }
diff --git a/test/taylor_adaptive.cpp b/test/taylor_adaptive.cpp
index ee77593f5..9aa195674 100644
--- a/test/taylor_adaptive.cpp
+++ b/test/taylor_adaptive.cpp
@@ -44,6 +44,7 @@
 #include <heyoka/expression.hpp>
 #include <heyoka/func.hpp>
 #include <heyoka/kw.hpp>
+#include <heyoka/llvm_state.hpp>
 #include <heyoka/math/pow.hpp>
 #include <heyoka/math/prod.hpp>
 #include <heyoka/math/sin.hpp>
@@ -1657,7 +1658,8 @@ void s11n_test_impl()
                                           kw::nt_events = {nt_event<double>(v - par[0], s11n_nt_cb{})},
                                           kw::pars = std::vector<double>{-1e-4},
                                           kw::high_accuracy = true,
-                                          kw::compact_mode = true};
+                                          kw::compact_mode = true,
+                                          kw::parjit = detail::default_parjit};
 
         REQUIRE(ta.get_tol() == std::numeric_limits<double>::epsilon());
         REQUIRE(ta.get_high_accuracy());
@@ -1703,6 +1705,7 @@ void s11n_test_impl()
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
         REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
         REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+        REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit());
 
         REQUIRE(value_type_index(ta.get_t_events()[0].get_callback())
                 == value_type_index(ta_copy.get_t_events()[0].get_callback()));
@@ -1827,7 +1830,8 @@ TEST_CASE("copy semantics")
                                     kw::pars = std::vector<fp_t>{-1e-4},
                                     kw::high_accuracy = true,
                                     kw::compact_mode = true,
-                                    kw::tol = 1e-11};
+                                    kw::tol = 1e-11,
+                                    kw::parjit = detail::default_parjit};
 
     auto ta_copy = ta;
 
@@ -1838,6 +1842,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit());
 
     ta.step();
     ta_copy.step();
@@ -1861,6 +1866,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit());
 
     ta.step();
     ta_copy.step();
diff --git a/test/taylor_adaptive_batch.cpp b/test/taylor_adaptive_batch.cpp
index ec6cb755c..979e6823e 100644
--- a/test/taylor_adaptive_batch.cpp
+++ b/test/taylor_adaptive_batch.cpp
@@ -1059,9 +1059,13 @@ void s11n_test_impl()
 
     // Test without events.
     {
-        auto ta = taylor_adaptive_batch<double>{
-            {prime(x) = v, prime(v) = -9.8 * sin(x + par[0])}, {0., 0.01, 0.5, 0.51},    2u,
-            kw::pars = std::vector<double>{-1e-4, -1.1e-4},    kw::high_accuracy = true, kw::compact_mode = true};
+        auto ta = taylor_adaptive_batch<double>{{prime(x) = v, prime(v) = -9.8 * sin(x + par[0])},
+                                                {0., 0.01, 0.5, 0.51},
+                                                2u,
+                                                kw::pars = std::vector<double>{-1e-4, -1.1e-4},
+                                                kw::high_accuracy = true,
+                                                kw::compact_mode = true,
+                                                kw::parjit = detail::default_parjit};
 
         REQUIRE(ta.get_tol() == std::numeric_limits<double>::epsilon());
         REQUIRE(ta.get_high_accuracy());
@@ -1103,6 +1107,7 @@ void s11n_test_impl()
         REQUIRE(ta.get_d_output() == ta_copy.get_d_output());
         REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
         REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+        REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit());
 
         REQUIRE(ta.get_step_res() == ta_copy.get_step_res());
         REQUIRE(ta.get_propagate_res() == ta_copy.get_propagate_res());
@@ -1736,7 +1741,8 @@ TEST_CASE("copy semantics")
                                           kw::pars = std::vector<fp_t>{-1e-4, -1e-4},
                                           kw::high_accuracy = true,
                                           kw::compact_mode = true,
-                                          kw::tol = 1e-11};
+                                          kw::tol = 1e-11,
+                                          kw::parjit = detail::default_parjit};
 
     auto ta_copy = ta;
 
@@ -1747,6 +1753,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit());
 
     ta.step();
     ta_copy.step();
@@ -1770,6 +1777,7 @@ TEST_CASE("copy semantics")
     REQUIRE(ta_copy.get_compact_mode() == ta.get_compact_mode());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_ir() == std::get<1>(ta.get_llvm_state()).get_ir());
     REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_bc() == std::get<1>(ta.get_llvm_state()).get_bc());
+    REQUIRE(std::get<1>(ta_copy.get_llvm_state()).get_parjit() == std::get<1>(ta.get_llvm_state()).get_parjit());
 
     ta.step();
     ta_copy.step();

From 209646f1f5fc5500252ca522beffbe8da5c70c0a Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 10:07:48 +0200
Subject: [PATCH 19/30] Add benchmark for the compilation time of SGP4
 dynamics.

---
 benchmark/CMakeLists.txt    |  1 +
 benchmark/sgp4_dynamics.cpp | 91 +++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 benchmark/sgp4_dynamics.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index c01f48243..ae9ab8a3f 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -83,6 +83,7 @@ ADD_HEYOKA_BENCHMARK(sims_flanagan_jac)
 ADD_HEYOKA_BENCHMARK(cfunc_mt)
 ADD_HEYOKA_BENCHMARK(diff_tensors)
 ADD_HEYOKA_BENCHMARK(var_construction)
+ADD_HEYOKA_BENCHMARK(sgp4_dynamics)
 
 if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
   ADD_HEYOKA_BENCHMARK(pendulum_mp)
diff --git a/benchmark/sgp4_dynamics.cpp b/benchmark/sgp4_dynamics.cpp
new file mode 100644
index 000000000..2bb8bd782
--- /dev/null
+++ b/benchmark/sgp4_dynamics.cpp
@@ -0,0 +1,91 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <utility>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
+#include <heyoka/math/sum.hpp>
+#include <heyoka/math/time.hpp>
+#include <heyoka/model/sgp4.hpp>
+#include <heyoka/taylor.hpp>
+
+#include <heyoka/logging.hpp>
+
+using namespace heyoka;
+
+std::vector<std::pair<heyoka::expression, heyoka::expression>> construct_sgp4_ode()
+{
+    // Fetch sgp4's formulae.
+    auto sgp4_func = heyoka::model::sgp4();
+
+    // The variable representing tsince in the sgp4 formulae.
+    const auto tsince = heyoka::expression("tsince");
+
+    // In sgp4_func, replace the TLE data with params, and tsince
+    // with tsince + par[7].
+    sgp4_func = heyoka::subs(sgp4_func, {{"n0", heyoka::par[0]},
+                                         {"e0", heyoka::par[1]},
+                                         {"i0", heyoka::par[2]},
+                                         {"node0", heyoka::par[3]},
+                                         {"omega0", heyoka::par[4]},
+                                         {"m0", heyoka::par[5]},
+                                         {"bstar", heyoka::par[6]},
+                                         {"tsince", tsince + heyoka::par[7]}});
+
+    // Compute the rhs of the sgp4 ODE, substituting tsince with the time placeholder.
+    const auto dt = heyoka::diff_tensors(sgp4_func, {tsince});
+    auto sgp4_rhs = heyoka::subs(dt.get_jacobian(), {{tsince, heyoka::time}});
+
+    // Create the state variables for the ODE.
+    auto [x, y, z, vx, vy, vz, e, r] = heyoka::make_vars("x", "y", "z", "vx", "vy", "vz", "e", "r");
+
+    // Add the differential equation for r.
+    // NOTE: do **not** use vx/vy/vz here. Apparently, in the SGP4 algorithm, if one takes the
+    // time derivatives of x/y/z one does not get *exactly* the same values as the vx/vy/vz returned
+    // by SGP4. In order for the differential equation for r to be correct, we need the true time
+    // derivatives of x/y/z, and we cannot use what SGP4 says are the velocities.
+    sgp4_rhs.push_back(heyoka::sum({x * sgp4_rhs[0], y * sgp4_rhs[1], z * sgp4_rhs[2]}) / r);
+
+    // Return the ODE sys.
+    using heyoka::prime;
+    return {prime(x) = sgp4_rhs[0],  prime(y) = sgp4_rhs[1],  prime(z) = sgp4_rhs[2], prime(vx) = sgp4_rhs[3],
+            prime(vy) = sgp4_rhs[4], prime(vz) = sgp4_rhs[5], prime(e) = sgp4_rhs[6], prime(r) = sgp4_rhs[7]};
+}
+
+int main(int argc, char *argv[])
+{
+    set_logger_level_trace();
+
+    namespace po = boost::program_options;
+
+    bool parjit = false;
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("parjit", "parallel JIT compilation");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (vm.count("parjit")) {
+        parjit = true;
+    }
+
+    taylor_adaptive<double> ta{construct_sgp4_ode(), std::vector<double>(8u), kw::high_accuracy = true,
+                               kw::compact_mode = true, kw::parjit = parjit};
+}

From c26be3b9ad90a0e656094c363e27e2933859ebbe Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 10:14:01 +0200
Subject: [PATCH 20/30] Minor tweak to config.hpp.in.

---
 config.hpp.in | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/config.hpp.in b/config.hpp.in
index 19f41f3ca..e5fcff6a1 100644
--- a/config.hpp.in
+++ b/config.hpp.in
@@ -53,13 +53,17 @@
 
 // NOTE: handy Boost library for this since 1.73:
 // https://www.boost.org/doc/libs/1_73_0/libs/predef/doc/index.html
+//
+// NOTE: it makes sense to handle only the GCC/MSVC macros here
+// (on the assumption that clang is identical to GCC in this respect).
+// No point in using macros provided by compilers we do not test on.
 #if defined(_ARCH_PPC) || defined(_M_PPC)
 
 #define HEYOKA_ARCH_PPC
 
 #endif
 
-#if defined(__arm__) || defined(_M_ARM) || defined(__arm) || defined(__aarch64__)
+#if defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT) || defined(__aarch64__) || defined(_M_ARM64)
 
 #define HEYOKA_ARCH_ARM
 

From cbdfc6a6ca96d43823732c73b5a221f53e54e351 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 10:18:58 +0200
Subject: [PATCH 21/30] Remove now-unused definition.

---
 config.hpp.in | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/config.hpp.in b/config.hpp.in
index e5fcff6a1..bd5d80e25 100644
--- a/config.hpp.in
+++ b/config.hpp.in
@@ -75,10 +75,6 @@
 
 #endif
 
-// Maximum number of blocks that can be processed in parallel
-// when computing the Taylor derivatives in parallel mode.
-#define HEYOKA_CM_PAR_MAX_INVOKE_N 20
-
 // Setup of the ABI versioning and tagging
 // machinery.
 

From 535d531e4db6f6f879df613085339cda324c37e7 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 11:03:42 +0200
Subject: [PATCH 22/30] Set default_parjit to false on ARM.

---
 include/heyoka/llvm_state.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp
index 759f624e4..4ee929cbf 100644
--- a/include/heyoka/llvm_state.hpp
+++ b/include/heyoka/llvm_state.hpp
@@ -351,9 +351,9 @@ void llvm_state_mem_cache_try_insert(std::vector<std::string>, unsigned, llvm_mc
 // There is evidence of an LLVM thread scheduling bug when parallel compilation
 // is active, that rarely results in multiply-defined symbols for external C
 // functions, which leads to compilation failure. So far, we have been able to
-// trigger this issue only on Linux aarch64.
+// trigger this issue only on 64-bit ARM.
 inline constexpr bool default_parjit =
-#if defined(HEYOKA_ARCH_ARM) && defined(__linux__)
+#if defined(HEYOKA_ARCH_ARM)
     false
 #else
     true

From 3ec07e771c908e7a0d0039177d0d2c603c46b09c Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 14:04:47 +0200
Subject: [PATCH 23/30] Small tweak to avoid potential unused variable
 warnings.

---
 src/taylor_adaptive.cpp       | 2 +-
 src/taylor_adaptive_batch.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp
index d6513d550..0707e69a9 100644
--- a/src/taylor_adaptive.cpp
+++ b/src/taylor_adaptive.cpp
@@ -65,7 +65,7 @@
 // NOTE: this is a helper macro to reduce typing when accessing the
 // data members of i_data.
 // NOLINTNEXTLINE(bugprone-macro-parentheses)
-#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) auto &name = m_i_data->name
+#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) [[maybe_unused]] auto &name = m_i_data->name
 
 HEYOKA_BEGIN_NAMESPACE
 
diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp
index b29660e53..6bff47ca1 100644
--- a/src/taylor_adaptive_batch.cpp
+++ b/src/taylor_adaptive_batch.cpp
@@ -62,7 +62,7 @@
 // NOTE: this is a helper macro to reduce typing when accessing the
 // data members of i_data.
 // NOLINTNEXTLINE(bugprone-macro-parentheses)
-#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) auto &name = m_i_data->name
+#define HEYOKA_TAYLOR_REF_FROM_I_DATA(name) [[maybe_unused]] auto &name = m_i_data->name
 
 HEYOKA_BEGIN_NAMESPACE
 

From cbdd39ff6c7785441596cd1ffded28323dbad4da Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 14:23:48 +0200
Subject: [PATCH 24/30] Factor out the helper to compute the cost of a
 floating-point scalar operation.

---
 CMakeLists.txt                        |   1 +
 include/heyoka/detail/type_traits.hpp |   3 +
 src/cfunc_class.cpp                   |  48 +-----------
 src/detail/type_traits.cpp            | 108 ++++++++++++++++++++++++++
 4 files changed, 113 insertions(+), 47 deletions(-)
 create mode 100644 src/detail/type_traits.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8211fb370..6cfdd07bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,6 +193,7 @@ set(HEYOKA_SRC_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/type_traits.cpp"
     # NOTE: this will be an empty file in case we are not
     # building with support for real.
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp"
diff --git a/include/heyoka/detail/type_traits.hpp b/include/heyoka/detail/type_traits.hpp
index 04e16c1a2..db1126e25 100644
--- a/include/heyoka/detail/type_traits.hpp
+++ b/include/heyoka/detail/type_traits.hpp
@@ -139,6 +139,9 @@ inline constexpr bool is_x86_fp80 = is_ieee754_binaryN<T, 64>();
 template <typename T>
 inline constexpr bool is_ieee754_binary128 = is_ieee754_binaryN<T, 113>();
 
+template <typename T>
+double get_fp_unit_cost();
+
 } // namespace detail
 
 HEYOKA_END_NAMESPACE
diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp
index 9bcc50b2d..02b33b63e 100644
--- a/src/cfunc_class.cpp
+++ b/src/cfunc_class.cpp
@@ -945,53 +945,7 @@ void cfunc<T>::multi_eval(out_2d outputs, in_2d inputs, std::optional<in_2d> par
     // - the batch size.
 
     // Cost of a scalar fp operation.
-    constexpr auto fp_unit_cost = []() -> double {
-        if constexpr (std::same_as<float, T> || std::same_as<double, T>) {
-            // float and double.
-            return 1;
-        } else if constexpr (std::same_as<long double, T>) {
-            // long double.
-            if constexpr (detail::is_ieee754_binary64<T>) {
-                return 1;
-            } else if constexpr (detail::is_x86_fp80<T>) {
-                return 5;
-            } else if constexpr (detail::is_ieee754_binary128<T>) {
-#if defined(HEYOKA_ARCH_PPC)
-                return 10;
-#else
-                return 100;
-#endif
-            } else {
-#if defined(HEYOKA_ARCH_PPC)
-                // Double-double implementation.
-                return 5;
-#else
-                static_assert(detail::always_false_v<T>, "Unknown fp cost model.");
-#endif
-            }
-        }
-#if defined(HEYOKA_HAVE_REAL128)
-        else if constexpr (std::same_as<mppp::real128, T>) {
-#if defined(HEYOKA_ARCH_PPC)
-            return 10;
-#else
-            return 100;
-#endif
-        }
-#endif
-#if defined(HEYOKA_HAVE_REAL)
-        else if constexpr (std::same_as<mppp::real, T>) {
-            // NOTE: this should be improved to take into account
-            // the selected precision.
-            // NOTE: for reference, mppp::real with 113 bits of precision
-            // is slightly slower than software-implemented quadmath.
-            return 1000;
-        }
-#endif
-        else {
-            static_assert(detail::always_false_v<T>, "Unknown fp cost model.");
-        }
-    }();
+    const auto fp_unit_cost = detail::get_fp_unit_cost<T>();
 
     // Total number of fp operations: number of elementary subexpressions in the
     // decomposition * ncols.
diff --git a/src/detail/type_traits.cpp b/src/detail/type_traits.cpp
new file mode 100644
index 000000000..c2b8f9e8c
--- /dev/null
+++ b/src/detail/type_traits.cpp
@@ -0,0 +1,108 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <heyoka/config.hpp>
+
+#include <concepts>
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+#include <mp++/real128.hpp>
+
+#endif
+
+#if defined(HEYOKA_HAVE_REAL)
+
+#include <mp++/real.hpp>
+
+#endif
+
+#include <heyoka/detail/type_traits.hpp>
+
+HEYOKA_BEGIN_NAMESPACE
+
+namespace detail
+{
+
+// A function to compute a rough estimate of the cost of performing
+// an elementary operation (e.g., addition/multiplication) on a scalar
+// floating-point value of type T.
+//
+// The cost is calibrated to be 1 for single/double precision values,
+// so that the unit of measure for the cost is a (very rough) approximation
+// of clock cycles.
+template <typename T>
+double get_fp_unit_cost()
+{
+    if constexpr (std::same_as<float, T> || std::same_as<double, T>) {
+        // float and double.
+        return 1;
+    } else if constexpr (std::same_as<long double, T>) {
+        // long double.
+        if constexpr (is_ieee754_binary64<T>) {
+            return 1;
+        } else if constexpr (is_x86_fp80<T>) {
+            return 5;
+        } else if constexpr (is_ieee754_binary128<T>) {
+#if defined(HEYOKA_ARCH_PPC)
+            return 10;
+#else
+            return 100;
+#endif
+        } else {
+#if defined(HEYOKA_ARCH_PPC)
+            // Double-double implementation.
+            return 5;
+#else
+            static_assert(always_false_v<T>, "Unknown fp cost model for long double.");
+#endif
+        }
+    }
+#if defined(HEYOKA_HAVE_REAL128)
+    else if constexpr (std::same_as<mppp::real128, T>) {
+#if defined(HEYOKA_ARCH_PPC)
+        return 10;
+#else
+        return 100;
+#endif
+    }
+#endif
+#if defined(HEYOKA_HAVE_REAL)
+    else if constexpr (std::same_as<mppp::real, T>) {
+        // NOTE: this should be improved to take into account
+        // the selected precision.
+        // NOTE: for reference, mppp::real with 113 bits of precision
+        // is slightly slower than software-implemented quadmath.
+        return 1000;
+    }
+#endif
+    else {
+        static_assert(always_false_v<T>, "Unknown fp cost model for an unsupported floating-point type.");
+    }
+}
+
+// Explicit instantiations.
+template double get_fp_unit_cost<float>();
+template double get_fp_unit_cost<double>();
+template double get_fp_unit_cost<long double>();
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+template double get_fp_unit_cost<mppp::real128>();
+
+#endif
+
+#if defined(HEYOKA_HAVE_REAL)
+
+template double get_fp_unit_cost<mppp::real>();
+
+#endif
+
+} // namespace detail
+
+HEYOKA_END_NAMESPACE

From 0cddb720048539086b5882d3bc3615555457581e Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Thu, 29 Aug 2024 14:29:38 +0200
Subject: [PATCH 25/30] Minor.

---
 src/detail/type_traits.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/detail/type_traits.cpp b/src/detail/type_traits.cpp
index c2b8f9e8c..925646567 100644
--- a/src/detail/type_traits.cpp
+++ b/src/detail/type_traits.cpp
@@ -35,7 +35,7 @@ namespace detail
 //
 // The cost is calibrated to be 1 for single/double precision values,
 // so that the unit of measure for the cost is a (very rough) approximation
-// of clock cycles.
+// of a clock cycle.
 template <typename T>
 double get_fp_unit_cost()
 {

From 1e061978ce4582461578f99de7975748447d4621 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 30 Aug 2024 10:53:20 +0200
Subject: [PATCH 26/30] Internal doc additions.

---
 src/cfunc_class.cpp | 6 ++++++
 src/taylor_02.cpp   | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp
index 02b33b63e..c7edd99a2 100644
--- a/src/cfunc_class.cpp
+++ b/src/cfunc_class.cpp
@@ -943,6 +943,12 @@ void cfunc<T>::multi_eval(out_2d outputs, in_2d inputs, std::optional<in_2d> par
     // - the value of ncols,
     // - the floating-point type in use,
     // - the batch size.
+    //
+    // Note that this cost model is very rough and does not take into account,
+    // for instance, that different elementary operations may have very different
+    // costs (e.g., a trig function vs a simple add). Perhaps we can re-evaluate this
+    // in the future and maybe just remove it and parallelise regardless to simplify
+    // the logic.
 
     // Cost of a scalar fp operation.
     const auto fp_unit_cost = detail::get_fp_unit_cost<T>();
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 4c311d726..4aae9d02e 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -953,6 +953,15 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint
 
     // Generate the code for the computation of the Taylor derivatives.
     if (parallel_mode) {
+        // NOTE: in principle here we could implement a cost model to decide at runtime
+        // whether or not it is worth it to run the parallel implementation depending
+        // on the current Taylor order. The cost model for the computation of the Taylor
+        // derivatives is quite simple (as all AD formulae basically boild down to
+        // sums of products), apart from order 0 where we may have operations with
+        // wildly different costs (e.g., a cos() vs a simple addition). We made an attempt
+        // at implementing such a cost model at one point, but there were no benefits
+        // (even a small slowdown) in the large N-body problem used as a test case.
+        // Thus, for now, let us keep things simple.
         taylor_cm_codegen_segment_diff_parallel(s, fp_vec_type, seg_map, n_uvars);
     } else {
         taylor_cm_codegen_segment_diff_sequential(s, fp_vec_type, seg_map, n_uvars);

From a11e32baac8abd344f436d4af5f0f01aaf6336b6 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 30 Aug 2024 11:18:24 +0200
Subject: [PATCH 27/30] Update the known issues page.

---
 doc/known_issues.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/known_issues.rst b/doc/known_issues.rst
index 599f17127..495739430 100644
--- a/doc/known_issues.rst
+++ b/doc/known_issues.rst
@@ -18,6 +18,14 @@ Unsolved
 
   The root cause is most likely a code-generation/optimisation problem in LLVM.
   This issue is currently under investigation.
+* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled
+  by default on 64-bit ARM processors (this includes the Apple M1 and its successors).
+  The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities
+  that very rarely results in a multiply-defined symbol, which ultimately leads to compilation
+  failure. The issue is currently under investigation by the LLVM developers. In the
+  meantime, you can explicitly turn on parallel compilation via the ``kw::parjit``
+  :ref:`keyword argument <kwargs>` when constructing an integrator or a compiled
+  function.
 
 Solved
 ======

From a631bf6b6d4f8be345701fba2049d18a4059c4e4 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 30 Aug 2024 11:32:48 +0200
Subject: [PATCH 28/30] Minor.

---
 src/taylor_02.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 4aae9d02e..5b993c1ed 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -956,7 +956,7 @@ taylor_cm_seg_f_list_t taylor_cm_codegen_segment_diff(const auto &seg, std::uint
         // NOTE: in principle here we could implement a cost model to decide at runtime
         // whether or not it is worth it to run the parallel implementation depending
         // on the current Taylor order. The cost model for the computation of the Taylor
-        // derivatives is quite simple (as all AD formulae basically boild down to
+        // derivatives is quite simple (as all AD formulae basically boil down to
         // sums of products), apart from order 0 where we may have operations with
         // wildly different costs (e.g., a cos() vs a simple addition). We made an attempt
         // at implementing such a cost model at one point, but there were no benefits

From a92e8807a56c6663119cf36ab07993a5326d8ca7 Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Fri, 30 Aug 2024 17:01:07 +0200
Subject: [PATCH 29/30] Update changelog.

---
 doc/changelog.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 98ec5911f..c1af36f72 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -7,6 +7,11 @@ Changelog
 New
 ~~~
 
+- Implement parallel compilation for Taylor integrators
+  and compiled functions
+  (`#446 <https://github.com/bluescarni/heyoka/pull/446>`__,
+  `#444 <https://github.com/bluescarni/heyoka/pull/444>`__,
+  `#441 <https://github.com/bluescarni/heyoka/pull/441>`__).
 - Add the possibility of specifying the LLVM code model
   used for JIT compilation
   (`#440 <https://github.com/bluescarni/heyoka/pull/440>`__).

From 57189c7b0841582faaba253fb57bbe3f684faedb Mon Sep 17 00:00:00 2001
From: Francesco Biscani <bluescarni@gmail.com>
Date: Sat, 31 Aug 2024 09:12:40 +0200
Subject: [PATCH 30/30] Internal doc bit.

---
 src/expression_cfunc.cpp | 7 +++++++
 src/taylor_02.cpp        | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
index 940586166..20b128001 100644
--- a/src/expression_cfunc.cpp
+++ b/src/expression_cfunc.cpp
@@ -1704,6 +1704,13 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state>
     // Limit of codegenned blocks per state.
     // NOTE: this has not been really properly tuned,
     // needs more investigation.
+    // NOTE: it would probably be better here to keep track of the
+    // total number of function calls per segment, rather than
+    // the number of blocks. The reason for this is that each
+    // function call in principle increases the size of the
+    // auxiliary global arrays used by the compact mode
+    // argument generators, which in turn increases the code
+    // generation time.
     constexpr auto max_n_cg_blocks = 20u;
 
     // Variable to keep track of the u variable
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 5b993c1ed..af54aae39 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -1049,6 +1049,13 @@ std::vector<llvm_state> taylor_compute_jet_multi(llvm_state &main_state, llvm::T
     // Limit of codegenned blocks per state.
     // NOTE: this has not been really properly tuned,
     // needs more investigation.
+    // NOTE: it would probably be better here to keep track of the
+    // total number of function calls per segment, rather than
+    // the number of blocks. The reason for this is that each
+    // function call in principle increases the size of the
+    // auxiliary global arrays used by the compact mode
+    // argument generators, which in turn increases the code
+    // generation time.
     constexpr auto max_n_cg_blocks = 20u;
 
     // Variable to keep track of the index of the first u variable