diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86717c759..8211fb370 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,6 +192,7 @@ set(HEYOKA_SRC_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/setup_variational_ics.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp"
     # NOTE: this will be an empty file in case we are not
     # building with support for real.
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp"
diff --git a/include/heyoka/detail/aligned_buffer.hpp b/include/heyoka/detail/aligned_buffer.hpp
new file mode 100644
index 000000000..1b68bc867
--- /dev/null
+++ b/include/heyoka/detail/aligned_buffer.hpp
@@ -0,0 +1,46 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef HEYOKA_DETAIL_ALIGNED_BUFFER_HPP
+#define HEYOKA_DETAIL_ALIGNED_BUFFER_HPP
+
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#include <heyoka/config.hpp>
+
+HEYOKA_BEGIN_NAMESPACE
+
+namespace detail
+{
+
+// Utilities to create and destroy tape arrays for compiled functions
+// and/or Taylor integrators in compact mode. These may have custom alignment requirements due
+// to the use of SIMD instructions, hence we need to use aligned new/delete
+// and a custom deleter for the unique ptr.
+//
+// aligned_buffer_deleter stores the alignment (al) the buffer was allocated with,
+// since it has to be passed back to the aligned operator delete[].
+struct aligned_buffer_deleter {
+    std::align_val_t al{};
+    void operator()(void *ptr) const noexcept;
+};
+
+// Owning pointer to a bytes array allocated with a custom alignment.
+using aligned_buffer_t = std::unique_ptr<std::byte[], aligned_buffer_deleter>;
+
+// Create a buffer of the given size (first argument, in bytes) and alignment (second argument).
+// The alignment is expected to be a power of two. An empty pointer is returned if the size is zero.
+aligned_buffer_t make_aligned_buffer(std::size_t, std::size_t);
+
+} // namespace detail
+
+HEYOKA_END_NAMESPACE
+
+#endif
diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp
index 3c60deaa3..cbc4e0395 100644
--- a/include/heyoka/detail/llvm_helpers.hpp
+++ b/include/heyoka/detail/llvm_helpers.hpp
@@ -67,6 +67,8 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t);
 
 HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *);
 
+HEYOKA_DLL_PUBLIC llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *);
+
 HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *);
 
 HEYOKA_DLL_PUBLIC std::uint64_t get_alignment(llvm::Module &, llvm::Type *);
diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index 2758a14c5..a5c099c12 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -698,8 +698,8 @@ auto cfunc_common_opts(const KwArgs &...kw_args)
 
 template <typename>
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
-make_multi_cfunc(const llvm_state &, const std::string &, const std::vector<expression> &,
-                 const std::vector<expression> &, std::uint32_t, bool, bool, long long);
+make_multi_cfunc(llvm_state, const std::string &, const std::vector<expression> &, const std::vector<expression> &,
+                 std::uint32_t, bool, bool, long long);
 
 } // namespace detail
 
diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp
index 5916cf99d..93882b47c 100644
--- a/src/cfunc_class.cpp
+++ b/src/cfunc_class.cpp
@@ -13,7 +13,6 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
-#include <new>
 #include <optional>
 #include <ostream>
 #include <stdexcept>
@@ -49,6 +48,7 @@
 
 #endif
 
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/type_traits.hpp>
 #include <heyoka/detail/variant_s11n.hpp>
 #include <heyoka/detail/visibility.hpp>
@@ -59,60 +59,6 @@
 
 HEYOKA_BEGIN_NAMESPACE
 
-namespace detail
-{
-
-namespace
-{
-
-// Utilities to create and destroy tape arrays for compiled functions
-// in compact mode. These may have custom alignment requirements due
-// to the use of SIMD instructions, hence we need to use aligned new/delete
-// and a custom deleter for the unique ptr.
-struct aligned_array_deleter {
-    std::align_val_t al{};
-    void operator()(void *ptr) const noexcept
-    {
-        // NOTE: here we are using directly the delete operator (which does not invoke destructors),
-        // rather than a delete expression (which would also invoke destructors). However, because
-        // ptr points to a bytes array, we do not need to explicitly call the destructor here, deallocation will be
-        // sufficient.
-        ::operator delete[](ptr, al);
-    }
-};
-
-using aligned_array_t = std::unique_ptr<std::byte[], aligned_array_deleter>;
-
-aligned_array_t make_aligned_array(std::size_t sz, std::size_t al)
-{
-    assert(al > 0u);
-    assert((al & (al - 1u)) == 0u);
-
-    if (sz == 0u) {
-        return {};
-    } else {
-#if defined(_MSC_VER)
-        // MSVC workaround for this issue:
-        // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320
-
-        // Allocate the raw memory.
-        auto *buf = ::operator new[](sz, std::align_val_t{al});
-
-        // Formally construct the bytes array.
-        auto *ptr = ::new (buf) std::byte[sz];
-
-        // Construct and return the unique ptr.
-        return aligned_array_t{ptr, {.al = std::align_val_t{al}}};
-#else
-        return aligned_array_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}};
-#endif
-    }
-}
-
-} // namespace
-
-} // namespace detail
-
 template <typename T>
 struct cfunc<T>::impl {
     // The compiled function types.
@@ -124,7 +70,7 @@ struct cfunc<T>::impl {
     using c_cfunc_ptr_s_t = void (*)(T *, const T *, const T *, const T *, void *, std::size_t) noexcept;
 
     // Thread-local storage for parallel operations.
-    using ets_item_t = detail::aligned_array_t;
+    using ets_item_t = detail::aligned_buffer_t;
     using ets_t = oneapi::tbb::enumerable_thread_specific<ets_item_t, oneapi::tbb::cache_aligned_allocator<ets_item_t>,
                                                           oneapi::tbb::ets_key_usage_type::ets_key_per_instance>;
 
@@ -135,7 +81,7 @@ struct cfunc<T>::impl {
     std::uint32_t m_batch_size = 0;
     std::vector<expression> m_dc;
     std::vector<std::array<std::size_t, 2>> m_tape_sa;
-    std::vector<detail::aligned_array_t> m_tapes;
+    std::vector<detail::aligned_buffer_t> m_tapes;
     std::variant<cfunc_ptr_t, c_cfunc_ptr_t> m_fptr_scal;
     std::variant<cfunc_ptr_s_t, c_cfunc_ptr_s_t> m_fptr_scal_s;
     std::variant<cfunc_ptr_s_t, c_cfunc_ptr_s_t> m_fptr_batch_s;
@@ -223,7 +169,7 @@ struct cfunc<T>::impl {
         assert(m_tapes.empty());
 
         for (const auto [sz, al] : m_tape_sa) {
-            m_tapes.push_back(detail::make_aligned_array(sz, al));
+            m_tapes.push_back(detail::make_aligned_buffer(sz, al));
         }
     }
 
@@ -260,8 +206,8 @@ struct cfunc<T>::impl {
 
         if (compact_mode) {
             // Build the multi cfunc, and assign the internal members.
-            std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(s, "cfunc", m_fn, m_vars, m_batch_size,
-                                                                              high_accuracy, m_parallel_mode, prec);
+            std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(
+                std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec);
 
             // Compile.
             std::get<1>(m_states).compile();
@@ -845,8 +791,8 @@ void cfunc<T>::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional<in_2d>
         typename impl::ets_t ets_batch([this, batch_size]() {
             // NOTE: the batch-mode tape is at index 1 only if the batch
             // size is > 1, otherwise we are using the scalar tape.
-            return detail::make_aligned_array(m_impl->m_tape_sa[batch_size > 1u][0],
-                                              m_impl->m_tape_sa[batch_size > 1u][1]);
+            return detail::make_aligned_buffer(m_impl->m_tape_sa[batch_size > 1u][0],
+                                               m_impl->m_tape_sa[batch_size > 1u][1]);
         });
 
         oneapi::tbb::parallel_invoke(
@@ -865,7 +811,7 @@ void cfunc<T>::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional<in_2d>
                                               // will block as execution in the parallel region of the cfunc begins. The
                                               // blocked thread could then grab another task from the parallel for loop
                                               // we are currently in, and it would then start writing for a second time
-                                              // into the same tape it already begun writing into, leading to UB.
+                                              // into the same tape it had already begun writing into.
                                               oneapi::tbb::this_task_arena::isolate(
                                                   [&]() { batch_iter.template operator()<true>(range, tape_ptr); });
                                           });
diff --git a/src/detail/aligned_buffer.cpp b/src/detail/aligned_buffer.cpp
new file mode 100644
index 000000000..5e9ba6eb0
--- /dev/null
+++ b/src/detail/aligned_buffer.cpp
@@ -0,0 +1,58 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include <heyoka/config.hpp>
+#include <heyoka/detail/aligned_buffer.hpp>
+
+HEYOKA_BEGIN_NAMESPACE
+
+namespace detail
+{
+
+void aligned_buffer_deleter::operator()(void *ptr) const noexcept
+{
+    // NOTE: we call the deallocation function directly rather than using a delete
+    // expression: a delete expression would also run destructors, whereas the operator
+    // only releases the memory. This is sufficient because ptr points to a bytes array.
+    // The stored alignment al selects the aligned overload of operator delete[].
+    ::operator delete[](ptr, al);
+}
+
+aligned_buffer_t make_aligned_buffer(std::size_t sz, std::size_t al)
+{
+    // The alignment must be a nonzero power of two.
+    assert(al > 0u);
+    assert((al & (al - 1u)) == 0u);
+
+    // A zero-sized request yields an empty pointer, nothing is allocated.
+    if (sz == 0u) {
+        return {};
+    }
+
+    const std::align_val_t alignment{al};
+
+#if defined(_MSC_VER)
+    // MSVC workaround for this issue:
+    // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320
+    //
+    // Allocate the raw memory first, then formally construct the bytes array in it.
+    auto *raw_mem = ::operator new[](sz, alignment);
+    auto *bytes = ::new (raw_mem) std::byte[sz];
+    return aligned_buffer_t{bytes, {.al = alignment}};
+#else
+    return aligned_buffer_t{::new (alignment) std::byte[sz], {.al = alignment}};
+#endif
+}
+
+} // namespace detail
+
+HEYOKA_END_NAMESPACE
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index 010ed98fc..c5a4afc2c 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -3359,6 +3359,83 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t)
     }
 }
 
+// Utility to create an identical copy of the type tp in the context of the state s.
+// NOTE: although it may sound like this is a read-only operation on tp, it is not,
+// since we are potentially poking into the context of tp during operations. Thus, this
+// function cannot be called concurrently from multiple threads on the same tp object,
+// or even on different tp objects defined in the same context.
+// NOTE: this handles only floating-point (vector) types at this time (extending to integral
+// types should be fairly easy). std::invalid_argument is thrown for unsupported types.
+// NOTE: perhaps this function could be made more generic for arbitrary struct types
+// by (recursively) reading the struct layout and then reproducing it in the target
+// context. Like this, we could avoid special casing for the mppp::real types.
+llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp)
+{
+    assert(tp != nullptr);
+
+    // Fetch the target context.
+    auto &ctx = s.context();
+
+    // Construct the scalar type first, then we will convert
+    // to a vector if needed.
+    auto *tp_scal = tp->getScalarType();
+    llvm::Type *ret_scal_t = nullptr;
+
+#define HEYOKA_LLVM_CLONE_TYPE_IMPL(tid)                                                                               \
+    case llvm::Type::tid##TyID:                                                                                        \
+        ret_scal_t = llvm::Type::get##tid##Ty(ctx);                                                                    \
+        break
+
+    // NOTE: gcov seems to get a bit confused by the macro usage.
+    // LCOV_EXCL_START
+    switch (tp_scal->getTypeID()) {
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(Float);
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(Double);
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(X86_FP80);
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(FP128);
+        default: {
+
+#if defined(HEYOKA_HAVE_REAL)
+
+            if (const auto prec = llvm_is_real(tp_scal); prec != 0) {
+                // tp_scal is the internal counterpart of mppp::real.
+                ret_scal_t = to_internal_llvm_type<mppp::real>(s, prec);
+                break;
+            } else if (tp_scal == to_external_llvm_type<mppp::real>(tp_scal->getContext())) {
+                // tp_scal is mppp::real.
+                ret_scal_t = to_external_llvm_type<mppp::real>(ctx);
+                break;
+            }
+
+#endif
+
+            throw std::invalid_argument(
+                fmt::format("Cannot clone the LLVM type '{}' to another context", llvm_type_name(tp)));
+        }
+    }
+
+#undef HEYOKA_LLVM_CLONE_TYPE_IMPL
+    // LCOV_EXCL_STOP
+
+    assert(ret_scal_t != nullptr);
+
+    if (tp->isVectorTy()) {
+        // tp is a vector type: rebuild it from the cloned scalar type, fixed-size vectors only.
+        if (const auto *vtp = llvm::dyn_cast<llvm_vector_type>(tp)) [[likely]] {
+            return make_vector_type(ret_scal_t, boost::numeric_cast<std::uint32_t>(vtp->getNumElements()));
+        } else {
+            // LCOV_EXCL_START
+            throw std::invalid_argument(fmt::format("Cannot clone the LLVM type '{}' to another context - the type is "
+                                                    "a vector type whose size is not fixed",
+                                                    llvm_type_name(tp)));
+            // LCOV_EXCL_STOP
+        }
+    } else {
+        // tp is a scalar type.
+        return ret_scal_t;
+    }
+}
+
 } // namespace detail
 
 HEYOKA_END_NAMESPACE
diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp
index 00a8c5aa0..5eeb93538 100644
--- a/src/expression_cfunc.cpp
+++ b/src/expression_cfunc.cpp
@@ -1568,14 +1568,15 @@ namespace
 // implemented in distinct llvm_state objects.
 //
 // states is the current list of states (to which more will be added by this function), and the last state
-// in the list is the "main" state. s_dc is the segmented decomposition of the function to be compiled.
+// in the list is the "main" state. main_fp_t is the internal scalar floating-point type as defined in the main state.
+// s_dc is the segmented decomposition of the function to be compiled.
 // base_name is the name of the main function from which the drivers are to be invoked. main_eval_arr,
 // main_par_ptr, main_time_ptr and main_stride are, respectively, the pointer to the evaluation tape,
 // the pointer to the parameter values, the pointer to time coordinate(s) and the stride - these are all
 // defined in the main state and they are passed to the driver functions invocations.
-template <typename T, typename SDC>
-void multi_cfunc_evaluate_segments(std::list<llvm_state> &states, const SDC &s_dc, std::uint32_t nvars,
-                                   std::uint32_t batch_size, bool high_accuracy, long long prec,
+template <typename SDC>
+void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list<llvm_state> &states, const SDC &s_dc,
+                                   std::uint32_t nvars, std::uint32_t batch_size, bool high_accuracy,
                                    const std::string &base_name, llvm::Value *main_eval_arr, llvm::Value *main_par_ptr,
                                    llvm::Value *main_time_ptr, llvm::Value *main_stride)
 {
@@ -1608,6 +1609,11 @@ void multi_cfunc_evaluate_segments(std::list<llvm_state> &states, const SDC &s_d
                    llvm_func_name_compare>;
 
     // Push back a new state and use it as initial current state.
+    // NOTE: like this, we always end up creating at least one driver
+    // function and a state, even in the degenerate case of an empty decomposition,
+    // which is suboptimal performance-wise.
+    // I do not think however that it is worth it to complicate the code to avoid
+    // this corner-case pessimisation.
     states.push_back(main_state.make_similar());
     auto *cur_state = &states.back();
 
@@ -1729,7 +1735,7 @@ void multi_cfunc_evaluate_segments(std::list<llvm_state> &states, const SDC &s_d
         }
 
         // Fetch the internal fp type and its vector counterpart for the current state.
-        auto *fp_t = to_internal_llvm_type<T>(*cur_state, prec);
+        auto *fp_t = llvm_clone_type(*cur_state, main_fp_t);
         auto *fp_vec_type = make_vector_type(fp_t, batch_size);
 
         // Fetch the current builder.
@@ -1955,17 +1961,13 @@ void multi_cfunc_evaluate_segments(std::list<llvm_state> &states, const SDC &s_d
     // LCOV_EXCL_STOP
 }
 
-// NOTE: here we are forced to use a templated function, rather than passing in the
-// LLVM type fp_t as usual, because we need to re-create the type for every context
-// in every state, and there seems not to be an easy way to transfer/copy a type
-// from one context to the other.
-template <typename T>
-std::array<std::size_t, 2>
-add_multi_cfunc_impl(std::list<llvm_state> &states, llvm::Value *out_ptr, llvm::Value *in_ptr, llvm::Value *par_ptr,
-                     llvm::Value *time_ptr, llvm::Value *stride, const std::vector<expression> &dc, std::uint32_t nvars,
-                     // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-                     std::uint32_t nuvars, std::uint32_t batch_size, bool high_accuracy, long long prec,
-                     const std::string &base_name, llvm::Value *eval_arr)
+std::array<std::size_t, 2> add_multi_cfunc_impl(llvm::Type *fp_t, std::list<llvm_state> &states, llvm::Value *out_ptr,
+                                                llvm::Value *in_ptr, llvm::Value *par_ptr, llvm::Value *time_ptr,
+                                                llvm::Value *stride, const std::vector<expression> &dc,
+                                                std::uint32_t nvars,
+                                                // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+                                                std::uint32_t nuvars, std::uint32_t batch_size, bool high_accuracy,
+                                                const std::string &base_name, llvm::Value *eval_arr)
 {
     // Fetch the main state, module, etc.
     auto &main_state = states.back();
@@ -1973,7 +1975,10 @@ add_multi_cfunc_impl(std::list<llvm_state> &states, llvm::Value *out_ptr, llvm::
     auto &main_builder = main_state.builder();
 
     // Fetch the fp types for the main state.
-    auto *main_fp_t = to_internal_llvm_type<T>(main_state, prec);
+    // NOTE: cloning is safe here, as even though this function is being invoked
+    // in parallel from multiple threads, we have made sure that each invocation
+    // gets its own cloned copy of fp_t.
+    auto *main_fp_t = llvm_clone_type(main_state, fp_t);
     auto *main_ext_fp_t = make_external_llvm_type(main_fp_t);
     auto *fp_vec_type = make_vector_type(main_fp_t, batch_size);
 
@@ -1991,7 +1996,7 @@ add_multi_cfunc_impl(std::list<llvm_state> &states, llvm::Value *out_ptr, llvm::
     const auto al = boost::numeric_cast<std::size_t>(get_alignment(main_md, fp_vec_type));
 
     // NOTE: eval_arr is used as temporary storage for the current function,
-    // but it provided externally from dynamically-allocated memory in order to avoid stack overflow.
+    // but it is provided externally from dynamically-allocated memory in order to avoid stack overflow.
     // This creates a situation in which LLVM cannot elide stores into eval_arr
     // (even if it figures out a way to avoid storing intermediate results into
     // eval_arr) because LLVM must assume that some other function may
@@ -2015,8 +2020,8 @@ add_multi_cfunc_impl(std::list<llvm_state> &states, llvm::Value *out_ptr, llvm::
     });
 
     // Generate the code for the evaluation of all segments.
-    multi_cfunc_evaluate_segments<T>(states, s_dc, nvars, batch_size, high_accuracy, prec, base_name, eval_arr, par_ptr,
-                                     time_ptr, stride);
+    multi_cfunc_evaluate_segments(main_fp_t, states, s_dc, nvars, batch_size, high_accuracy, base_name, eval_arr,
+                                  par_ptr, time_ptr, stride);
 
     // Write the results to the output pointer.
     cfunc_c_write_outputs(main_state, main_fp_t, out_ptr, cout_gl, eval_arr, par_ptr, stride, batch_size);
@@ -2027,32 +2032,10 @@ add_multi_cfunc_impl(std::list<llvm_state> &states, llvm::Value *out_ptr, llvm::
     return {sz, al};
 }
 
-} // namespace
-
-// This function will compile several versions of the input function fn, with input variables vars, in compact mode.
-//
-// The compiled functions are implemented across several llvm_states which are collated together and returned as
-// a single llvm_multi_state (this is the first element of the return tuple). If batch_size is 1,
-// then 2 compiled functions are created - a scalar strided and a scalar unstrided version.
-// If batch size is > 1, then an additional batch-mode strided compiled function is returned.
-// The function names are created using "name" as base name and then mangling in the strided/unstrided
-// property and the batch size.
-//
-// The second element of the return tuple is the decomposition of fn.
-//
-// The third element of the return tuple is a vector of pairs, each pair containing the size and alignment requirements
-// for the externally-provided storage for the evaluation tape. If batch_size is 1, then only a single
-// pair is returned, representing the size/alignment requirements for the scalar-mode evaluation tape.
-// If batch_size > 1, then an additional pair is appended representing the size/alignment requirements
-// for the batch-mode evaluation tape.
-//
-// NOTE: there is a bunch of boilerplate logic overlap here with add_cfunc_impl(). Make sure to
-// coordinate changes between the two functions.
-template <typename T>
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
-make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vector<expression> &fn,
-                 const std::vector<expression> &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode,
-                 long long prec)
+make_multi_cfunc_impl(llvm::Type *fp_t, const llvm_state &tplt, const std::string &name,
+                      const std::vector<expression> &fn, const std::vector<expression> &vars, std::uint32_t batch_size,
+                      bool high_accuracy, bool parallel_mode)
 {
     if (batch_size == 0u) [[unlikely]] {
         throw std::invalid_argument("The batch size of a compiled function cannot be zero");
@@ -2062,27 +2045,6 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec
         throw std::invalid_argument("Parallel mode has not been implemented yet");
     }
 
-#if defined(HEYOKA_ARCH_PPC)
-    if constexpr (std::is_same_v<T, long double>) {
-        throw not_implemented_error("'long double' computations are not supported on PowerPC");
-    }
-#endif
-
-#if defined(HEYOKA_HAVE_REAL)
-
-    if constexpr (std::is_same_v<T, mppp::real>) {
-        const auto sprec = boost::numeric_cast<mpfr_prec_t>(prec);
-
-        if (sprec < mppp::real_prec_min() || sprec > mppp::real_prec_max()) [[unlikely]] {
-            throw std::invalid_argument(
-                fmt::format("An invalid precision value of {} was passed to make_multi_cfunc() (the "
-                            "value must be in the [{}, {}] range)",
-                            sprec, mppp::real_prec_min(), mppp::real_prec_max()));
-        }
-    }
-
-#endif
-
     if (name.empty()) [[unlikely]] {
         throw std::invalid_argument("A non-empty function name is required when invoking make_multi_cfunc()");
     }
@@ -2126,9 +2088,26 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec
         tape_size_align.resize(2);
     }
 
-    // Helper to create a cfunc.
+    // NOTE: this is ugly, but needed. Cloning an LLVM type into another
+    // context is not a thread-safe operation as we might be poking into
+    // the context of the original type. Thus, we first make 2 or 3 clones
+    // of fp_t each associated to a different llvm_state without any multithreading,
+    // and then we use these clones for further cloning while parallel invoking
+    // create_cfunc().
+    std::vector<std::pair<llvm_state, llvm::Type *>> fp_t_clones;
+    fp_t_clones.reserve(3);
+    for (auto i = 0; i < (batch_size == 1u ? 2 : 3); ++i) {
+        // Create a new state and clone fp_t into it.
+        auto new_state = tplt.make_similar();
+        auto *new_fp_t = llvm_clone_type(new_state, fp_t);
+
+        fp_t_clones.emplace_back(std::move(new_state), new_fp_t);
+    }
+
+    // Helper to create a single cfunc.
     auto create_cfunc = [&states_lists, &tape_size_align, &tplt, &name, &dc = std::as_const(dc), nvars, nuvars,
-                         high_accuracy, prec](bool strided, std::uint32_t cur_batch_size) {
+                         high_accuracy,
+                         &fp_t_clones = std::as_const(fp_t_clones)](bool strided, std::uint32_t cur_batch_size) {
         // NOTE: the batch unstrided variant is not supposed to be requested.
         assert(strided || cur_batch_size == 1u);
 
@@ -2145,6 +2124,9 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec
 
         assert(states.empty());
 
+        // Fetch the local cloned fp_t.
+        auto *loc_fp_t = fp_t_clones[sidx].second;
+
         // Add a new state and fetch it.
         states.push_back(tplt.make_similar());
         auto &s = states.back();
@@ -2228,8 +2210,8 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec
         builder.SetInsertPoint(bb);
 
         // Create the body of the function.
-        const auto tape_sa = add_multi_cfunc_impl<T>(states, out_ptr, in_ptr, par_ptr, time_ptr, stride, dc, nvars,
-                                                     nuvars, cur_batch_size, high_accuracy, prec, cur_name, tape_ptr);
+        const auto tape_sa = add_multi_cfunc_impl(loc_fp_t, states, out_ptr, in_ptr, par_ptr, time_ptr, stride, dc,
+                                                  nvars, nuvars, cur_batch_size, high_accuracy, cur_name, tape_ptr);
 
         // Add the size/alignment requirements for the tape storage.
         // NOTE: there's no difference in requirements between strided and
@@ -2258,7 +2240,8 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec
     //   into a thread-safe tbb vector.
     //
     // At the moment though it looks like the practical gains from such further parallelisation
-    // would not be worth it, perhaps we can reconsider in the future.
+    // would not be worth it, perhaps we can reconsider in the future. It is also not clear how
+    // to deal with thread-unsafe type cloning in this hypothetical scenario.
     if (batch_size == 1u) {
         oneapi::tbb::parallel_invoke([&create_cfunc]() { create_cfunc(false, 1); },
                                      [&create_cfunc]() { create_cfunc(true, 1); });
@@ -2285,11 +2268,66 @@ make_multi_cfunc(const llvm_state &tplt, const std::string &name, const std::vec
         std::move(dc), std::move(tape_size_align));
 }
 
+} // namespace
+
+// This function will compile several versions of the input function fn, with input variables vars, in compact mode.
+//
+// The compiled functions are implemented across several llvm_states which are collated together and returned as
+// a single llvm_multi_state (this is the first element of the return tuple). If batch_size is 1,
+// then 2 compiled functions are created - a scalar strided and a scalar unstrided version.
+// If batch size is > 1, then an additional batch-mode strided compiled function is returned.
+// The function names are created using "name" as base name and then mangling in the strided/unstrided
+// property and the batch size.
+//
+// The second element of the return tuple is the decomposition of fn.
+//
+// The third element of the return tuple is a vector of pairs, each pair containing the size and alignment requirements
+// for the externally-provided storage for the evaluation tape. If batch_size is 1, then only a single
+// pair is returned, representing the size/alignment requirements for the scalar-mode evaluation tape.
+// If batch_size > 1, then an additional pair is appended representing the size/alignment requirements
+// for the batch-mode evaluation tape.
+//
+// NOTE: there is a bunch of boilerplate logic overlap here with add_cfunc_impl(). Make sure to
+// coordinate changes between the two functions.
+template <typename T>
+std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
+make_multi_cfunc(llvm_state tplt, const std::string &name, const std::vector<expression> &fn,
+                 const std::vector<expression> &vars, std::uint32_t batch_size, bool high_accuracy, bool parallel_mode,
+                 long long prec)
+{
+#if defined(HEYOKA_ARCH_PPC)
+    if constexpr (std::is_same_v<T, long double>) {
+        throw not_implemented_error("'long double' computations are not supported on PowerPC");
+    }
+#endif
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    if constexpr (std::is_same_v<T, mppp::real>) {
+        const auto sprec = boost::numeric_cast<mpfr_prec_t>(prec);
+
+        if (sprec < mppp::real_prec_min() || sprec > mppp::real_prec_max()) [[unlikely]] {
+            throw std::invalid_argument(
+                fmt::format("An invalid precision value of {} was passed to make_multi_cfunc() (the "
+                            "value must be in the [{}, {}] range)",
+                            sprec, mppp::real_prec_min(), mppp::real_prec_max()));
+        }
+    }
+
+#endif
+
+    // Fetch the internal scalar fp type from the template state, which we own since tplt is
+    // taken by value. This type is then cloned into the states created by the implementation.
+    auto *fp_t = to_internal_llvm_type<T>(tplt, prec);
+
+    return make_multi_cfunc_impl(fp_t, tplt, name, fn, vars, batch_size, high_accuracy, parallel_mode);
+}
+
 // Explicit instantiations.
 #define HEYOKA_MAKE_MULTI_CFUNC_INST(T)                                                                                \
     template HEYOKA_DLL_PUBLIC                                                                                         \
         std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>                 \
-        make_multi_cfunc<T>(const llvm_state &, const std::string &, const std::vector<expression> &,                  \
+        make_multi_cfunc<T>(llvm_state, const std::string &, const std::vector<expression> &,                          \
                             const std::vector<expression> &, std::uint32_t, bool, bool, long long);
 
 HEYOKA_MAKE_MULTI_CFUNC_INST(float)
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 9bfdd5238..130cdda90 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -671,7 +671,6 @@ auto taylor_build_function_maps(llvm_state &s, llvm::Type *fp_t, const std::vect
 
 // Helper for the computation of a jet of derivatives in compact mode,
 // used in taylor_compute_jet().
-// NOTE: order0, par_ptr and time_ptr are external pointers.
 std::pair<llvm::Value *, llvm::Type *> taylor_compute_jet_compact_mode(
     // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
     llvm_state &s, llvm::Type *fp_type, llvm::Value *order0, llvm::Value *par_ptr, llvm::Value *time_ptr,
diff --git a/src/taylor_adaptive.cpp b/src/taylor_adaptive.cpp
index 38cf97974..3a8bb1bab 100644
--- a/src/taylor_adaptive.cpp
+++ b/src/taylor_adaptive.cpp
@@ -437,8 +437,7 @@ void taylor_adaptive<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> state,
 
     // Add the function for the computation of
     // the dense output.
-    detail::taylor_add_d_out_function(m_llvm, detail::internal_llvm_type_like(m_llvm, m_state[0]), m_dim, m_order, 1,
-                                      high_accuracy);
+    detail::taylor_add_d_out_function(m_llvm, fp_t, m_dim, m_order, 1, high_accuracy);
 
     detail::get_logger()->trace("Taylor dense output runtime: {}", sw);
     sw.reset();
diff --git a/src/taylor_adaptive_batch.cpp b/src/taylor_adaptive_batch.cpp
index a8e0f6046..e97b4d0df 100644
--- a/src/taylor_adaptive_batch.cpp
+++ b/src/taylor_adaptive_batch.cpp
@@ -284,8 +284,7 @@ void taylor_adaptive_batch<T>::finalise_ctor_impl(sys_t vsys, std::vector<T> sta
 
     // Add the function for the computation of
     // the dense output.
-    detail::taylor_add_d_out_function(m_llvm, detail::to_external_llvm_type<T>(m_llvm.context()), m_dim, m_order,
-                                      m_batch_size, high_accuracy);
+    detail::taylor_add_d_out_function(m_llvm, ext_fp_t, m_dim, m_order, m_batch_size, high_accuracy);
 
     detail::get_logger()->trace("Taylor batch dense output runtime: {}", sw);
     sw.reset();
diff --git a/test/llvm_helpers.cpp b/test/llvm_helpers.cpp
index a09f40f1a..ec13bd327 100644
--- a/test/llvm_helpers.cpp
+++ b/test/llvm_helpers.cpp
@@ -3030,3 +3030,36 @@ TEST_CASE("switch")
 
 #endif
 }
+
+TEST_CASE("clone type")
+{
+    using detail::llvm_clone_type;
+
+    auto tester = []<typename fp_t>(fp_t) {
+        llvm_state source, dest;
+        // Scalar type: the clone must match the type built directly in the destination context.
+        auto *tp_source = detail::to_external_llvm_type<fp_t>(source.context());
+        auto *tp_dest = llvm_clone_type(dest, tp_source);
+        REQUIRE(tp_dest == detail::to_external_llvm_type<fp_t>(dest.context()));
+        // Vector type: the clone must be a vector of the cloned scalar type with the same size.
+        auto *vec_tp_source = detail::make_vector_type(tp_source, 4);
+        auto *vec_tp_dest = llvm_clone_type(dest, vec_tp_source);
+        REQUIRE(vec_tp_dest == detail::make_vector_type(tp_dest, 4));
+    };
+
+    tuple_for_each(fp_types, tester);
+
+#if defined(HEYOKA_HAVE_REAL)
+
+    llvm_state source, dest;
+    // External mppp::real type.
+    auto *tp_ext_source = detail::to_external_llvm_type<mppp::real>(source.context());
+    auto *tp_ext_dest = llvm_clone_type(dest, tp_ext_source);
+    REQUIRE(tp_ext_dest == detail::to_external_llvm_type<mppp::real>(dest.context()));
+    // Internal counterpart of mppp::real, built with a precision of 11.
+    auto *tp_int_source = detail::to_internal_llvm_type<mppp::real>(source, 11);
+    auto *tp_int_dest = llvm_clone_type(dest, tp_int_source);
+    REQUIRE(tp_int_dest == detail::to_internal_llvm_type<mppp::real>(dest, 11));
+
+#endif
+}