Merge pull request #445 from bluescarni/pr/backports

Tweaks to parallel cfunc compilation
bluescarni · Aug 26, 2024 · e704228 · e704228
2 parents 1178154 + bc7bf46
commit e704228
Show file tree

Hide file tree

Showing 12 changed files with 333 additions and 142 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -192,6 +192,7 @@ set(HEYOKA_SRC_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/setup_variational_ics.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp"
     # NOTE: this will be an empty file in case we are not
     # building with support for real.
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp"

diff --git a/include/heyoka/detail/aligned_buffer.hpp b/include/heyoka/detail/aligned_buffer.hpp
@@ -0,0 +1,39 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef HEYOKA_DETAIL_ALIGNED_BUFFER_HPP
+#define HEYOKA_DETAIL_ALIGNED_BUFFER_HPP
+
+#include <cstddef>
+#include <memory>
+
+#include <heyoka/config.hpp>
+
+HEYOKA_BEGIN_NAMESPACE
+
+namespace detail
+{
+
+// Utilities to create and destroy tape arrays for compiled functions and/or Taylor integrators in compact mode.
+// These may have custom alignment requirements due to the use of SIMD instructions, hence we need to use aligned
+// new/delete and a custom deleter for the unique ptr.
+// NOTE(review): std::align_val_t is declared in <new>, which is not included directly here - consider adding it.
+struct aligned_buffer_deleter {
+    std::align_val_t al{}; // Alignment handed to the aligned operator delete[] when the buffer is released.
+    void operator()(void *ptr) const noexcept; // Deallocates ptr; no destructors are invoked.
+};
+
+using aligned_buffer_t = std::unique_ptr<std::byte[], aligned_buffer_deleter>;
+
+aligned_buffer_t make_aligned_buffer(std::size_t, std::size_t);
+
+} // namespace detail
+
+HEYOKA_END_NAMESPACE
+
+#endif
diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp
@@ -67,6 +67,8 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t);
 
 HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *);
 
+HEYOKA_DLL_PUBLIC llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *);
+
 HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *);
 
 HEYOKA_DLL_PUBLIC std::uint64_t get_alignment(llvm::Module &, llvm::Type *);

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
@@ -698,8 +698,8 @@ auto cfunc_common_opts(const KwArgs &...kw_args)
 
 template <typename>
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
-make_multi_cfunc(const llvm_state &, const std::string &, const std::vector<expression> &,
-                 const std::vector<expression> &, std::uint32_t, bool, bool, long long);
+make_multi_cfunc(llvm_state, const std::string &, const std::vector<expression> &, const std::vector<expression> &,
+                 std::uint32_t, bool, bool, long long);
 
 } // namespace detail
 

diff --git a/src/cfunc_class.cpp b/src/cfunc_class.cpp
@@ -13,7 +13,6 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
-#include <new>
 #include <optional>
 #include <ostream>
 #include <stdexcept>
@@ -49,6 +48,7 @@
 
 #endif
 
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/type_traits.hpp>
 #include <heyoka/detail/variant_s11n.hpp>
 #include <heyoka/detail/visibility.hpp>
@@ -59,60 +59,6 @@
 
 HEYOKA_BEGIN_NAMESPACE
 
-namespace detail
-{
-
-namespace
-{
-
-// Utilities to create and destroy tape arrays for compiled functions
-// in compact mode. These may have custom alignment requirements due
-// to the use of SIMD instructions, hence we need to use aligned new/delete
-// and a custom deleter for the unique ptr.
-struct aligned_array_deleter {
-    std::align_val_t al{};
-    void operator()(void *ptr) const noexcept
-    {
-        // NOTE: here we are using directly the delete operator (which does not invoke destructors),
-        // rather than a delete expression (which would also invoke destructors). However, because
-        // ptr points to a bytes array, we do not need to explicitly call the destructor here, deallocation will be
-        // sufficient.
-        ::operator delete[](ptr, al);
-    }
-};
-
-using aligned_array_t = std::unique_ptr<std::byte[], aligned_array_deleter>;
-
-aligned_array_t make_aligned_array(std::size_t sz, std::size_t al)
-{
-    assert(al > 0u);
-    assert((al & (al - 1u)) == 0u);
-
-    if (sz == 0u) {
-        return {};
-    } else {
-#if defined(_MSC_VER)
-        // MSVC workaround for this issue:
-        // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320
-
-        // Allocate the raw memory.
-        auto *buf = ::operator new[](sz, std::align_val_t{al});
-
-        // Formally construct the bytes array.
-        auto *ptr = ::new (buf) std::byte[sz];
-
-        // Construct and return the unique ptr.
-        return aligned_array_t{ptr, {.al = std::align_val_t{al}}};
-#else
-        return aligned_array_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}};
-#endif
-    }
-}
-
-} // namespace
-
-} // namespace detail
-
 template <typename T>
 struct cfunc<T>::impl {
     // The compiled function types.
@@ -124,7 +70,7 @@ struct cfunc<T>::impl {
     using c_cfunc_ptr_s_t = void (*)(T *, const T *, const T *, const T *, void *, std::size_t) noexcept;
 
     // Thread-local storage for parallel operations.
-    using ets_item_t = detail::aligned_array_t;
+    using ets_item_t = detail::aligned_buffer_t;
     using ets_t = oneapi::tbb::enumerable_thread_specific<ets_item_t, oneapi::tbb::cache_aligned_allocator<ets_item_t>,
                                                           oneapi::tbb::ets_key_usage_type::ets_key_per_instance>;
 
@@ -135,7 +81,7 @@ struct cfunc<T>::impl {
     std::uint32_t m_batch_size = 0;
     std::vector<expression> m_dc;
     std::vector<std::array<std::size_t, 2>> m_tape_sa;
-    std::vector<detail::aligned_array_t> m_tapes;
+    std::vector<detail::aligned_buffer_t> m_tapes;
     std::variant<cfunc_ptr_t, c_cfunc_ptr_t> m_fptr_scal;
     std::variant<cfunc_ptr_s_t, c_cfunc_ptr_s_t> m_fptr_scal_s;
     std::variant<cfunc_ptr_s_t, c_cfunc_ptr_s_t> m_fptr_batch_s;
@@ -223,7 +169,7 @@ struct cfunc<T>::impl {
         assert(m_tapes.empty());
 
         for (const auto [sz, al] : m_tape_sa) {
-            m_tapes.push_back(detail::make_aligned_array(sz, al));
+            m_tapes.push_back(detail::make_aligned_buffer(sz, al));
         }
     }
 
@@ -260,8 +206,8 @@ struct cfunc<T>::impl {
 
         if (compact_mode) {
             // Build the multi cfunc, and assign the internal members.
-            std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(s, "cfunc", m_fn, m_vars, m_batch_size,
-                                                                              high_accuracy, m_parallel_mode, prec);
+            std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(
+                std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec);
 
             // Compile.
             std::get<1>(m_states).compile();
@@ -845,8 +791,8 @@ void cfunc<T>::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional<in_2d>
         typename impl::ets_t ets_batch([this, batch_size]() {
             // NOTE: the batch-mode tape is at index 1 only if the batch
             // size is > 1, otherwise we are using the scalar tape.
-            return detail::make_aligned_array(m_impl->m_tape_sa[batch_size > 1u][0],
-                                              m_impl->m_tape_sa[batch_size > 1u][1]);
+            return detail::make_aligned_buffer(m_impl->m_tape_sa[batch_size > 1u][0],
+                                               m_impl->m_tape_sa[batch_size > 1u][1]);
         });
 
         oneapi::tbb::parallel_invoke(
@@ -865,7 +811,7 @@ void cfunc<T>::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional<in_2d>
                                               // will block as execution in the parallel region of the cfunc begins. The
                                               // blocked thread could then grab another task from the parallel for loop
                                               // we are currently in, and it would then start writing for a second time
-                                              // into the same tape it already begun writing into, leading to UB.
+                                              // into the same tape it had already begun writing into.
                                               oneapi::tbb::this_task_arena::isolate(
                                                   [&]() { batch_iter.template operator()<true>(range, tape_ptr); });
                                           });

diff --git a/src/detail/aligned_buffer.cpp b/src/detail/aligned_buffer.cpp
@@ -0,0 +1,58 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include <heyoka/config.hpp>
+#include <heyoka/detail/aligned_buffer.hpp>
+
+HEYOKA_BEGIN_NAMESPACE
+
+namespace detail
+{
+
+void aligned_buffer_deleter::operator()(void *ptr) const noexcept
+{
+    // NOTE: here we are calling the deallocation function operator delete[] directly (which does not invoke
+    // destructors), rather than using a delete expression (which would also invoke destructors). However,
+    // because ptr points to a bytes array, we do not need to explicitly call any destructor here, deallocation
+    // will be sufficient.
+    ::operator delete[](ptr, al); // For make_aligned_buffer() results, al is the alignment used at allocation.
+}
+
+aligned_buffer_t make_aligned_buffer(std::size_t sz, std::size_t al)
+{
+    assert(al > 0u); // The alignment must be nonzero...
+    assert((al & (al - 1u)) == 0u); // ...and a power of two.
+
+    if (sz == 0u) {
+        return {}; // Nothing to allocate: return an empty (null) buffer.
+    } else {
+#if defined(_MSC_VER)
+        // MSVC workaround for this issue:
+        // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320
+
+        // Allocate the raw memory with the requested alignment.
+        auto *buf = ::operator new[](sz, std::align_val_t{al});
+
+        // Formally construct the bytes array in the raw memory via placement new.
+        auto *ptr = ::new (buf) std::byte[sz];
+
+        // Construct and return the unique ptr, recording the alignment in the deleter.
+        return aligned_buffer_t{ptr, {.al = std::align_val_t{al}}};
+#else
+        return aligned_buffer_t{::new (std::align_val_t{al}) std::byte[sz], {.al = std::align_val_t{al}}};
+#endif
+    }
+}
+
+} // namespace detail
+
+HEYOKA_END_NAMESPACE
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
@@ -3359,6 +3359,83 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t)
     }
 }
 
+// Utility to create an identical copy of the type tp in the context of the state s.
+// An std::invalid_argument exception is thrown if tp cannot be cloned.
+// NOTE: although it may sound like this is a read-only operation on tp, it is not, since we are potentially
+// poking into the context of tp during operations. Thus, this function cannot be called concurrently from
+// multiple threads on the same tp object, or even on different tp objects defined in the same context.
+// NOTE: this handles only floating-point (vector) types at this time, extending
+// to integral types should be fairly easy.
+// NOTE: perhaps this function could be made more generic for arbitrary struct types
+// by (recursively) reading the struct layout and then reproducing it in the target
+// context. Like this, we could avoid special casing for the mppp::real types.
+llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp)
+{
+    assert(tp != nullptr);
+
+    // Fetch the target context.
+    auto &ctx = s.context();
+
+    // Construct the scalar type first, then we will convert
+    // to a vector if needed.
+    auto *tp_scal = tp->getScalarType();
+    llvm::Type *ret_scal_t = nullptr;
+
+#define HEYOKA_LLVM_CLONE_TYPE_IMPL(tid)                                                                               \
+    case llvm::Type::tid##TyID:                                                                                        \
+        ret_scal_t = llvm::Type::get##tid##Ty(ctx);                                                                    \
+        break
+
+    // NOTE: gcov seems to get a bit confused by the macro usage.
+    // LCOV_EXCL_START
+    switch (tp_scal->getTypeID()) {
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(Float);
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(Double);
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(X86_FP80);
+        HEYOKA_LLVM_CLONE_TYPE_IMPL(FP128);
+        default: {
+
+#if defined(HEYOKA_HAVE_REAL)
+
+            if (const auto prec = llvm_is_real(tp_scal); prec != 0) {
+                // tp_scal is the internal counterpart of mppp::real.
+                ret_scal_t = to_internal_llvm_type<mppp::real>(s, prec);
+                break; // Leave the switch, skipping the throw below.
+            } else if (tp_scal == to_external_llvm_type<mppp::real>(tp_scal->getContext())) {
+                // tp_scal is mppp::real.
+                ret_scal_t = to_external_llvm_type<mppp::real>(ctx);
+                break; // Leave the switch, skipping the throw below.
+            }
+
+#endif
+
+            throw std::invalid_argument(
+                fmt::format("Cannot clone the LLVM type '{}' to another context", llvm_type_name(tp)));
+        }
+    }
+
+#undef HEYOKA_LLVM_CLONE_TYPE_IMPL
+    // LCOV_EXCL_STOP
+
+    assert(ret_scal_t != nullptr); // Every path that did not throw has assigned ret_scal_t.
+
+    if (tp->isVectorTy()) {
+        // tp is a vector type.
+        if (const auto *vtp = llvm::dyn_cast<llvm_vector_type>(tp)) [[likely]] {
+            return make_vector_type(ret_scal_t, boost::numeric_cast<std::uint32_t>(vtp->getNumElements()));
+        } else {
+            // LCOV_EXCL_START
+            throw std::invalid_argument(fmt::format("Cannot clone the LLVM type '{}' to another context - the type is "
+                                                    "a vector type whose size is not fixed",
+                                                    llvm_type_name(tp)));
+            // LCOV_EXCL_STOP
+        }
+    } else {
+        // tp is a scalar type.
+        return ret_scal_t;
+    }
+}
+
 } // namespace detail
 
 HEYOKA_END_NAMESPACE