Skip to content

Commit

Permalink
Merge pull request #445 from bluescarni/pr/backports
Browse files Browse the repository at this point in the history
Tweaks to parallel cfunc compilation
  • Loading branch information
bluescarni authored Aug 26, 2024
2 parents 1178154 + bc7bf46 commit e704228
Show file tree
Hide file tree
Showing 12 changed files with 333 additions and 142 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ set(HEYOKA_SRC_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/setup_variational_ics.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp"
# NOTE: this will be an empty file in case we are not
# building with support for real.
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp"
Expand Down
39 changes: 39 additions & 0 deletions include/heyoka/detail/aligned_buffer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef HEYOKA_DETAIL_ALIGNED_BUFFER_HPP
#define HEYOKA_DETAIL_ALIGNED_BUFFER_HPP

#include <cstddef>
#include <memory>

#include <heyoka/config.hpp>

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

// Utilities to create and destroy tape arrays for compiled functions
// and/or Taylor integrators in compact mode. These may have custom alignment requirements due
// to the use of SIMD instructions, hence we need to use aligned new/delete
// and a custom deleter for the unique ptr.
//
// Deleter type used by aligned_buffer_t: it stores the alignment of the
// allocation so that the same value can be supplied again on deallocation.
struct aligned_buffer_deleter {
// Alignment associated with the buffer (value-initialised by default).
std::align_val_t al{};
// Release the memory pointed to by ptr. Never throws; defined out of line.
void operator()(void *ptr) const noexcept;
};

// Owning handle to an array of bytes which is released through aligned_buffer_deleter.
using aligned_buffer_t = std::unique_ptr<std::byte[], aligned_buffer_deleter>;

// Create a buffer whose size in bytes is the first argument and whose alignment
// is the second argument (which must be a nonzero power of two).
// A size of zero yields an empty handle.
aligned_buffer_t make_aligned_buffer(std::size_t, std::size_t);

} // namespace detail

HEYOKA_END_NAMESPACE

#endif
2 changes: 2 additions & 0 deletions include/heyoka/detail/llvm_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ HEYOKA_DLL_PUBLIC llvm::Type *make_vector_type(llvm::Type *, std::uint32_t);

HEYOKA_DLL_PUBLIC std::string llvm_mangle_type(llvm::Type *);

HEYOKA_DLL_PUBLIC llvm::Type *llvm_clone_type(llvm_state &, llvm::Type *);

HEYOKA_DLL_PUBLIC std::uint32_t get_vector_size(llvm::Value *);

HEYOKA_DLL_PUBLIC std::uint64_t get_alignment(llvm::Module &, llvm::Type *);
Expand Down
4 changes: 2 additions & 2 deletions include/heyoka/expression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,8 +698,8 @@ auto cfunc_common_opts(const KwArgs &...kw_args)

template <typename>
std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
make_multi_cfunc(const llvm_state &, const std::string &, const std::vector<expression> &,
const std::vector<expression> &, std::uint32_t, bool, bool, long long);
make_multi_cfunc(llvm_state, const std::string &, const std::vector<expression> &, const std::vector<expression> &,
std::uint32_t, bool, bool, long long);

} // namespace detail

Expand Down
72 changes: 9 additions & 63 deletions src/cfunc_class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include <cstddef>
#include <cstdint>
#include <memory>
#include <new>
#include <optional>
#include <ostream>
#include <stdexcept>
Expand Down Expand Up @@ -49,6 +48,7 @@

#endif

#include <heyoka/detail/aligned_buffer.hpp>
#include <heyoka/detail/type_traits.hpp>
#include <heyoka/detail/variant_s11n.hpp>
#include <heyoka/detail/visibility.hpp>
Expand All @@ -59,60 +59,6 @@

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

namespace
{

// Helpers for allocating and releasing the tape arrays of compiled functions
// in compact mode. The use of SIMD instructions may impose alignment
// requirements stricter than the default ones, thus the memory is obtained
// via the aligned forms of new/delete and the unique ptr needs a deleter
// which remembers the alignment.
struct aligned_array_deleter {
    std::align_val_t al{};
    void operator()(void *ptr) const noexcept
    {
        // NOTE: the deallocation function is called directly instead of going
        // through a delete expression. A delete expression would also run
        // destructors, but ptr refers to an array of bytes, for which there is
        // nothing to destroy: releasing the storage is all that is required.
        ::operator delete[](ptr, al);
    }
};

using aligned_array_t = std::unique_ptr<std::byte[], aligned_array_deleter>;

// Allocate an array of sz bytes aligned to al (a nonzero power of two).
// If sz is zero no allocation takes place and an empty pointer is returned.
aligned_array_t make_aligned_array(std::size_t sz, std::size_t al)
{
    assert(al > 0u);
    assert((al & (al - 1u)) == 0u);

    // Nothing to allocate.
    if (sz == 0u) {
        return {};
    }

    const std::align_val_t alignment{al};

#if defined(_MSC_VER)
    // MSVC workaround for this issue:
    // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320

    // Step 1: grab the raw aligned storage.
    void *raw = ::operator new[](sz, alignment);

    // Step 2: formally create the bytes array inside the storage.
    std::byte *bytes = ::new (raw) std::byte[sz];
#else
    std::byte *bytes = ::new (alignment) std::byte[sz];
#endif

    // Hand over the ownership to the unique ptr, recording the alignment
    // in the deleter.
    return aligned_array_t{bytes, aligned_array_deleter{alignment}};
}

} // namespace

} // namespace detail

template <typename T>
struct cfunc<T>::impl {
// The compiled function types.
Expand All @@ -124,7 +70,7 @@ struct cfunc<T>::impl {
using c_cfunc_ptr_s_t = void (*)(T *, const T *, const T *, const T *, void *, std::size_t) noexcept;

// Thread-local storage for parallel operations.
using ets_item_t = detail::aligned_array_t;
using ets_item_t = detail::aligned_buffer_t;
using ets_t = oneapi::tbb::enumerable_thread_specific<ets_item_t, oneapi::tbb::cache_aligned_allocator<ets_item_t>,
oneapi::tbb::ets_key_usage_type::ets_key_per_instance>;

Expand All @@ -135,7 +81,7 @@ struct cfunc<T>::impl {
std::uint32_t m_batch_size = 0;
std::vector<expression> m_dc;
std::vector<std::array<std::size_t, 2>> m_tape_sa;
std::vector<detail::aligned_array_t> m_tapes;
std::vector<detail::aligned_buffer_t> m_tapes;
std::variant<cfunc_ptr_t, c_cfunc_ptr_t> m_fptr_scal;
std::variant<cfunc_ptr_s_t, c_cfunc_ptr_s_t> m_fptr_scal_s;
std::variant<cfunc_ptr_s_t, c_cfunc_ptr_s_t> m_fptr_batch_s;
Expand Down Expand Up @@ -223,7 +169,7 @@ struct cfunc<T>::impl {
assert(m_tapes.empty());

for (const auto [sz, al] : m_tape_sa) {
m_tapes.push_back(detail::make_aligned_array(sz, al));
m_tapes.push_back(detail::make_aligned_buffer(sz, al));
}
}

Expand Down Expand Up @@ -260,8 +206,8 @@ struct cfunc<T>::impl {

if (compact_mode) {
// Build the multi cfunc, and assign the internal members.
std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(s, "cfunc", m_fn, m_vars, m_batch_size,
high_accuracy, m_parallel_mode, prec);
std::tie(m_states, m_dc, m_tape_sa) = detail::make_multi_cfunc<T>(
std::move(s), "cfunc", m_fn, m_vars, m_batch_size, high_accuracy, m_parallel_mode, prec);

// Compile.
std::get<1>(m_states).compile();
Expand Down Expand Up @@ -845,8 +791,8 @@ void cfunc<T>::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional<in_2d>
typename impl::ets_t ets_batch([this, batch_size]() {
// NOTE: the batch-mode tape is at index 1 only if the batch
// size is > 1, otherwise we are using the scalar tape.
return detail::make_aligned_array(m_impl->m_tape_sa[batch_size > 1u][0],
m_impl->m_tape_sa[batch_size > 1u][1]);
return detail::make_aligned_buffer(m_impl->m_tape_sa[batch_size > 1u][0],
m_impl->m_tape_sa[batch_size > 1u][1]);
});

oneapi::tbb::parallel_invoke(
Expand All @@ -865,7 +811,7 @@ void cfunc<T>::multi_eval_mt(out_2d outputs, in_2d inputs, std::optional<in_2d>
// will block as execution in the parallel region of the cfunc begins. The
// blocked thread could then grab another task from the parallel for loop
// we are currently in, and it would then start writing for a second time
// into the same tape it already begun writing into, leading to UB.
// into the same tape it already begun writing into.
oneapi::tbb::this_task_arena::isolate(
[&]() { batch_iter.template operator()<true>(range, tape_ptr); });
});
Expand Down
58 changes: 58 additions & 0 deletions src/detail/aligned_buffer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <cassert>
#include <cstddef>
#include <new>

#include <heyoka/config.hpp>
#include <heyoka/detail/aligned_buffer.hpp>

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

void aligned_buffer_deleter::operator()(void *ptr) const noexcept
{
    // NOTE: the deallocation function is invoked directly, rather than via a
    // delete expression. A delete expression would run destructors too, but
    // since ptr refers to an array of bytes there is nothing to destroy and
    // releasing the storage (with the recorded alignment) is enough.
    ::operator delete[](ptr, this->al);
}

// Allocate a buffer of sz bytes aligned to al (a nonzero power of two).
// If sz is zero no allocation takes place and an empty buffer is returned.
aligned_buffer_t make_aligned_buffer(std::size_t sz, std::size_t al)
{
    assert(al > 0u);
    assert((al & (al - 1u)) == 0u);

    // Nothing to allocate.
    if (sz == 0u) {
        return {};
    }

    const std::align_val_t alignment{al};

#if defined(_MSC_VER)
    // MSVC workaround for this issue:
    // https://developercommunity.visualstudio.com/t/using-c17-new-stdalign-val-tn-syntax-results-in-er/528320

    // Step 1: grab the raw aligned storage.
    void *raw = ::operator new[](sz, alignment);

    // Step 2: formally create the bytes array inside the storage.
    std::byte *bytes = ::new (raw) std::byte[sz];
#else
    std::byte *bytes = ::new (alignment) std::byte[sz];
#endif

    // Hand over the ownership to the unique ptr, recording the alignment
    // in the deleter.
    return aligned_buffer_t{bytes, aligned_buffer_deleter{alignment}};
}

} // namespace detail

HEYOKA_END_NAMESPACE
77 changes: 77 additions & 0 deletions src/detail/llvm_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3359,6 +3359,83 @@ llvm::Value *llvm_ui_to_fp(llvm_state &s, llvm::Value *n, llvm::Type *fp_t)
}
}

// Reproduce the type tp inside the context of the state s, returning
// the equivalent type defined in that context.
// NOTE: this may look like a read-only operation on tp, but it is not: the
// context of tp may be poked into while the clone is being built. Hence, this
// function must not be invoked concurrently from multiple threads on the same
// tp object, nor on distinct tp objects which live in the same context.
// NOTE: only floating-point (vector) types are supported for the time being,
// adding support for integral types should be fairly easy.
// NOTE: this function could perhaps be generalised to arbitrary struct types
// by (recursively) reading the struct layout and reproducing it in the target
// context. This would remove the need for the mppp::real special casing.
llvm::Type *llvm_clone_type(llvm_state &s, llvm::Type *tp)
{
    assert(tp != nullptr);

    // The context in which the clone will be created.
    auto &target_ctx = s.context();

    // The scalar type is cloned first. If tp is a vector type,
    // the vector clone is assembled from the scalar clone afterwards.
    auto *src_scal_t = tp->getScalarType();
    llvm::Type *cloned_scal_t = nullptr;

#define HEYOKA_LLVM_CLONE_TYPE_IMPL(tid)                                                                               \
    case llvm::Type::tid##TyID:                                                                                        \
        cloned_scal_t = llvm::Type::get##tid##Ty(target_ctx);                                                          \
        break

    // NOTE: gcov seems to get a bit confused by the macro usage.
    // LCOV_EXCL_START
    switch (src_scal_t->getTypeID()) {
        HEYOKA_LLVM_CLONE_TYPE_IMPL(Float);
        HEYOKA_LLVM_CLONE_TYPE_IMPL(Double);
        HEYOKA_LLVM_CLONE_TYPE_IMPL(X86_FP80);
        HEYOKA_LLVM_CLONE_TYPE_IMPL(FP128);
        default: {

#if defined(HEYOKA_HAVE_REAL)

            // Is src_scal_t the internal counterpart of mppp::real?
            if (const auto prec = llvm_is_real(src_scal_t); prec != 0) {
                cloned_scal_t = to_internal_llvm_type<mppp::real>(s, prec);
                break;
            }

            // Is src_scal_t mppp::real itself?
            if (src_scal_t == to_external_llvm_type<mppp::real>(src_scal_t->getContext())) {
                cloned_scal_t = to_external_llvm_type<mppp::real>(target_ctx);
                break;
            }

#endif

            // None of the supported scalar types matched.
            throw std::invalid_argument(
                fmt::format("Cannot clone the LLVM type '{}' to another context", llvm_type_name(tp)));
        }
    }

#undef HEYOKA_LLVM_CLONE_TYPE_IMPL
    // LCOV_EXCL_STOP

    assert(cloned_scal_t != nullptr);

    // Scalar case: the scalar clone is the result.
    if (!tp->isVectorTy()) {
        return cloned_scal_t;
    }

    // Vector case: only fixed-size vectors can be reproduced.
    const auto *vtp = llvm::dyn_cast<llvm_vector_type>(tp);

    if (vtp == nullptr) [[unlikely]] {
        // LCOV_EXCL_START
        throw std::invalid_argument(fmt::format("Cannot clone the LLVM type '{}' to another context - the type is "
                                                "a vector type whose size is not fixed",
                                                llvm_type_name(tp)));
        // LCOV_EXCL_STOP
    }

    return make_vector_type(cloned_scal_t, boost::numeric_cast<std::uint32_t>(vtp->getNumElements()));
}

} // namespace detail

HEYOKA_END_NAMESPACE
Expand Down
Loading

0 comments on commit e704228

Please sign in to comment.