diff --git a/.github/workflows/gha_ci.yml b/.github/workflows/gha_ci.yml index ee5e0f980..3ea2d68ca 100644 --- a/.github/workflows/gha_ci.yml +++ b/.github/workflows/gha_ci.yml @@ -45,6 +45,7 @@ jobs: cmake ../ -G "Visual Studio 17 2022" -A x64 -DHEYOKA_BUILD_TESTS=yes -DHEYOKA_WITH_MPPP=yes -DHEYOKA_BUILD_TUTORIALS=ON -DHEYOKA_ENABLE_IPO=yes -DHEYOKA_WITH_SLEEF=yes cmake --build . --config Release -j2 copy Release\heyoka.dll test\Release\ + ctest -j4 -V -C Release conda_release_static: runs-on: ubuntu-latest steps: diff --git a/doc/known_issues.rst b/doc/known_issues.rst index 495739430..7972f0689 100644 --- a/doc/known_issues.rst +++ b/doc/known_issues.rst @@ -18,14 +18,18 @@ Unsolved The root cause is most likely a code-generation/optimisation problem in LLVM. This issue is currently under investigation. -* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled - by default on 64-bit ARM processors (this includes the Apple M1 and its successors). +* The parallel compilation feature (added in heyoka 6.0.0) is currently turned + off by default on all platforms and completely disabled on Windows. The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities - that very rarely results in a multiply-defined symbol, which ultimately leads to compilation - failure. The issue is currently under investigation by the LLVM developers. In the - meantime, you can explicitly turn on parallel compilation via the ``kw::parjit`` + which, on Unix systems, rarely results in a multiply-defined symbol, ultimately leading to a compilation + failure. On Windows, parallel compilation under heavy loads results in segmentation faults. + The issue is currently under investigation by the LLVM developers. In the + meantime, you can explicitly turn on parallel compilation on Unix systems via the ``kw::parjit`` :ref:`keyword argument ` when constructing an integrator or a compiled function. 
+* The option for selecting the code model used for JIT compilation + (added in heyoka 6.0.0) is currently disabled on Windows due to what + looks like an LLVM bug. The issue is currently under investigation. Solved ====== diff --git a/include/heyoka/llvm_state.hpp b/include/heyoka/llvm_state.hpp index 4ee929cbf..a6fb77d52 100644 --- a/include/heyoka/llvm_state.hpp +++ b/include/heyoka/llvm_state.hpp @@ -348,17 +348,18 @@ std::optional llvm_state_mem_cache_lookup(const std::vector, unsigned, llvm_mc_value); // The default setting for the parjit flag for llvm_multi_state. -// There is evidence of an LLVM thread scheduling bug when parallel compilation -// is active, that rarely results in multiply-defined symbols for external C -// functions, which leads to compilation failure. So far, we have been able to -// trigger this issue only on 64-bit arm. -inline constexpr bool default_parjit = -#if defined(HEYOKA_ARCH_ARM) - false -#else - true -#endif - ; +// +// At this time, it seems like parallel compilation in lljit is buggy: +// +// - on Unix platforms, parallel compilation occasionally results in +// multiply-defined symbols for external C functions, which leads to +// compilation failures; +// - on Windows, it seems like parallel compilation outright results in +// segmentation faults under heavy load. +// +// The root of the problem seems to be a concurrency issue. Thus, for the time +// being, let us just disable parallel compilation by default. 
+inline constexpr bool default_parjit = false; } // namespace detail diff --git a/src/expression_cfunc.cpp b/src/expression_cfunc.cpp index 20b128001..e1875233d 100644 --- a/src/expression_cfunc.cpp +++ b/src/expression_cfunc.cpp @@ -1697,21 +1697,14 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list cur_state->builder().SetInsertPoint( llvm::BasicBlock::Create(cur_state->context(), "entry", make_driver_proto(*cur_state, cur_state_idx))); - // Variable to keep track of how many blocks have been codegenned - // in the current state. - boost::safe_numerics::safe n_cg_blocks = 0; + // Variable to keep track of how many evaluation functions have + // been invoked in the current state. + boost::safe_numerics::safe n_evalf = 0; - // Limit of codegenned blocks per state. + // Limit of function evaluations per state. // NOTE: this has not been really properly tuned, // needs more investigation. - // NOTE: it would probably be better here to keep track of the - // total number of function calls per segment, rather than - // the number of blocks. The reason for this is that each - // function call in principle increases the size of the - // auxiliary global arrays used by the compact mode - // argument generators, which in turn increases the code - // generation time. - constexpr auto max_n_cg_blocks = 20u; + constexpr auto max_n_evalf = 100u; // Variable to keep track of the u variable // on whose definition we are operating. @@ -1719,7 +1712,7 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list // Iterate over the segments in s_dc. for (const auto &seg : s_dc) { - if (n_cg_blocks > max_n_cg_blocks) { + if (n_evalf > max_n_evalf) { // We have codegenned enough blocks for this state. Create the return // value for the current driver, and move to the next one. 
cur_state->builder().CreateRetVoid(); @@ -1729,7 +1722,7 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list cur_state = &states.back(); // Reset/update the counters. - n_cg_blocks = 0; + n_evalf = 0; ++cur_state_idx; // Add the driver declaration to the main state, and invoke it. @@ -1898,6 +1891,9 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list assert(std::ranges::all_of(gens, [](const auto &f) { return static_cast(f); })); // LCOV_EXCL_STOP + // Update the number of invoked evaluation functions. + n_evalf += ncalls; + // We will be manually unrolling loops if ncalls is small enough. // This seems to help with compilation times. constexpr auto max_unroll_n = 5u; @@ -1942,9 +1938,6 @@ void multi_cfunc_evaluate_segments(llvm::Type *main_fp_t, std::list } } - // Update the number of codegenned blocks. - n_cg_blocks += seg_map.size(); - // LCOV_EXCL_START // Update segment_bd if needed. if (is_tracing) { diff --git a/src/llvm_state.cpp b/src/llvm_state.cpp index da7908e14..af5fc6d6d 100644 --- a/src/llvm_state.cpp +++ b/src/llvm_state.cpp @@ -309,17 +309,17 @@ llvm::orc::JITTargetMachineBuilder create_jit_tmb(unsigned opt_level, code_model // LCOV_EXCL_START -#if LLVM_VERSION_MAJOR >= 17 - // NOTE: the code model setup is working only on LLVM>=19 (or at least // LLVM 18 + patches, as in the conda-forge LLVM package), due to this bug: // // https://github.com/llvm/llvm-project/issues/88115 // // Additionally, there are indications from our CI that attempting to set - // the code model before LLVM 17 might just be buggy, as we see widespread + // the code model before LLVM 17 or on Windows might just be buggy, as we see widespread // ASAN failures all over the place. Thus, let us not do anything with the code - // model setting before LLVM 17. + // model setting before LLVM 17 or on Windows. + +#if LLVM_VERSION_MAJOR >= 17 && !defined(_WIN32) // Setup the code model. 
switch (c_model) { @@ -1667,11 +1667,17 @@ multi_jit::multi_jit(unsigned n_modules, unsigned opt_level, code_model c_model, #else + // NOTE: never enable parallel compilation on Windows due to + // segfaults under heavy load. +#if !defined(_WIN32) + if (m_parjit) { // Set the number of compilation threads. lljit_builder.setNumCompileThreads(std::thread::hardware_concurrency()); } +#endif + #endif // Create the jit. diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp index a4ff3e61d..b5d4354d0 100644 --- a/src/taylor_02.cpp +++ b/src/taylor_02.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1042,28 +1043,25 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T cur_state->builder().SetInsertPoint(llvm::BasicBlock::Create( cur_state->context(), "entry", taylor_cm_make_driver_proto(*cur_state, cur_state_idx))); - // Variable to keep track of how many blocks have been codegenned - // in the current state. - boost::safe_numerics::safe n_cg_blocks = 0; + // Variable to keep track of how many evaluation functions have + // been invoked in the current state. + boost::safe_numerics::safe n_evalf = 0; - // Limit of codegenned blocks per state. + // Limit of function evaluations per state. // NOTE: this has not been really properly tuned, - // needs more investigation. - // NOTE: it would probably be better here to keep track of the - // total number of function calls per segment, rather than - // the number of blocks. The reason for this is that each - // function call in principle increases the size of the - // auxiliary global arrays used by the compact mode - // argument generators, which in turn increases the code - // generation time. - constexpr auto max_n_cg_blocks = 20u; + // needs more investigation. 
In any case, this should + // be smaller than the corresponding limit in cfunc + // because here we typically do more work per function + // evaluation (as each function evaluation implements + // an AD formula). + constexpr auto max_n_evalf = 20u; // Variable to keep track of the index of the first u variable // in a segment. auto start_u_idx = n_eq; // Helper to finalise the current driver function and create a new one. - auto start_new_driver = [&cur_state, &states, &main_state, &n_cg_blocks, &cur_state_idx, &main_driver_decls]() { + auto start_new_driver = [&cur_state, &states, &main_state, &n_evalf, &cur_state_idx, &main_driver_decls]() { // Finalise the current driver. cur_state->builder().CreateRetVoid(); @@ -1072,7 +1070,7 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T cur_state = &states.back(); // Reset/update the counters. - n_cg_blocks = 0; + n_evalf = 0; ++cur_state_idx; // Add the driver declaration to the main state. @@ -1100,7 +1098,7 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T // of the sv funcs. const auto is_svf_seg = need_svf_lo && max_svf_idx >= start_u_idx && max_svf_idx < (start_u_idx + seg_n_ex); - if (n_cg_blocks > max_n_cg_blocks || is_svf_seg) { + if (n_evalf > max_n_evalf || is_svf_seg) { // Either we have codegenned enough blocks for this state, or we are // in the max_svf_idx state. Finalise the current driver and start the new one. start_new_driver(); @@ -1119,8 +1117,9 @@ std::vector taylor_compute_jet_multi(llvm_state &main_state, llvm::T const auto seg_map = taylor_cm_codegen_segment_diff(seg, start_u_idx, *cur_state, fp_t, batch_size, n_uvars, high_accuracy, parallel_mode); - // Update the number of codegenned blocks. - n_cg_blocks += seg_map.size(); + // Update the number of invoked evaluation functions. + n_evalf = std::accumulate(seg_map.begin(), seg_map.end(), n_evalf, + [](auto a, const auto &p) { return a + p.second.first; }); // Update start_u_idx. 
start_u_idx += seg_n_ex; diff --git a/test/llvm_helpers.cpp b/test/llvm_helpers.cpp index ec13bd327..b96833ac9 100644 --- a/test/llvm_helpers.cpp +++ b/test/llvm_helpers.cpp @@ -20,7 +20,6 @@ #include #include -#include #include @@ -1671,16 +1670,18 @@ TEST_CASE("eft_product scalar") REQUIRE(x == a * b); +#if defined(HEYOKA_HAVE_REAL) #if defined(HEYOKA_HAVE_REAL128) if constexpr (!std::is_same_v) { #endif - namespace bmp = boost::multiprecision; - using mp_fp_t - = bmp::number::digits * 2, bmp::digit_base_2>>; - REQUIRE(mp_fp_t(x) + mp_fp_t(y) == mp_fp_t(a) * mp_fp_t(b)); + using mp_fp_t = mppp::real; + const auto prec = std::numeric_limits::digits * 2; + + REQUIRE(mp_fp_t(x, prec) + mp_fp_t(y, prec) == mp_fp_t(a, prec) * mp_fp_t(b, prec)); #if defined(HEYOKA_HAVE_REAL128) } +#endif #endif } } @@ -1759,16 +1760,17 @@ TEST_CASE("eft_product batch") REQUIRE(xv == a * b); +#if defined(HEYOKA_HAVE_REAL) #if defined(HEYOKA_HAVE_REAL128) if constexpr (!std::is_same_v) { #endif - namespace bmp = boost::multiprecision; - using mp_fp_t = bmp::number< - bmp::cpp_bin_float::digits * 2, bmp::digit_base_2>>; + using mp_fp_t = mppp::real; + const auto prec = std::numeric_limits::digits * 2; - REQUIRE(mp_fp_t(xv) + mp_fp_t(yv) == mp_fp_t(a) * mp_fp_t(b)); + REQUIRE(mp_fp_t(xv, prec) + mp_fp_t(yv, prec) == mp_fp_t(a, prec) * mp_fp_t(b, prec)); #if defined(HEYOKA_HAVE_REAL128) } +#endif #endif } } @@ -2526,12 +2528,12 @@ TEST_CASE("dl modulus scalar") auto f_ptr = reinterpret_cast(s.jit_lookup("hey_dl_modulus")); +#if defined(HEYOKA_HAVE_REAL) #if defined(HEYOKA_HAVE_REAL128) if constexpr (!std::is_same_v) { #endif - namespace bmp = boost::multiprecision; - using mp_fp_t - = bmp::number::digits * 2, bmp::digit_base_2>>; + using mp_fp_t = mppp::real; + const auto prec = std::numeric_limits::digits * 2; std::uniform_real_distribution op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.)); @@ -2542,13 +2544,14 @@ TEST_CASE("dl modulus scalar") f_ptr(&res_hi, &res_lo, x, 0, y, 
0); - auto res_mp = mp_fp_t(x) - mp_fp_t(y) * floor(mp_fp_t(x) / mp_fp_t(y)); + auto res_mp = mp_fp_t(x, prec) - mp_fp_t(y, prec) * floor(mp_fp_t(x, prec) / mp_fp_t(y, prec)); REQUIRE(res_hi == approximately(static_cast(res_mp), fp_t(10))); } #if defined(HEYOKA_HAVE_REAL128) } +#endif #endif } }; @@ -2608,12 +2611,12 @@ TEST_CASE("dl modulus batch") auto f_ptr = reinterpret_cast( s.jit_lookup("hey_dl_modulus")); +#if defined(HEYOKA_HAVE_REAL) #if defined(HEYOKA_HAVE_REAL128) if constexpr (!std::is_same_v) { #endif - namespace bmp = boost::multiprecision; - using mp_fp_t - = bmp::number::digits * 2, bmp::digit_base_2>>; + using mp_fp_t = mppp::real; + const auto prec = std::numeric_limits::digits * 2; std::uniform_real_distribution op_dist(fp_t(-1e6), fp_t(1e6)), quo_dist(fp_t(.1), fp_t(10.)); @@ -2634,8 +2637,9 @@ TEST_CASE("dl modulus batch") b_lo_vec.data()); for (auto i = 0u; i < batch_size; ++i) { - auto res_mp = mp_fp_t(a_hi_vec[i]) - - mp_fp_t(b_hi_vec[i]) * floor(mp_fp_t(a_hi_vec[i]) / mp_fp_t(b_hi_vec[i])); + auto res_mp = mp_fp_t(a_hi_vec[i], prec) + - mp_fp_t(b_hi_vec[i], prec) + * floor(mp_fp_t(a_hi_vec[i], prec) / mp_fp_t(b_hi_vec[i], prec)); REQUIRE(x_vec[i] == approximately(static_cast(res_mp), fp_t(10))); } @@ -2643,6 +2647,7 @@ TEST_CASE("dl modulus batch") #if defined(HEYOKA_HAVE_REAL128) } +#endif #endif } }