diff --git a/CMakeLists.txt b/CMakeLists.txt index dd2d1420..08f75be2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ endif() option(BUILD_TESTS "Build test" OFF) option(BUILD_DEMO "Build demo programs" OFF) option(BUILD_EXAMPLE "Build example programs" OFF) +option(BUILD_WITH_TBB "Build with TBB support" ON) +option(BUILD_WITH_OPENMP "Build with OpenMP support" ON) option(BUILD_WITH_CUDA "Build with GPU support" OFF) option(BUILD_WITH_CUDA_MULTIARCH "Build with CUDA multi-architecture support" OFF) option(BUILD_WITH_MARCH_NATIVE "Build with -march=native" OFF) @@ -19,6 +21,7 @@ option(ENABLE_CPPCHECK "Enable cppcheck" OFF) option(ENABLE_COVERAGE "Enable coverage check" OFF) if(BUILD_WITH_MARCH_NATIVE) + set(GTSAM_POINTS_WITH_MARCH_NATIVE 1) add_compile_options(-march=native) set(CMAKE_C_FLAGS "-march=native ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "-march=native ${CMAKE_CXX_FLAGS}") @@ -26,9 +29,18 @@ endif() find_package(Boost REQUIRED COMPONENTS filesystem) find_package(GTSAM REQUIRED) -find_package(OpenMP REQUIRED) find_package(Eigen3 REQUIRED) +if(BUILD_WITH_TBB) + find_package(TBB REQUIRED) + set(GTSAM_POINTS_USE_TBB 1) +endif() + +if(BUILD_WITH_OPENMP) + find_package(OpenMP REQUIRED) + set(GTSAM_POINTS_USE_OPENMP 1) +endif() + if(${BUILD_WITH_CUDA} AND ${EIGEN3_VERSION_STRING} AND ${EIGEN3_VERSION_STRING} VERSION_LESS "3.3.90") message(WARNING "Detected Eigen ${EIGEN3_VERSION_STRING} is not compatible with CUDA") message(WARNING "Use Eigen 3.3.90 or later (3.4.0 is recommended)") @@ -36,6 +48,7 @@ endif() # GPU-related if(BUILD_WITH_CUDA) + set(GTSAM_POINTS_USE_CUDA 1) add_definitions(-DBUILD_GTSAM_POINTS_GPU) find_package(CUDAToolkit REQUIRED) @@ -105,6 +118,8 @@ if(ENABLE_COVERAGE) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endif() +configure_file(include/gtsam_points/config.hpp.in include/gtsam_points/config.hpp) + ########### ## Build ## ########### @@ -115,6 +130,7 @@ add_library(gtsam_points SHARED src/gtsam_points/util/normal_estimation.cpp src/gtsam_points/util/bspline.cpp src/gtsam_points/util/continuous_trajectory.cpp + src/gtsam_points/util/parallelism.cpp # ann src/gtsam_points/ann/kdtree.cpp src/gtsam_points/ann/intensity_kdtree.cpp @@ -158,6 +174,7 @@ add_library(gtsam_points SHARED target_include_directories(gtsam_points PUBLIC $ $ + $ $ ) target_link_libraries(gtsam_points @@ -165,7 +182,8 @@ target_link_libraries(gtsam_points Boost::filesystem Eigen3::Eigen GTSAM::GTSAM - OpenMP::OpenMP_CXX + $ + $ ) # GPU-related @@ -206,11 +224,12 @@ if(BUILD_WITH_CUDA) $ ) target_link_libraries(gtsam_points_cuda + CUDA::cudart Boost::boost Eigen3::Eigen GTSAM::GTSAM - OpenMP::OpenMP_CXX - CUDA::cudart + $ + $ ) target_link_libraries(gtsam_points @@ -274,7 +293,10 @@ endif() ############# include(GNUInstallDirs) -install(DIRECTORY include/ thirdparty/nanoflann/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install( + DIRECTORY include/ thirdparty/nanoflann/include/ ${CMAKE_CURRENT_BINARY_DIR}/include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) list(APPEND GTSAM_POINTS_LIBRARIES gtsam_points) if(BUILD_WITH_CUDA) diff --git a/cmake/gtsam_points-config.cmake.in b/cmake/gtsam_points-config.cmake.in index bb9fdc54..0586b36f 100644 --- a/cmake/gtsam_points-config.cmake.in +++ b/cmake/gtsam_points-config.cmake.in @@ -9,7 +9,9 @@ include_guard() -set(BUILD_WITH_CUDA @BUILD_WITH_CUDA@) +set(GTSAM_POINTS_USE_TBB @GTSAM_POINTS_USE_TBB@) +set(GTSAM_POINTS_USE_OPENMP @GTSAM_POINTS_USE_OPENMP@) +set(GTSAM_POINTS_USE_CUDA @GTSAM_POINTS_USE_CUDA@) get_filename_component(gtsam_points_CURRENT_CONFIG_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${gtsam_points_CURRENT_CONFIG_DIR}") diff --git a/include/gtsam_points/ann/kdtree2.hpp b/include/gtsam_points/ann/kdtree2.hpp index a513514a..83a18797 100644 --- a/include/gtsam_points/ann/kdtree2.hpp +++ b/include/gtsam_points/ann/kdtree2.hpp @@ -7,9 +7,11 @@ #include #include +#include #include #include #include +#include namespace gtsam_points { @@ -24,7 +26,16 @@ struct KdTree2 : public NearestNeighborSearch { KdTree2(const std::shared_ptr& frame, int build_num_threads = 1) : frame(frame), search_eps(-1.0), - index(new Index(*this->frame, KdTreeBuilderOMP(build_num_threads))) { + index( + is_omp_default() || build_num_threads == 1 ? // + new Index(*this->frame, KdTreeBuilderOMP(build_num_threads)) // + : // +#ifdef GTSAM_POINTS_USE_TBB // + new Index(*this->frame, KdTreeBuilderTBB()) // +#else // + new Index(*this->frame, KdTreeBuilder()) +#endif + ) { if (frame::size(*frame) == 0) { std::cerr << "error: empty frame is given for KdTree2" << std::endl; std::cerr << " : frame::size() may not be implemented" << std::endl; diff --git a/include/gtsam_points/ann/small_kdtree.hpp b/include/gtsam_points/ann/small_kdtree.hpp index 23a2df10..142b5edd 100644 --- a/include/gtsam_points/ann/small_kdtree.hpp +++ b/include/gtsam_points/ann/small_kdtree.hpp @@ -44,9 +44,14 @@ #include #include +#include #include #include +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif + namespace gtsam_points { /// @brief Parameters to control the projection axis search. @@ -266,6 +271,77 @@ struct KdTreeBuilderOMP { ProjectionSetting projection_setting; ///< Projection setting. }; +#ifdef GTSAM_POINTS_USE_TBB +/// @brief Kd-tree builder with TBB. +struct KdTreeBuilderTBB { +public: + /// @brief Build KdTree + template + void build_tree(KdTree& kdtree, const PointCloud& points) const { + kdtree.indices.resize(frame::size(points)); + std::iota(kdtree.indices.begin(), kdtree.indices.end(), 0); + + std::atomic_uint64_t node_count = 0; + kdtree.nodes.resize(frame::size(points)); + kdtree.root = create_node(kdtree, node_count, points, kdtree.indices.begin(), kdtree.indices.begin(), kdtree.indices.end()); + kdtree.nodes.resize(node_count); + } + + /// @brief Create a Kd-tree node from the given point indices. + /// @param global_first Global first point index iterator (i.e., this->indices.begin()). + /// @param first First point index iterator to be scanned. + /// @param last Last point index iterator to be scanned. + /// @return Index of the created node. + template + NodeIndexType create_node( + KdTree& kdtree, + std::atomic_uint64_t& node_count, + const PointCloud& points, + IndexConstIterator global_first, + IndexConstIterator first, + IndexConstIterator last) const { + const size_t N = std::distance(first, last); + const NodeIndexType node_index = node_count++; + auto& node = kdtree.nodes[node_index]; + + // Create a leaf node. + if (N <= max_leaf_size) { + // std::sort(first, last); + node.node_type.lr.first = std::distance(global_first, first); + node.node_type.lr.last = std::distance(global_first, last); + + return node_index; + } + + // Find the best axis to split the input points. + using Projection = typename KdTree::Projection; + const auto proj = Projection::find_axis(points, first, last, projection_setting); + const auto median_itr = first + N / 2; + std::nth_element(first, median_itr, last, [&](size_t i, size_t j) { return proj(frame::point(points, i)) < proj(frame::point(points, j)); }); + + // Create a non-leaf node. + node.node_type.sub.proj = proj; + node.node_type.sub.thresh = proj(frame::point(points, *median_itr)); + + // Create left and right child nodes. + if (N > 512) { + tbb::parallel_invoke( + [&] { node.left = create_node(kdtree, node_count, points, global_first, first, median_itr); }, + [&] { node.right = create_node(kdtree, node_count, points, global_first, median_itr, last); }); + } else { + node.left = create_node(kdtree, node_count, points, global_first, first, median_itr); + node.right = create_node(kdtree, node_count, points, global_first, median_itr, last); + } + + return node_index; + } + +public: + int max_leaf_size = 20; ///< Maximum number of points in a leaf node. + ProjectionSetting projection_setting; ///< Projection setting. +}; +#endif + /// @brief "Unsafe" KdTree. /// @note This class does not hold the ownership of the input points. /// You must keep the input points along with this class. diff --git a/include/gtsam_points/config.hpp.in b/include/gtsam_points/config.hpp.in new file mode 100644 index 00000000..28bb2c3e --- /dev/null +++ b/include/gtsam_points/config.hpp.in @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2021 Kenji Koide (k.koide@aist.go.jp) + +#pragma once + +// Library version +#define GTSAM_POINTS_VERSION_MAJOR @CMAKE_PROJECT_VERSION_MAJOR@ +#define GTSAM_POINTS_VERSION_MINOR @CMAKE_PROJECT_VERSION_MINOR@ +#define GTSAM_POINTS_VERSION_PATCH @CMAKE_PROJECT_VERSION_PATCH@ +#define GTSAM_POINTS_VERSION_STRING "@CMAKE_PROJECT_VERSION@" + +#cmakedefine GTSAM_POINTS_USE_TBB + +#cmakedefine GTSAM_POINTS_USE_OPENMP + +#cmakedefine GTSAM_POINTS_USE_CUDA + +#cmakedefine GTSAM_POINTS_WITH_MARCH_NATIVE \ No newline at end of file diff --git a/include/gtsam_points/factors/impl/integrated_color_consistency_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_color_consistency_factor_impl.hpp index f5f086a5..d1c85e55 100644 --- a/include/gtsam_points/factors/impl/integrated_color_consistency_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_color_consistency_factor_impl.hpp @@ -4,7 +4,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -83,23 +90,41 @@ void IntegratedColorConsistencyFactor_knn_search(pt.data(), 1, &k_index, &k_sq_dist, max_correspondence_distance_sq); - correspondences[i] = (num_found && k_sq_dist < max_correspondence_distance_sq) ? k_index : -1; + size_t num_found = target_tree->knn_search(pt.data(), 1, &k_index, &k_sq_dist, max_correspondence_distance_sq); + correspondences[i] = (num_found && k_sq_dist < max_correspondence_distance_sq) ? k_index : -1; + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } - - last_correspondence_point = delta; } template @@ -115,27 +140,16 @@ double IntegratedColorConsistencyFactor_> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); - } - -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors_photo) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { const long target_index = correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } // source atributes @@ -155,17 +169,12 @@ double IntegratedColorConsistencyFactor_ J_transed_target = Eigen::Matrix::Zero(); J_transed_target.block<3, 3>(0, 0) = gtsam::SO3::Hat(transed_A.head<3>()); J_transed_target.block<3, 3>(0, 3) = -Eigen::Matrix3d::Identity(); @@ -185,30 +194,36 @@ double IntegratedColorConsistencyFactor_ J_ephoto_target = J_ephoto_transed * J_transed_target; Eigen::Matrix J_ephoto_source = J_ephoto_transed * J_transed_source; - Hs_target[thread_num] += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_target; - Hs_source[thread_num] += J_ephoto_source.transpose() * photometric_term_weight * J_ephoto_source; - Hs_target_source[thread_num] += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_source; - bs_target[thread_num] += J_ephoto_target.transpose() * photometric_term_weight * error_photo; - bs_source[thread_num] += J_ephoto_source.transpose() * photometric_term_weight * error_photo; - } + *H_target += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_target; + *H_source += J_ephoto_source.transpose() * photometric_term_weight * J_ephoto_source; + *H_target_source += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_source; + *b_target += J_ephoto_target.transpose() * photometric_term_weight * error_photo; + *b_source += J_ephoto_source.transpose() * photometric_term_weight * error_photo; + + return err; + }; + + double sum_errors_photo = 0.0; + + std::vector> Hs_target; + std::vector> Hs_source; + std::vector> Hs_target_source; + std::vector> bs_target; + std::vector> bs_source; if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } + Hs_target.resize(num_threads, Eigen::Matrix::Zero()); + Hs_source.resize(num_threads, Eigen::Matrix::Zero()); + Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); + bs_target.resize(num_threads, Eigen::Matrix::Zero()); + bs_source.resize(num_threads, Eigen::Matrix::Zero()); } - return sum_errors_photo; + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); + } } } // namespace gtsam_points \ No newline at end of file diff --git a/include/gtsam_points/factors/impl/integrated_colored_gicp_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_colored_gicp_factor_impl.hpp index ba1e76f7..e7937cee 100644 --- a/include/gtsam_points/factors/impl/integrated_colored_gicp_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_colored_gicp_factor_impl.hpp @@ -4,7 +4,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -83,11 +90,14 @@ void IntegratedColoredGICPFactor_: } } + if (do_update) { + last_correspondence_point = delta; + } + correspondences.resize(frame::size(*source)); mahalanobis.resize(frame::size(*source)); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { if (do_update) { Eigen::Vector4d pt = delta * frame::point(*source, i); pt[3] = frame::intensity(*source, i); @@ -108,9 +118,25 @@ void IntegratedColoredGICPFactor_: mahalanobis[i] = RCR.inverse(); mahalanobis[i](3, 3) = 0.0; } - } + }; - last_correspondence_point = delta; + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif + } } template @@ -128,28 +154,16 @@ double IntegratedColoredGICPFactor_> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); - } - -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors_geom) reduction(+ : sum_errors_photo) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { const long target_index = correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } // source atributes @@ -167,25 +181,20 @@ double IntegratedColoredGICPFactor_ J_transed_target = Eigen::Matrix::Zero(); J_transed_target.block<3, 3>(0, 0) = gtsam::SO3::Hat(transed_A.head<3>()); J_transed_target.block<3, 3>(0, 3) = -Eigen::Matrix3d::Identity(); @@ -198,11 +207,11 @@ double IntegratedColoredGICPFactor_ J_projected_transed = Eigen::Matrix4d::Identity() - normal_B * normal_B.transpose(); @@ -215,30 +224,20 @@ double IntegratedColoredGICPFactor_ J_ephoto_target = J_ephoto_transed * J_transed_target; Eigen::Matrix J_ephoto_source = J_ephoto_transed * J_transed_source; - Hs_target[thread_num] += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_target; - Hs_source[thread_num] += J_ephoto_source.transpose() * photometric_term_weight * J_ephoto_source; - Hs_target_source[thread_num] += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_source; - bs_target[thread_num] += J_ephoto_target.transpose() * photometric_term_weight * error_photo; - bs_source[thread_num] += J_ephoto_source.transpose() * photometric_term_weight * error_photo; - } + *H_target += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_target; + *H_source += J_ephoto_source.transpose() * photometric_term_weight * J_ephoto_source; + *H_target_source += J_ephoto_target.transpose() * photometric_term_weight * J_ephoto_source; + *b_target += J_ephoto_target.transpose() * photometric_term_weight * residual_photo; + *b_source += J_ephoto_source.transpose() * photometric_term_weight * residual_photo; - if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } - } + return error_geom + error_photo; + }; - return sum_errors_geom + sum_errors_photo; + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); + } } } // namespace gtsam_points \ No newline at end of file diff --git a/include/gtsam_points/factors/impl/integrated_ct_gicp_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_ct_gicp_factor_impl.hpp index ed06bc97..b0d792f9 100644 --- a/include/gtsam_points/factors/impl/integrated_ct_gicp_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_ct_gicp_factor_impl.hpp @@ -4,7 +4,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -46,12 +53,16 @@ double IntegratedCT_GICPFactor_::error(const gtsam::Va this->update_correspondences(); } - double sum_errors = 0.0; -#pragma omp parallel for reduction(+ : sum_errors) schedule(guided, 8) num_threads(this->num_threads) - for (int i = 0; i < frame::size(*this->source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { const long target_index = this->correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } const int time_index = this->time_indices[i]; @@ -60,13 +71,18 @@ double IntegratedCT_GICPFactor_::error(const gtsam::Va const auto& source_pt = frame::point(*this->source, i); const auto& target_pt = frame::point(*this->target, target_index); - Eigen::Vector4d transed_source_pt = pose * source_pt; - Eigen::Vector4d error = transed_source_pt - target_pt; + const Eigen::Vector4d transed_source_pt = pose * source_pt; + const Eigen::Vector4d residual = transed_source_pt - target_pt; + const double error = 0.5 * residual.transpose() * mahalanobis[i] * residual; - sum_errors += 0.5 * error.transpose() * mahalanobis[i] * error; - } + return error; + }; - return sum_errors; + if (is_omp_default() || this->num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*this->source), this->num_threads, nullptr, nullptr, nullptr, nullptr, nullptr); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*this->source), nullptr, nullptr, nullptr, nullptr, nullptr); + } } template @@ -74,24 +90,16 @@ boost::shared_ptr IntegratedCT_GICPFactor_update_poses(values); this->update_correspondences(); - double sum_errors = 0.0; - std::vector Hs_00(this->num_threads, gtsam::Matrix6::Zero()); - std::vector Hs_01(this->num_threads, gtsam::Matrix6::Zero()); - std::vector Hs_11(this->num_threads, gtsam::Matrix6::Zero()); - std::vector bs_0(this->num_threads, gtsam::Vector6::Zero()); - std::vector bs_1(this->num_threads, gtsam::Vector6::Zero()); - - gtsam::Matrix6 H_00 = gtsam::Matrix6::Zero(); - gtsam::Matrix6 H_01 = gtsam::Matrix6::Zero(); - gtsam::Matrix6 H_11 = gtsam::Matrix6::Zero(); - gtsam::Vector6 b_0 = gtsam::Vector6::Zero(); - gtsam::Vector6 b_1 = gtsam::Vector6::Zero(); - -#pragma omp parallel for reduction(+ : sum_errors) schedule(guided, 8) num_threads(this->num_threads) - for (int i = 0; i < frame::size(*this->source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_00, + Eigen::Matrix* H_11, + Eigen::Matrix* H_01, + Eigen::Matrix* b_0, + Eigen::Matrix* b_1) { const long target_index = this->correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } const int time_index = this->time_indices[i]; @@ -108,39 +116,40 @@ boost::shared_ptr IntegratedCT_GICPFactor_(0, 3) = pose.linear(); const Eigen::Vector4d transed_source_pt = pose * source_pt; - const auto& H_error_pose = H_transed_pose; - const Eigen::Vector4d error = transed_source_pt - target_pt; - - const gtsam::Matrix46 H_0 = H_error_pose * H_pose_0; - const gtsam::Matrix46 H_1 = H_error_pose * H_pose_1; + const auto& H_residual_pose = H_transed_pose; + const Eigen::Vector4d residual = transed_source_pt - target_pt; - int thread_num = 0; -#ifdef _OPENMP - thread_num = omp_get_thread_num(); -#endif + const gtsam::Matrix46 H_0 = H_residual_pose * H_pose_0; + const gtsam::Matrix46 H_1 = H_residual_pose * H_pose_1; - const gtsam::Vector4 mahalanobis_error = mahalanobis[i] * error; + const gtsam::Vector4 mahalanobis_residual = mahalanobis[i] * residual; const gtsam::Matrix64 H_0_mahalanobis = H_0.transpose() * mahalanobis[i]; const gtsam::Matrix64 H_1_mahalanobis = H_1.transpose() * mahalanobis[i]; - sum_errors += 0.5 * error.transpose() * mahalanobis_error; - Hs_00[thread_num] += H_0_mahalanobis * H_0; - Hs_11[thread_num] += H_1_mahalanobis * H_1; - Hs_01[thread_num] += H_0_mahalanobis * H_1; - bs_0[thread_num] += H_0.transpose() * mahalanobis_error; - bs_1[thread_num] += H_1.transpose() * mahalanobis_error; + const double error = 0.5 * residual.transpose() * mahalanobis_residual; + *H_00 += H_0_mahalanobis * H_0; + *H_11 += H_1_mahalanobis * H_1; + *H_01 += H_0_mahalanobis * H_1; + *b_0 += H_0.transpose() * mahalanobis_residual; + *b_1 += H_1.transpose() * mahalanobis_residual; + + return error; + }; + + double error = 0.0; + gtsam::Matrix6 H_00; + gtsam::Matrix6 H_01; + gtsam::Matrix6 H_11; + gtsam::Vector6 b_0; + gtsam::Vector6 b_1; + + if (is_omp_default() || this->num_threads == 1) { + error = scan_matching_reduce_omp(perpoint_task, frame::size(*this->source), this->num_threads, &H_00, &H_11, &H_01, &b_0, &b_1); + } else { + error = scan_matching_reduce_tbb(perpoint_task, frame::size(*this->source), &H_00, &H_11, &H_01, &b_0, &b_1); } - for (int i = 1; i < Hs_00.size(); i++) { - Hs_00[0] += Hs_00[i]; - Hs_11[0] += Hs_11[i]; - Hs_01[0] += Hs_01[i]; - bs_0[0] += bs_0[i]; - bs_1[0] += bs_1[i]; - } - - auto factor = gtsam::HessianFactor::shared_ptr( - new gtsam::HessianFactor(this->keys_[0], this->keys_[1], Hs_00[0], Hs_01[0], -bs_0[0], Hs_11[0], -bs_1[0], sum_errors)); + auto factor = gtsam::HessianFactor::shared_ptr(new gtsam::HessianFactor(this->keys_[0], this->keys_[1], H_00, H_01, -b_0, H_11, -b_1, error)); return factor; } @@ -149,8 +158,7 @@ void IntegratedCT_GICPFactor_::update_correspondences( this->correspondences.resize(frame::size(*this->source)); this->mahalanobis.resize(frame::size(*this->source)); -#pragma omp parallel for schedule(guided, 8) num_threads(this->num_threads) - for (int i = 0; i < frame::size(*this->source); i++) { + const auto perpoint_task = [&](int i) { const int time_index = this->time_indices[i]; const Eigen::Matrix4d pose = this->source_poses[time_index].matrix(); @@ -175,6 +183,24 @@ void IntegratedCT_GICPFactor_::update_correspondences( mahalanobis[i].setZero(); mahalanobis[i].block<3, 3>(0, 0) = RCR.block<3, 3>(0, 0).inverse(); } + }; + + if (is_omp_default() || this->num_threads == 1) { +#pragma omp parallel for num_threads(this->num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*this->source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*this->source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } } } // namespace gtsam_points \ No newline at end of file diff --git a/include/gtsam_points/factors/impl/integrated_ct_icp_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_ct_icp_factor_impl.hpp index 13f5d395..155bc050 100644 --- a/include/gtsam_points/factors/impl/integrated_ct_icp_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_ct_icp_factor_impl.hpp @@ -4,7 +4,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -72,11 +79,16 @@ double IntegratedCT_ICPFactor_::error(const gtsam::Val update_correspondences(); } - double sum_errors = 0.0; - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { const long target_index = correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } const int time_index = time_indices[i]; @@ -90,10 +102,14 @@ double IntegratedCT_ICPFactor_::error(const gtsam::Val gtsam::Point3 residual = transed_source_pt - target_pt.template head<3>(); double error = gtsam::dot(residual, target_normal.template head<3>()); - sum_errors += 0.5 * error * error; - } + return 0.5 * error * error; + }; - return sum_errors; + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*this->source), this->num_threads, nullptr, nullptr, nullptr, nullptr, nullptr); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*this->source), nullptr, nullptr, nullptr, nullptr, nullptr); + } } template @@ -106,17 +122,16 @@ boost::shared_ptr IntegratedCT_ICPFactor_* H_00, + Eigen::Matrix* H_11, + Eigen::Matrix* H_01, + Eigen::Matrix* b_0, + Eigen::Matrix* b_1) { const long target_index = correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } const int time_index = time_indices[i]; @@ -141,15 +156,29 @@ boost::shared_ptr IntegratedCT_ICPFactor_source), this->num_threads, &H_00, &H_11, &H_01, &b_0, &b_1); + } else { + error = scan_matching_reduce_tbb(perpoint_task, frame::size(*this->source), &H_00, &H_11, &H_01, &b_0, &b_1); } - auto factor = gtsam::HessianFactor::shared_ptr(new gtsam::HessianFactor(keys_[0], keys_[1], H_00, H_01, -b_0, H_11, -b_1, sum_errors)); + auto factor = gtsam::HessianFactor::shared_ptr(new gtsam::HessianFactor(this->keys_[0], this->keys_[1], H_00, H_01, -b_0, H_11, -b_1, error)); return factor; } @@ -188,7 +217,7 @@ template void IntegratedCT_ICPFactor_::update_correspondences() const { correspondences.resize(frame::size(*source)); - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { const int time_index = time_indices[i]; const auto& pt = frame::point(*source, i); @@ -203,6 +232,24 @@ void IntegratedCT_ICPFactor_::update_correspondences() } else { correspondences[i] = k_index; } + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(this->num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*this->source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*this->source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } } diff --git a/include/gtsam_points/factors/impl/integrated_gicp_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_gicp_factor_impl.hpp index 3706e1d6..412e0443 100644 --- a/include/gtsam_points/factors/impl/integrated_gicp_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_gicp_factor_impl.hpp @@ -5,8 +5,15 @@ #include #include +#include #include +#include #include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -112,8 +119,7 @@ void IntegratedGICPFactor_::update_correspondences(con correspondences.resize(frame::size(*source)); mahalanobis.resize(frame::size(*source)); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { if (do_update) { Eigen::Vector4d pt = delta * frame::point(*source, i); @@ -133,6 +139,24 @@ void IntegratedGICPFactor_::update_correspondences(con mahalanobis[i] = RCR.inverse(); mahalanobis[i](3, 3) = 0.0; } + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } } @@ -149,43 +173,29 @@ double IntegratedGICPFactor_::evaluate( update_correspondences(delta); } - // - double sum_errors = 0.0; - - std::vector> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); - } - -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { const long target_index = correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } const auto& mean_A = frame::point(*source, i); const auto& cov_A = frame::cov(*source, i); - const auto& mean_B = frame::point(*target, target_index); const auto& cov_B = frame::cov(*target, target_index); - Eigen::Vector4d transed_mean_A = delta * mean_A; - Eigen::Vector4d error = mean_B - transed_mean_A; - - sum_errors += 0.5 * error.transpose() * mahalanobis[i] * error; + const Eigen::Vector4d transed_mean_A = delta * mean_A; + const Eigen::Vector4d residual = mean_B - transed_mean_A; - if (Hs_target.empty()) { - continue; + const double error = 0.5 * residual.transpose() * mahalanobis[i] * residual; + if (H_target == nullptr) { + return error; } Eigen::Matrix J_target = Eigen::Matrix::Zero(); @@ -196,38 +206,23 @@ double IntegratedGICPFactor_::evaluate( J_source.block<3, 3>(0, 0) = delta.linear() * gtsam::SO3::Hat(mean_A.template head<3>()); J_source.block<3, 3>(0, 3) = -delta.linear(); - int thread_num = 0; -#ifdef _OPENMP - thread_num = omp_get_thread_num(); -#endif - Eigen::Matrix J_target_mahalanobis = J_target.transpose() * mahalanobis[i]; Eigen::Matrix J_source_mahalanobis = J_source.transpose() * mahalanobis[i]; - Hs_target[thread_num] += J_target_mahalanobis * J_target; - Hs_source[thread_num] += J_source_mahalanobis * J_source; - Hs_target_source[thread_num] += J_target_mahalanobis * J_source; - bs_target[thread_num] += J_target_mahalanobis * error; - bs_source[thread_num] += J_source_mahalanobis * error; - } + *H_target += J_target_mahalanobis * J_target; + *H_source += J_source_mahalanobis * J_source; + *H_target_source += J_target_mahalanobis * J_source; + *b_target += J_target_mahalanobis * residual; + *b_source += J_source_mahalanobis * residual; - if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } - } + return error; + }; - return sum_errors; + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); + } } } // namespace gtsam_points diff --git a/include/gtsam_points/factors/impl/integrated_icp_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_icp_factor_impl.hpp index bdd657bd..f450861e 100644 --- a/include/gtsam_points/factors/impl/integrated_icp_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_icp_factor_impl.hpp @@ -5,9 +5,15 @@ #include #include +#include #include - #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -102,19 +108,24 @@ IntegratedICPFactor_::~IntegratedICPFactor_() {} template void IntegratedICPFactor_::update_correspondences(const Eigen::Isometry3d& delta) const { + bool do_update = true; if (correspondences.size() == frame::size(*source) && (correspondence_update_tolerance_trans > 0.0 || correspondence_update_tolerance_rot > 0.0)) { Eigen::Isometry3d diff = delta.inverse() * last_correspondence_point; double diff_rot = Eigen::AngleAxisd(diff.linear()).angle(); double diff_trans = diff.translation().norm(); if (diff_rot < correspondence_update_tolerance_rot && diff_trans < correspondence_update_tolerance_trans) { - return; + do_update = false; } } + if (!do_update) { + return; + } + + last_correspondence_point = delta; correspondences.resize(frame::size(*source)); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { Eigen::Vector4d pt = delta * frame::point(*source, i); size_t k_index = -1; @@ -126,9 +137,25 @@ void IntegratedICPFactor_::update_correspondences(cons } else { correspondences[i] = k_index; } - } + }; - last_correspondence_point = delta; + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif + } } template @@ -144,45 +171,32 @@ double IntegratedICPFactor_::evaluate( update_correspondences(delta); } - // - double sum_errors = 0.0; - - std::vector> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); - } - -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { long target_index = correspondences[i]; if (target_index < 0) { - continue; + return 0.0; } const auto& mean_A = frame::point(*source, i); const auto& mean_B = frame::point(*target, target_index); - Eigen::Vector4d transed_mean_A = delta * mean_A; - Eigen::Vector4d error = mean_B - transed_mean_A; + const Eigen::Vector4d transed_mean_A = delta * mean_A; + Eigen::Vector4d residual = mean_B - transed_mean_A; if (use_point_to_plane) { const auto& normal_B = frame::normal(*target, target_index); - error = normal_B.array() * error.array(); + residual = normal_B.array() * residual.array(); } - sum_errors += 0.5 * error.transpose() * error; - - if (Hs_target.empty()) { - continue; + const double error = 0.5 * residual.transpose() * residual; + if (H_target == nullptr) { + return error; } Eigen::Matrix J_target = Eigen::Matrix::Zero(); @@ -199,35 +213,20 @@ double IntegratedICPFactor_::evaluate( J_source = normal_B.asDiagonal() * J_source; } - int thread_num = 0; -#ifdef _OPENMP - thread_num = omp_get_thread_num(); -#endif + *H_target += J_target.transpose() * J_target; + *H_source += J_source.transpose() * J_source; + *H_target_source += J_target.transpose() * J_source; + *b_target += J_target.transpose() * residual; + *b_source += J_source.transpose() * residual; - Hs_target[thread_num] += J_target.transpose() * J_target; - Hs_source[thread_num] += J_source.transpose() * J_source; - Hs_target_source[thread_num] += J_target.transpose() * J_source; - bs_target[thread_num] += J_target.transpose() * error; - bs_source[thread_num] += J_source.transpose() * error; - } + return error; + }; - if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); } - - return sum_errors; } } // namespace gtsam_points diff --git a/include/gtsam_points/factors/impl/integrated_loam_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_loam_factor_impl.hpp index 3d1675da..c893ca81 100644 --- a/include/gtsam_points/factors/impl/integrated_loam_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_loam_factor_impl.hpp @@ -4,7 +4,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -62,10 +69,10 @@ void IntegratedPointToPlaneFactor_::update_corresponde } } + last_correspondence_point = delta; correspondences.resize(frame::size(*source)); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { Eigen::Vector4d pt = delta * frame::point(*source, i); std::array k_indices; @@ -77,9 +84,25 @@ void IntegratedPointToPlaneFactor_::update_corresponde } else { correspondences[i] = std::make_tuple(k_indices[0], k_indices[1], k_indices[2]); } - } + }; - last_correspondence_point = delta; + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif + } } template @@ -95,28 +118,16 @@ double IntegratedPointToPlaneFactor_::evaluate( update_correspondences(delta); } - // - double sum_errors = 0.0; - - std::vector> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); - } - -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { auto target_indices = correspondences[i]; if (std::get<0>(target_indices) < 0) { - continue; + return 0.0; } const auto& x_i = frame::point(*source, i); @@ -129,12 +140,12 @@ double IntegratedPointToPlaneFactor_::evaluate( normal.head<3>() = (x_j - x_l).template head<3>().cross((x_j - x_m).template head<3>()); normal = normal / normal.norm(); - Eigen::Vector4d error = x_j - transed_x_i; - Eigen::Vector4d plane_error = error.array() * normal.array(); - sum_errors += 0.5 * plane_error.transpose() * plane_error; + Eigen::Vector4d residual = x_j - transed_x_i; + Eigen::Vector4d plane_residual = residual.array() * normal.array(); + const double error = 0.5 * plane_residual.transpose() * plane_residual; - if (Hs_target.empty()) { - continue; + if (H_target == nullptr) { + return error; } Eigen::Matrix J_target = Eigen::Matrix::Zero(); @@ -148,35 +159,20 @@ double IntegratedPointToPlaneFactor_::evaluate( J_target = normal.asDiagonal() * J_target; J_source = normal.asDiagonal() * J_source; - int thread_num = 0; -#ifdef _OPENMP - thread_num = omp_get_thread_num(); -#endif + *H_target += J_target.transpose() * J_target; + *H_source += J_source.transpose() * J_source; + *H_target_source += J_target.transpose() * J_source; + *b_target += J_target.transpose() * plane_residual; + *b_source += J_source.transpose() * plane_residual; - Hs_target[thread_num] += J_target.transpose() * J_target; - Hs_source[thread_num] += J_source.transpose() * J_source; - Hs_target_source[thread_num] += J_target.transpose() * J_source; - bs_target[thread_num] += J_target.transpose() * plane_error; - bs_source[thread_num] += J_source.transpose() * plane_error; - } + return error; + }; - if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); } - - return sum_errors; } template @@ -229,9 +225,9 @@ void IntegratedPointToEdgeFactor_::update_corresponden } correspondences.resize(frame::size(*source)); + last_correspondence_point = delta; -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { Eigen::Vector4d pt = delta * frame::point(*source, i); std::array k_indices; @@ -243,9 +239,25 @@ void IntegratedPointToEdgeFactor_::update_corresponden } else { correspondences[i] = std::make_tuple(k_indices[0], k_indices[1]); } - } + }; - last_correspondence_point = delta; + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif + } } template @@ -261,28 +273,16 @@ double IntegratedPointToEdgeFactor_::evaluate( update_correspondences(delta); } - // - double sum_errors = 0.0; - - std::vector> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); - } - -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { auto target_indices = correspondences[i]; if (std::get<0>(target_indices) < 0) { - continue; + return 0.0; } const auto& x_i = frame::point(*source, i); @@ -295,12 +295,12 @@ double IntegratedPointToEdgeFactor_::evaluate( Eigen::Vector4d x_ij = transed_x_i - x_j; Eigen::Vector4d x_il = transed_x_i - x_l; - Eigen::Vector4d error = Eigen::Vector4d::Zero(); - error.head<3>() = x_ij.head<3>().cross(x_il.head<3>()) * c_inv; - sum_errors += 0.5 * error.dot(error); + Eigen::Vector4d residual = Eigen::Vector4d::Zero(); + residual.head<3>() = x_ij.head<3>().cross(x_il.head<3>()) * c_inv; + const double error = 0.5 * residual.dot(residual); - if (Hs_target.empty()) { - continue; + if (H_target == nullptr) { + return error; } Eigen::Matrix J_target = Eigen::Matrix::Zero(); @@ -319,35 +319,20 @@ double IntegratedPointToEdgeFactor_::evaluate( J_target = c_inv * J_e * J_target; J_source = c_inv * J_e * J_source; - int thread_num = 0; -#ifdef _OPENMP - thread_num = omp_get_thread_num(); -#endif + *H_target += J_target.transpose() * J_target; + *H_source += J_source.transpose() * J_source; + *H_target_source += J_target.transpose() * J_source; + *b_target += J_target.transpose() * residual; + *b_source += J_source.transpose() * residual; - Hs_target[thread_num] += J_target.transpose() * J_target; - Hs_source[thread_num] += J_source.transpose() * J_source; - Hs_target_source[thread_num] += J_target.transpose() * J_source; - bs_target[thread_num] += J_target.transpose() * error; - bs_source[thread_num] += J_source.transpose() * error; - } + return error; + }; - if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } + if (is_omp_default()|| num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); } - - return sum_errors; } template diff --git a/include/gtsam_points/factors/impl/integrated_vgicp_factor_impl.hpp b/include/gtsam_points/factors/impl/integrated_vgicp_factor_impl.hpp index 9c9f4556..cc565839 100644 --- a/include/gtsam_points/factors/impl/integrated_vgicp_factor_impl.hpp +++ b/include/gtsam_points/factors/impl/integrated_vgicp_factor_impl.hpp @@ -5,7 +5,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -71,8 +78,7 @@ void IntegratedVGICPFactor_::update_correspondences(const Eigen::Is correspondences.resize(frame::size(*source)); mahalanobis.resize(frame::size(*source)); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + const auto perpoint_task = [&](int i) { Eigen::Vector4d pt = delta * frame::point(*source, i); Eigen::Vector3i coord = target_voxels->voxel_coord(pt); const auto voxel_id = target_voxels->lookup_voxel_index(coord); @@ -90,6 +96,24 @@ void IntegratedVGICPFactor_::update_correspondences(const Eigen::Is mahalanobis[i] = RCR.inverse(); mahalanobis[i](3, 3) = 0.0; } + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame::size(*source); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame::size(*source), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } } @@ -102,27 +126,22 @@ double IntegratedVGICPFactor_::evaluate( Eigen::Matrix* b_target, Eigen::Matrix* b_source) const { // - double sum_errors = 0.0; - - std::vector> Hs_target; - std::vector> Hs_source; - std::vector> Hs_target_source; - std::vector> bs_target; - std::vector> bs_source; - - if (H_target && H_source && H_target_source && b_target && b_source) { - Hs_target.resize(num_threads, Eigen::Matrix::Zero()); - Hs_source.resize(num_threads, Eigen::Matrix::Zero()); - Hs_target_source.resize(num_threads, Eigen::Matrix::Zero()); - bs_target.resize(num_threads, Eigen::Matrix::Zero()); - bs_source.resize(num_threads, Eigen::Matrix::Zero()); + if (correspondences.size() != frame::size(*source)) { + update_correspondences(delta); } -#pragma omp parallel for num_threads(num_threads) reduction(+ : sum_errors) schedule(guided, 8) - for (int i = 0; i < frame::size(*source); i++) { + double sum_errors = 0.0; + + const auto perpoint_task = [&]( + int i, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { const auto& target_voxel = correspondences[i]; if (target_voxel == nullptr) { - continue; + return 0.0; } const auto& mean_A = frame::point(*source, i); @@ -132,12 +151,12 @@ double IntegratedVGICPFactor_::evaluate( const auto& cov_B = target_voxel->cov; Eigen::Vector4d transed_mean_A = delta * mean_A; - Eigen::Vector4d error = mean_B - transed_mean_A; + Eigen::Vector4d residual = mean_B - transed_mean_A; - sum_errors += 0.5 * error.transpose() * mahalanobis[i] * error; + const double error = 0.5 * residual.transpose() * mahalanobis[i] * residual; - if (Hs_target.empty()) { - continue; + if (!H_target) { + return error; } Eigen::Matrix J_target = Eigen::Matrix::Zero(); @@ -148,38 +167,23 @@ double IntegratedVGICPFactor_::evaluate( J_source.block<3, 3>(0, 0) = delta.linear() * gtsam::SO3::Hat(mean_A.template head<3>()); J_source.block<3, 3>(0, 3) = -delta.linear(); - int thread_num = 0; -#ifdef _OPENMP - thread_num = omp_get_thread_num(); -#endif - Eigen::Matrix J_target_mahalanobis = J_target.transpose() * mahalanobis[i]; Eigen::Matrix J_source_mahalanobis = J_source.transpose() * mahalanobis[i]; - Hs_target[thread_num] += J_target_mahalanobis * J_target; - Hs_source[thread_num] += J_source_mahalanobis * J_source; - Hs_target_source[thread_num] += J_target_mahalanobis * J_source; - bs_target[thread_num] += J_target_mahalanobis * error; - bs_source[thread_num] += J_source_mahalanobis * error; - } + *H_target += J_target_mahalanobis * J_target; + *H_source += J_source_mahalanobis * J_source; + *H_target_source += J_target_mahalanobis * J_source; + *b_target += J_target_mahalanobis * residual; + *b_source += J_source_mahalanobis * residual; - if (H_target && H_source && H_target_source && b_target && b_source) { - H_target->setZero(); - H_source->setZero(); - H_target_source->setZero(); - b_target->setZero(); - b_source->setZero(); - - for (int i = 0; i < num_threads; i++) { - (*H_target) += Hs_target[i]; - (*H_source) += Hs_source[i]; - (*H_target_source) += Hs_target_source[i]; - (*b_target) += bs_target[i]; - (*b_source) += bs_source[i]; - } - } + return error; + }; - return sum_errors; + if (is_omp_default() || num_threads == 1) { + return scan_matching_reduce_omp(perpoint_task, frame::size(*source), num_threads, H_target, H_source, H_target_source, b_target, b_source); + } else { + return scan_matching_reduce_tbb(perpoint_task, frame::size(*source), H_target, H_source, H_target_source, b_target, b_source); + } } } // namespace gtsam_points diff --git a/include/gtsam_points/factors/impl/scan_matching_reduction.hpp b/include/gtsam_points/factors/impl/scan_matching_reduction.hpp new file mode 100644 index 00000000..66b4e334 --- /dev/null +++ b/include/gtsam_points/factors/impl/scan_matching_reduction.hpp @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024 Kenji Koide (k.koide@aist.go.jp) +#pragma once + +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#include +#endif + +namespace gtsam_points { + +template +double scan_matching_reduce_omp( + const Transform f, + int num_points, + int num_threads, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { + double sum_errors = 0.0; + + const int num_Hs = H_target ? num_threads : 0; + std::vector> Hs_target(num_Hs, Eigen::Matrix::Zero()); + std::vector> Hs_source(num_Hs, Eigen::Matrix::Zero()); + std::vector> Hs_target_source(num_Hs, Eigen::Matrix::Zero()); + std::vector> bs_target(num_Hs, Eigen::Matrix::Zero()); + std::vector> bs_source(num_Hs, Eigen::Matrix::Zero()); + +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) reduction(+ : sum_errors) + for (int i = 0; i < num_points; i++) { + int thread_num = 0; +#ifdef _OPENMP + thread_num = omp_get_thread_num(); +#endif + + double error = 0.0; + if (Hs_target.empty()) { + error = f(i, nullptr, nullptr, nullptr, nullptr, nullptr); + } else { + error = f(i, &Hs_target[thread_num], &Hs_source[thread_num], &Hs_target_source[thread_num], &bs_target[thread_num], &bs_source[thread_num]); + } + + sum_errors += error; + } + + if (H_target) { + *H_target = Hs_target[0]; + *H_source = Hs_source[0]; + *H_target_source = Hs_target_source[0]; + *b_target = bs_target[0]; + *b_source = bs_source[0]; + + for (int i = 1; i < num_threads; i++) { + *H_target += Hs_target[i]; + *H_source += Hs_source[i]; + *H_target_source += Hs_target_source[i]; + *b_target += bs_target[i]; + *b_source += bs_source[i]; + } + } + + return sum_errors; +} + +#ifdef GTSAM_POINTS_USE_TBB + +template +struct ScanMatchingReductionTBBError { +public: + ScanMatchingReductionTBBError(const Transform f) : f(f), sum_errors(0.0) {} + ScanMatchingReductionTBBError(const ScanMatchingReductionTBBError& other, tbb::split) : f(other.f), sum_errors(0.0) {} + + void operator()(const tbb::blocked_range& range) { + double local_sum_errors = sum_errors; + for (int i = range.begin(); i != range.end(); i++) { + local_sum_errors += f(i, nullptr, nullptr, nullptr, nullptr, nullptr); + } + sum_errors = local_sum_errors; + } + + void join(const ScanMatchingReductionTBBError& other) { sum_errors += other.sum_errors; } + +public: + const Transform f; + double sum_errors; +}; + +template +struct ScanMatchingReductionTBBLinearize { +public: + ScanMatchingReductionTBBLinearize(const Transform f) + : f(f), + sum_errors(0.0), + H_target(Eigen::Matrix::Zero()), + H_source(Eigen::Matrix::Zero()), + H_target_source(Eigen::Matrix::Zero()), + b_target(Eigen::Matrix::Zero()), + b_source(Eigen::Matrix::Zero()) {} + + ScanMatchingReductionTBBLinearize(const ScanMatchingReductionTBBLinearize& other, tbb::split) + : f(other.f), + sum_errors(0.0), + H_target(Eigen::Matrix::Zero()), + H_source(Eigen::Matrix::Zero()), + H_target_source(Eigen::Matrix::Zero()), + b_target(Eigen::Matrix::Zero()), + b_source(Eigen::Matrix::Zero()) {} + + void operator()(const tbb::blocked_range& range) { + double local_sum_errors = sum_errors; + Eigen::Matrix local_H_target = Eigen::Matrix::Zero(); + Eigen::Matrix local_H_source = Eigen::Matrix::Zero(); + Eigen::Matrix local_H_target_source = Eigen::Matrix::Zero(); + Eigen::Matrix local_b_target = Eigen::Matrix::Zero(); + Eigen::Matrix local_b_source = Eigen::Matrix::Zero(); + + for (int i = range.begin(); i != range.end(); i++) { + local_sum_errors += f(i, &local_H_target, &local_H_source, &local_H_target_source, &local_b_target, &local_b_source); + } + + sum_errors = local_sum_errors; + H_target += local_H_target; + H_source += local_H_source; + H_target_source += local_H_target_source; + b_target += local_b_target; + b_source += local_b_source; + } + + void join(const ScanMatchingReductionTBBLinearize& other) { + sum_errors += other.sum_errors; + H_target += other.H_target; + H_source += other.H_source; + H_target_source += other.H_target_source; + b_target += other.b_target; + b_source += other.b_source; + } + +public: + const Transform f; + double sum_errors; + Eigen::Matrix H_target; + Eigen::Matrix H_source; + Eigen::Matrix H_target_source; + Eigen::Matrix b_target; + Eigen::Matrix b_source; +}; + +template +double scan_matching_reduce_tbb( + const Transform f, + int num_points, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { + if (H_target) { + ScanMatchingReductionTBBLinearize reduction(f); + tbb::parallel_reduce(tbb::blocked_range(0, num_points, 32), reduction); + *H_target = reduction.H_target; + *H_source = reduction.H_source; + *H_target_source = reduction.H_target_source; + *b_target = reduction.b_target; + *b_source = reduction.b_source; + return reduction.sum_errors; + } else { + ScanMatchingReductionTBBError reduction(f); + tbb::parallel_reduce(tbb::blocked_range(0, num_points, 32), reduction); + return reduction.sum_errors; + } +} + +#else +template +double scan_matching_reduce_tbb( + const Transform f, + int num_points, + Eigen::Matrix* H_target, + Eigen::Matrix* H_source, + Eigen::Matrix* H_target_source, + Eigen::Matrix* b_target, + Eigen::Matrix* b_source) { + std::cerr << "warning : TBB is not available" << std::endl; + return scan_matching_reduce_omp(f, num_points, 1, H_target, H_source, H_target_source, b_target, b_source); +} +#endif + +} // namespace gtsam_points diff --git a/include/gtsam_points/optimizers/isam2_ext_impl.hpp b/include/gtsam_points/optimizers/isam2_ext_impl.hpp index 4e0bfe53..669276b2 100644 --- a/include/gtsam_points/optimizers/isam2_ext_impl.hpp +++ b/include/gtsam_points/optimizers/isam2_ext_impl.hpp @@ -39,6 +39,7 @@ using namespace boost::adaptors; #include #include #include +#include namespace gtsam_points { diff --git a/include/gtsam_points/util/parallelism.hpp b/include/gtsam_points/util/parallelism.hpp new file mode 100644 index 00000000..a95b7977 --- /dev/null +++ b/include/gtsam_points/util/parallelism.hpp @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024 Kenji Koide (k.koide@aist.go.jp) +#pragma once + +#include + +namespace gtsam_points { + +/** + * @brief Parallelism backend + */ +enum class ParallelismBackend { + OMP, ///< OpenMP + TBB ///< Intel TBB +}; + +/// @brief Set TBB as the default parallelism backend +void set_tbb_as_default(); + +/// @brief Set OpenMP as the default parallelism backend +void set_omp_as_default(); + +/// @brief Set the default parallelism backend +void set_default_parallelism(ParallelismBackend parallelism); + +/// @brief Check if the default parallelism backend is TBB +bool is_tbb_default(); + +/// @brief Check if the default parallelism backend is OpenMP +bool is_omp_default(); + +/// @brief Get the default parallelism backend +ParallelismBackend get_default_parallelism(); + +} // namespace gtsam_points diff --git a/src/demo/demo_benchmark.cpp b/src/demo/demo_benchmark.cpp index aa05fc93..d83d116d 100644 --- a/src/demo/demo_benchmark.cpp +++ b/src/demo/demo_benchmark.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -14,7 +15,7 @@ #include #include -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA #include #include #include @@ -137,7 +138,7 @@ void benchmark_alignment(const std::string& factor_type, int num_threads, int nu frame->add_covs(gtsam_points::estimate_covariances(frame->points, frame->size(), 10, num_threads)); frames.emplace_back(frame); } else { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto frame = std::make_shared(points); frame->add_covs(gtsam_points::estimate_covariances(frame->points, frame->size(), 10, num_threads)); frames.emplace_back(frame); @@ -162,7 +163,7 @@ void benchmark_alignment(const std::string& factor_type, int num_threads, int nu } stopwatch.stop("voxelmap creation"); } else if (factor_type == "VGICP_GPU") { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA stopwatch.start(); for (int i = 0; i < 5; i++) { voxels.emplace_back(std::make_shared(1.0)); @@ -176,7 +177,7 @@ void benchmark_alignment(const std::string& factor_type, int num_threads, int nu std::cerr << "error: unknown factor type " << factor_type << std::endl; } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA gtsam_points::StreamTempBufferRoundRobin stream_buffer_roundrobin; #endif @@ -194,7 +195,7 @@ void benchmark_alignment(const std::string& factor_type, int num_threads, int nu factor->set_num_threads(num_threads); graph.add(factor); } else if (factor_type == "VGICP_GPU") { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA const auto stream_buffer = stream_buffer_roundrobin.get_stream_buffer(); const auto stream = stream_buffer.first; const auto buffer = stream_buffer.second; @@ -242,7 +243,7 @@ void print_info() { } } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA const auto cuda_devices = gtsam_points::cuda_device_names(); for (int i = 0; i < cuda_devices.size(); i++) { std::cout << boost::format("GPU model%d : %s") % i % cuda_devices[i] << std::endl; @@ -281,7 +282,7 @@ int main(int argc, char** argv) { benchmark_alignment("VGICP", max_num_threads, 10); std::cout << std::endl; -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA benchmark_alignment("VGICP_GPU", 1, 1); benchmark_alignment("VGICP_GPU", max_num_threads, 1); benchmark_alignment("VGICP_GPU", max_num_threads, 10); diff --git a/src/demo/demo_matching_cost_factors.cpp b/src/demo/demo_matching_cost_factors.cpp index ab5e22cb..d8bd85ea 100644 --- a/src/demo/demo_matching_cost_factors.cpp +++ b/src/demo/demo_matching_cost_factors.cpp @@ -8,14 +8,14 @@ #include #include +#include #include #include #include - #include #include -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA #include #include #include @@ -47,7 +47,7 @@ class MatchingCostFactorDemo { abort(); } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA std::cout << "Register GPU linearization hook" << std::endl; gtsam_points::LinearizationHook::register_hook([] { return gtsam_points::create_nonlinear_factor_set_gpu(); }); #endif @@ -83,7 +83,7 @@ class MatchingCostFactorDemo { }); auto covs = gtsam_points::estimate_covariances(points); -#ifndef BUILD_GTSAM_POINTS_GPU +#ifndef GTSAM_POINTS_USE_CUDA std::cout << "Create CPU frame" << std::endl; auto frame = std::make_shared(); #else @@ -99,7 +99,7 @@ class MatchingCostFactorDemo { voxelmap->insert(*frame); voxelmaps[i] = voxelmap; -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto voxelmap_gpu = std::make_shared(2.0); voxelmap_gpu->insert(*frame); voxelmaps_gpu[i] = voxelmap_gpu; @@ -121,11 +121,12 @@ class MatchingCostFactorDemo { factor_types.push_back("ICP_PLANE"); factor_types.push_back("GICP"); factor_types.push_back("VGICP"); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA factor_types.push_back("VGICP_GPU"); #endif full_connection = true; + num_threads = 1; correspondence_update_tolerance_rot = 0.0f; correspondence_update_tolerance_trans = 0.0f; @@ -144,6 +145,7 @@ class MatchingCostFactorDemo { // Optimization configurations ImGui::Separator(); ImGui::Checkbox("full connection", &full_connection); + ImGui::DragInt("num threads", &num_threads, 1, 1, 128); ImGui::Combo("factor type", &factor_type, factor_types.data(), factor_types.size()); ImGui::Combo("optimizer type", &optimizer_type, optimizer_types.data(), optimizer_types.size()); @@ -202,19 +204,24 @@ class MatchingCostFactorDemo { if (factor_types[factor_type] == std::string("ICP")) { auto factor = gtsam::make_shared(target_key, source_key, target, source); factor->set_correspondence_update_tolerance(correspondence_update_tolerance_rot, correspondence_update_tolerance_trans); + factor->set_num_threads(num_threads); return factor; } else if (factor_types[factor_type] == std::string("ICP_PLANE")) { auto factor = gtsam::make_shared(target_key, source_key, target, source); factor->set_correspondence_update_tolerance(correspondence_update_tolerance_rot, correspondence_update_tolerance_trans); + factor->set_num_threads(num_threads); return factor; } else if (factor_types[factor_type] == std::string("GICP")) { auto factor = gtsam::make_shared(target_key, source_key, target, source); factor->set_correspondence_update_tolerance(correspondence_update_tolerance_rot, correspondence_update_tolerance_trans); + factor->set_num_threads(num_threads); return factor; } else if (factor_types[factor_type] == std::string("VGICP")) { - return gtsam::make_shared(target_key, source_key, target_voxelmap, source); + auto factor = gtsam::make_shared(target_key, source_key, target_voxelmap, source); + factor->set_num_threads(num_threads); + return factor; } else if (factor_types[factor_type] == std::string("VGICP_GPU")) { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA return gtsam::make_shared(target_key, source_key, target_voxelmap_gpu, source); #endif } @@ -279,6 +286,7 @@ class MatchingCostFactorDemo { std::vector factor_types; int factor_type; bool full_connection; + int num_threads; std::vector optimizer_types; int optimizer_type; diff --git a/src/gtsam_points/ann/incremental_covariance_voxelmap.cpp b/src/gtsam_points/ann/incremental_covariance_voxelmap.cpp index e6dd2947..12c7b5e8 100644 --- a/src/gtsam_points/ann/incremental_covariance_voxelmap.cpp +++ b/src/gtsam_points/ann/incremental_covariance_voxelmap.cpp @@ -139,8 +139,13 @@ void IncrementalCovarianceVoxelMap::insert(const PointCloud& points) { Eigen::SelfAdjointEigenSolver eig; eig.computeDirect(cov.block<3, 3>(0, 0)); + int thread_num = 0; +#ifdef _OPENMP + thread_num = omp_get_thread_num(); +#endif + if (in_warmup) { - new_stats[omp_get_thread_num()].add(eig.eigenvalues()); + new_stats[thread_num].add(eig.eigenvalues()); } // Check if the normal is valid. @@ -232,7 +237,11 @@ std::vector IncrementalCovarianceVoxelMap::valid_indices(int num_threads const auto& voxel = flat_voxels[i]; for (int j = 0; j < voxel->second.size(); j++) { if (voxel->second.valid(j)) { - valid_indices[omp_get_thread_num()].push_back(calc_index(i, j)); + int thread_num = 0; +#ifdef _OPENMP + thread_num = omp_get_thread_num(); +#endif + valid_indices[thread_num].push_back(calc_index(i, j)); } } } diff --git a/src/gtsam_points/ann/kdtree.cpp b/src/gtsam_points/ann/kdtree.cpp index af67c4e9..5e01c3ea 100644 --- a/src/gtsam_points/ann/kdtree.cpp +++ b/src/gtsam_points/ann/kdtree.cpp @@ -2,8 +2,10 @@ // SPDX-License-Identifier: MIT #include +#include #include #include +#include namespace gtsam_points { @@ -22,7 +24,17 @@ KdTree::KdTree(const Eigen::Vector4d* points, int num_points, int build_num_thre : num_points(num_points), points(points), search_eps(-1.0), - index(new Index(*this, KdTreeBuilderOMP(build_num_threads))) {} + index( + is_omp_default() || build_num_threads == 1 ? // + new Index(*this, KdTreeBuilderOMP(build_num_threads)) // + : // +#ifdef GTSAM_POINTS_USE_TBB // + new Index(*this, KdTreeBuilderTBB()) // +#else // + new Index(*this, KdTreeBuilder()) +#endif + ) { +} KdTree::~KdTree() {} diff --git a/src/gtsam_points/factors/experimental/intensity_gradients_ivox.cpp b/src/gtsam_points/factors/experimental/intensity_gradients_ivox.cpp index 456d6601..2b7a4de1 100644 --- a/src/gtsam_points/factors/experimental/intensity_gradients_ivox.cpp +++ b/src/gtsam_points/factors/experimental/intensity_gradients_ivox.cpp @@ -4,6 +4,12 @@ #include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -44,9 +50,7 @@ void IntensityGradientsiVox::insert(const PointCloud& frame) { grads[voxel.second->serial_id] = found->second; } - // Add new points and estimate their normal and gradients -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < flat_voxels.size(); i++) { + const auto pervoxel_task = [&](int i) { const auto& voxel = flat_voxels[i]; const auto& gradients = flat_grads[i]; gradients->points.reserve(voxel.second->size()); @@ -66,6 +70,25 @@ void IntensityGradientsiVox::insert(const PointCloud& frame) { gradients->normals.push_back(normal); gradients->points.push_back(gradient); } + }; + + if (is_omp_default() || num_threads == 1) { + // Add new points and estimate their normal and gradients +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < flat_voxels.size(); i++) { + pervoxel_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, flat_voxels.size(), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + pervoxel_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } } diff --git a/src/gtsam_points/factors/intensity_gradients.cpp b/src/gtsam_points/factors/intensity_gradients.cpp index 99513409..42db2966 100644 --- a/src/gtsam_points/factors/intensity_gradients.cpp +++ b/src/gtsam_points/factors/intensity_gradients.cpp @@ -6,7 +6,14 @@ #include #include +#include #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -72,8 +79,7 @@ IntensityGradients::Ptr IntensityGradients::estimate(const PointCloud::ConstPtr& IntensityGradients::Ptr gradients(new IntensityGradients); gradients->intensity_gradients.resize(frame::size(*frame)); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame::size(*frame); i++) { + const auto perpoint_task = [&](int i) { std::vector k_indices(k_neighbors); std::vector k_sq_dists(k_neighbors); kdtree.knn_search(frame::point(*frame, i).data(), k_neighbors, k_indices.data(), k_sq_dists.data()); @@ -103,6 +109,24 @@ IntensityGradients::Ptr IntensityGradients::estimate(const PointCloud::ConstPtr& Eigen::Matrix3d H = (A.transpose() * A).block<3, 3>(0, 0); Eigen::Vector3d e = (A.transpose() * b).head<3>(); gradients->intensity_gradients[i] << H.inverse() * e, 0.0; + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame->size(); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame->size(), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } return gradients; @@ -130,8 +154,7 @@ IntensityGradients::estimate(const gtsam_points::PointCloudCPU::Ptr& frame, int const int k_neighbors = std::max(k_geom_neighbors, k_photo_neighbors); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < frame->size(); i++) { + const auto perpoint_task = [&](int i) { std::vector k_indices(k_neighbors); std::vector k_sq_dists(k_neighbors); kdtree.knn_search(frame->points[i].data(), k_neighbors, k_indices.data(), k_sq_dists.data()); @@ -195,6 +218,24 @@ IntensityGradients::estimate(const gtsam_points::PointCloudCPU::Ptr& frame, int Eigen::Matrix3d H = (A.transpose() * A).block<3, 3>(0, 0); Eigen::Vector3d e = (A.transpose() * b).head<3>(); gradients->intensity_gradients[i] << H.inverse() * e, 0.0; + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < frame->size(); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame->size(), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } return gradients; diff --git a/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp b/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp index c3726b2b..9bc0de06 100644 --- a/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp +++ b/src/gtsam_points/types/gaussian_voxelmap_cpu_funcs.cpp @@ -11,10 +11,11 @@ #include #include +#include #include #include -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA #include #endif @@ -104,7 +105,7 @@ merge_frames(const std::vector& poses, const std::vector& poses, const std::vector& frames, double downsample_resolution) { // -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (frames[0]->points_gpu && frames[0]->covs_gpu) { return merge_frames_gpu(poses, frames, downsample_resolution); } @@ -163,7 +164,7 @@ double overlap( } double overlap_auto(const GaussianVoxelMap::ConstPtr& target, const PointCloud::ConstPtr& source, const Eigen::Isometry3d& delta) { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (source->points_gpu && std::dynamic_pointer_cast(target)) { return overlap_gpu(target, source, delta); } @@ -175,7 +176,7 @@ double overlap_auto( const std::vector& targets, const PointCloud::ConstPtr& source, const std::vector& deltas) { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (source->points_gpu && !targets.empty() && std::dynamic_pointer_cast(targets[0])) { return overlap_gpu(targets, source, deltas); } diff --git a/src/gtsam_points/types/point_cloud_cpu.cpp b/src/gtsam_points/types/point_cloud_cpu.cpp index a2de90ef..c902e8b3 100644 --- a/src/gtsam_points/types/point_cloud_cpu.cpp +++ b/src/gtsam_points/types/point_cloud_cpu.cpp @@ -11,10 +11,17 @@ #include #include +#include #include #include #include #include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#include +#endif namespace gtsam_points { @@ -400,18 +407,15 @@ PointCloudCPU::Ptr voxelgrid_sampling(const PointCloud::ConstPtr& frame, const d const double inv_resolution = 1.0 / voxel_resolution; std::vector> coord_pt(frame->size()); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 32) - for (std::int64_t i = 0; i < frame->size(); i++) { + const auto calc_coord = [&](std::int64_t i) -> std::uint64_t { if (!frame->points[i].array().isFinite().all()) { - coord_pt[i] = {invalid_coord, i}; - continue; + return invalid_coord; } const Eigen::Array4i coord = fast_floor(frame->points[i] * inv_resolution) + coord_offset; if ((coord < 0).any() || (coord > coord_bit_mask).any()) { std::cerr << "warning: voxel coord is out of range!!" << std::endl; - coord_pt[i] = {invalid_coord, i}; - continue; + return invalid_coord; } // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit) @@ -419,11 +423,31 @@ PointCloudCPU::Ptr voxelgrid_sampling(const PointCloud::ConstPtr& frame, const d (static_cast(coord[0] & coord_bit_mask) << (coord_bit_size * 0)) | // (static_cast(coord[1] & coord_bit_mask) << (coord_bit_size * 1)) | // (static_cast(coord[2] & coord_bit_mask) << (coord_bit_size * 2)); - coord_pt[i] = {bits, i}; - } + return bits; + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 32) + for (std::int64_t i = 0; i < frame->size(); i++) { + coord_pt[i] = {calc_coord(i), i}; + } + // Sort by voxel coords + quick_sort_omp(coord_pt.begin(), coord_pt.end(), [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }, num_threads); + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame->size(), 32), [&](const tbb::blocked_range& range) { + for (std::int64_t i = range.begin(); i < range.end(); i++) { + coord_pt[i] = {calc_coord(i), i}; + } + }); - // Sort by voxel coords - quick_sort_omp(coord_pt.begin(), coord_pt.end(), [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }, num_threads); + // Sort by voxel coords + tbb::parallel_sort(coord_pt.begin(), coord_pt.end(), [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif + } PointCloudCPU::Ptr downsampled(new PointCloudCPU); downsampled->points_storage.resize(frame->size()); @@ -443,8 +467,7 @@ PointCloudCPU::Ptr voxelgrid_sampling(const PointCloud::ConstPtr& frame, const d const int block_size = 1024; std::atomic_uint64_t num_points = 0; -#pragma omp parallel for num_threads(num_threads) schedule(guided, 4) - for (std::int64_t block_begin = 0; block_begin < coord_pt.size(); block_begin += block_size) { + const auto perblock_task = [&](std::int64_t block_begin) { std::vector*> sub_blocks; sub_blocks.reserve(block_size); @@ -498,6 +521,25 @@ PointCloudCPU::Ptr voxelgrid_sampling(const PointCloud::ConstPtr& frame, const d downsampled->intensities_storage[point_index_begin + i] = intensity_average.average(); } } + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 4) + for (std::int64_t block_begin = 0; block_begin < coord_pt.size(); block_begin += block_size) { + perblock_task(block_begin); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + const size_t num_blocks = (coord_pt.size() + block_size - 1) / block_size; + tbb::parallel_for(tbb::blocked_range(0, num_blocks), [&](const tbb::blocked_range& range) { + for (std::int64_t block_begin = range.begin() * block_size; block_begin < range.end() * block_size; block_begin += block_size) { + perblock_task(block_begin); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } downsampled->num_points = num_points; @@ -545,19 +587,15 @@ randomgrid_sampling(const PointCloud::ConstPtr& frame, const double voxel_resolu constexpr int coord_offset = 1 << (coord_bit_size - 1); // Coordinate offset to make values positive const double inv_resolution = 1.0 / voxel_resolution; - std::vector> coord_pt(frame->size()); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 32) - for (std::int64_t i = 0; i < frame->size(); i++) { + const auto calc_coord = [&](std::int64_t i) -> std::uint64_t { if (!frame->points[i].array().isFinite().all()) { - coord_pt[i] = {invalid_coord, i}; - continue; + return invalid_coord; } const Eigen::Array4i coord = fast_floor(frame->points[i] * inv_resolution) + coord_offset; if ((coord < 0).any() || (coord > coord_bit_mask).any()) { std::cerr << "warning: voxel coord is out of range!!" << std::endl; - coord_pt[i] = {invalid_coord, i}; - continue; + return invalid_coord; } // Compute voxel coord bits (0|1bit, z|21bit, y|21bit, x|21bit) @@ -565,18 +603,54 @@ randomgrid_sampling(const PointCloud::ConstPtr& frame, const double voxel_resolu (static_cast(coord[0] & coord_bit_mask) << (coord_bit_size * 0)) | // (static_cast(coord[1] & coord_bit_mask) << (coord_bit_size * 1)) | // (static_cast(coord[2] & coord_bit_mask) << (coord_bit_size * 2)); - coord_pt[i] = {bits, i}; - } - - // Sort by voxel coords - quick_sort_omp(coord_pt.begin(), coord_pt.end(), [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }, num_threads); + return bits; + }; size_t num_voxels = 0; + std::vector> coord_pt(frame->size()); + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 32) + for (std::int64_t i = 0; i < frame->size(); i++) { + coord_pt[i] = {calc_coord(i), i}; + } + + // Sort by voxel coords + quick_sort_omp(coord_pt.begin(), coord_pt.end(), [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }, num_threads); + #pragma omp parallel for num_threads(num_threads) schedule(guided, 128) reduction(+ : num_voxels) - for (size_t i = 1; i < coord_pt.size(); i++) { - if (coord_pt[i - 1].first != coord_pt[i].first) { - num_voxels++; + for (size_t i = 1; i < coord_pt.size(); i++) { + if (coord_pt[i - 1].first != coord_pt[i].first) { + num_voxels++; + } } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame->size(), 32), [&](const tbb::blocked_range& range) { + for (std::int64_t i = range.begin(); i < range.end(); i++) { + coord_pt[i] = {calc_coord(i), i}; + } + }); + + // Sort by voxel coords + tbb::parallel_sort(coord_pt.begin(), coord_pt.end(), [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); + + std::atomic_uint64_t num_voxels_ = 0; + tbb::parallel_for(tbb::blocked_range(1, coord_pt.size(), 128), [&](const tbb::blocked_range& range) { + size_t local_num_voxels = 0; + for (size_t i = range.begin(); i < range.end(); i++) { + if (coord_pt[i - 1].first != coord_pt[i].first) { + local_num_voxels++; + } + } + num_voxels_ += local_num_voxels; + }); + + num_voxels = num_voxels_; +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } const size_t points_per_voxel = std::ceil((sampling_rate * frame->size()) / num_voxels); @@ -590,8 +664,7 @@ randomgrid_sampling(const PointCloud::ConstPtr& frame, const double voxel_resolu std::vector indices(frame->size()); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 4) - for (std::int64_t block_begin = 0; block_begin < coord_pt.size(); block_begin += block_size) { + const auto perblock_task = [&](std::int64_t block_begin) { std::vector sub_indices; sub_indices.reserve(block_size); @@ -602,7 +675,11 @@ randomgrid_sampling(const PointCloud::ConstPtr& frame, const double voxel_resolu if (block_indices.size() < points_per_voxel) { sub_indices.insert(sub_indices.end(), block_indices.begin(), block_indices.end()); } else { - std::sample(block_indices.begin(), block_indices.end(), std::back_inserter(sub_indices), points_per_voxel, mts[omp_get_thread_num()]); + int thread_num = 0; +#ifdef _OPENMP + thread_num = omp_get_thread_num(); +#endif + std::sample(block_indices.begin(), block_indices.end(), std::back_inserter(sub_indices), points_per_voxel, mts[thread_num]); } block_indices.clear(); }; @@ -620,6 +697,26 @@ randomgrid_sampling(const PointCloud::ConstPtr& frame, const double voxel_resolu flush_block_indices(); std::copy(sub_indices.begin(), sub_indices.end(), indices.begin() + num_points.fetch_add(sub_indices.size())); + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 4) + for (std::int64_t block_begin = 0; block_begin < coord_pt.size(); block_begin += block_size) { + perblock_task(block_begin); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + const size_t num_blocks = (coord_pt.size() + block_size - 1) / block_size; + tbb::parallel_for(tbb::blocked_range(0, num_blocks), [&](const tbb::blocked_range& range) { + for (std::int64_t block_begin = range.begin() * block_size; block_begin < range.end() * block_size; block_begin += block_size) { + perblock_task(block_begin); + } + }); + +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } indices.resize(num_points); @@ -803,14 +900,29 @@ PointCloudCPU::Ptr remove_outliers(const PointCloud::ConstPtr& frame, const int std::vector neighbors(frame->size() * k, -1); -#pragma omp parallel for schedule(guided, 8) num_threads(num_threads) - for (int i = 0; i < frame->size(); i++) { + const auto perpoint_task = [&](int i) { std::vector k_neighbors(k); std::vector k_sq_dists(k); - kdtree.knn_search(frame->points[i].data(), k, k_neighbors.data(), k_sq_dists.data()); - std::copy(k_neighbors.begin(), k_neighbors.end(), neighbors.begin() + i * k); + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for schedule(guided, 8) num_threads(num_threads) + for (int i = 0; i < frame->size(); i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, frame->size(), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i != range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error : TBB is not available" << std::endl; + abort(); +#endif } return remove_outliers(frame, neighbors, k, std_thresh); diff --git a/src/gtsam_points/util/covariance_estimation.cpp b/src/gtsam_points/util/covariance_estimation.cpp index d2d0f218..3b6fbade 100644 --- a/src/gtsam_points/util/covariance_estimation.cpp +++ b/src/gtsam_points/util/covariance_estimation.cpp @@ -5,7 +5,13 @@ #include #include +#include #include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { @@ -13,8 +19,7 @@ std::vector estimate_covariances(const Eigen::Vector4d* points, KdTree tree(points, num_points, params.num_threads); std::vector covs(num_points); -#pragma omp parallel for num_threads(params.num_threads) schedule(guided, 8) - for (int i = 0; i < num_points; i++) { + const auto perpoint_task = [&](int i) { std::vector k_indices(params.k_neighbors); std::vector k_sq_dists(params.k_neighbors); size_t num_found = tree.knn_search(points[i].data(), params.k_neighbors, &k_indices[0], &k_sq_dists[0]); @@ -22,7 +27,7 @@ std::vector estimate_covariances(const Eigen::Vector4d* points, if (num_found < params.k_neighbors) { std::cerr << "warning: fewer than k neighbors found for point " << i << std::endl; covs[i].setIdentity(); - continue; + return; } Eigen::Vector4d sum_points = Eigen::Vector4d::Zero(); @@ -48,6 +53,24 @@ std::vector estimate_covariances(const Eigen::Vector4d* points, covs[i].block<3, 3>(0, 0) = eig.eigenvectors() * params.eigen_values.asDiagonal() * eig.eigenvectors().inverse(); } break; } + }; + + if (is_omp_default() || params.num_threads == 1) { +#pragma omp parallel for num_threads(params.num_threads) schedule(guided, 8) + for (int i = 0; i < num_points; i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, num_points, 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error: TBB is not available" << std::endl; + abort(); +#endif } return covs; diff --git a/src/gtsam_points/util/normal_estimation.cpp b/src/gtsam_points/util/normal_estimation.cpp index 9ef00f35..d7688e33 100644 --- a/src/gtsam_points/util/normal_estimation.cpp +++ b/src/gtsam_points/util/normal_estimation.cpp @@ -3,16 +3,22 @@ #include +#include #include +#include #include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif namespace gtsam_points { std::vector estimate_normals(const Eigen::Vector4d* points, const Eigen::Matrix4d* covs, int num_points, int num_threads) { std::vector normals(num_points, Eigen::Vector4d::Zero()); -#pragma omp parallel for num_threads(num_threads) - for (int i = 0; i < num_points; i++) { + const auto perpoint_task = [&](int i) { Eigen::SelfAdjointEigenSolver eig; eig.computeDirect(covs[i].block<3, 3>(0, 0)); normals[i].head<3>() = eig.eigenvectors().col(0); @@ -20,6 +26,24 @@ std::vector estimate_normals(const Eigen::Vector4d* points, con if (points[i].dot(normals[i]) > 1.0) { normals[i] = -normals[i]; } + }; + + if (is_omp_default() || num_threads == 1) { +#pragma omp parallel for num_threads(num_threads) + for (int i = 0; i < num_points; i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, num_points, 64), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error : TBB is not enabled" << std::endl; + abort(); +#endif } return normals; diff --git a/src/gtsam_points/util/parallelism.cpp b/src/gtsam_points/util/parallelism.cpp new file mode 100644 index 00000000..4c39e9eb --- /dev/null +++ b/src/gtsam_points/util/parallelism.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024 Kenji Koide (k.koide@aist.go.jp) +#include + +#include +#include + +namespace gtsam_points { + +ParallelismBackend default_parallelism = ParallelismBackend::OMP; + +void set_tbb_as_default() { + default_parallelism = ParallelismBackend::TBB; +} + +void set_omp_as_default() { + default_parallelism = ParallelismBackend::OMP; +} + +void set_default_parallelism(ParallelismBackend parallelism) { +#ifndef _OPENMP + if (parallelism == ParallelismBackend::OMP) { + throw std::runtime_error("OpenMP is not available"); + } +#endif + +#ifndef GTSAM_POINTS_USE_TBB + if (parallelism == ParallelismBackend::TBB) { + std::cerr << "warning: Intel TBB is not available" << std::endl; + return; + } +#endif + + default_parallelism = parallelism; +} + +bool is_tbb_default() { + return default_parallelism == ParallelismBackend::TBB; +} + +bool is_omp_default() { + return default_parallelism == ParallelismBackend::OMP; +} + +ParallelismBackend get_default_parallelism() { + return default_parallelism; +} + +} // namespace gtsam_points diff --git a/src/test/include/validate_frame.hpp b/src/test/include/validate_frame.hpp index 29cd990d..e90985fc 100644 --- a/src/test/include/validate_frame.hpp +++ b/src/test/include/validate_frame.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include void validate_frame(const gtsam_points::PointCloud::ConstPtr& frame) { @@ -39,7 +40,7 @@ void validate_frame(const gtsam_points::PointCloud::ConstPtr& frame) { } } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA void validate_frame_gpu(const gtsam_points::PointCloud::ConstPtr& frame) { if (frame->points_gpu) { const auto points = gtsam_points::download_points_gpu(*frame); diff --git a/src/test/test_colored_gicp.cpp b/src/test/test_colored_gicp.cpp index 7b16446d..1cc13f6f 100644 --- a/src/test/test_colored_gicp.cpp +++ b/src/test/test_colored_gicp.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -104,16 +105,35 @@ struct ColoredGICPTestBase : public testing::Test { std::vector source_points; }; -TEST_F(ColoredGICPTestBase, Check) { +class ColoredGICPTest : public ColoredGICPTestBase, public testing::WithParamInterface> {}; + +INSTANTIATE_TEST_SUITE_P( + gtsam_points, + ColoredGICPTest, + testing::Combine(testing::Values("GICP"), testing::Values("NONE", "OMP", "TBB")), + [](const auto& info) { return std::get<0>(info.param) + "_" + std::get<1>(info.param); }); + +TEST_P(ColoredGICPTest, AlignmentTest) { + const auto param = GetParam(); + const std::string method = std::get<0>(param); + const std::string parallelism = std::get<1>(param); + const int num_threads = parallelism == "NONE" ? 1 : 2; + + if (parallelism == "TBB") { + gtsam_points::set_tbb_as_default(); + } else { + gtsam_points::set_omp_as_default(); + } + gtsam_points::PointCloudCPU::Ptr target(new gtsam_points::PointCloudCPU(target_points)); target->add_intensities(target_intensities); - auto target_gradients = gtsam_points::IntensityGradients::estimate(target, 10, 50, 1); + auto target_gradients = gtsam_points::IntensityGradients::estimate(target, 10, 50, num_threads); EXPECT_NE(target->normals, nullptr); EXPECT_NE(target->covs, nullptr); gtsam_points::PointCloud::Ptr target_ = target; - auto target_gradients2 = gtsam_points::IntensityGradients::estimate(target_, 50, 1); + auto target_gradients2 = gtsam_points::IntensityGradients::estimate(target_, 50, num_threads); gtsam_points::PointCloudCPU::Ptr source(new gtsam_points::PointCloudCPU(source_points)); source->add_intensities(source_intensities); @@ -123,13 +143,17 @@ TEST_F(ColoredGICPTestBase, Check) { std::shared_ptr target_intensity_tree( new gtsam_points::IntensityKdTree(target->points, target->intensities, target->size())); - test_factor(gtsam::make_shared(0, 1, target, source, target_tree, target_gradients), "DEFAULT"); - test_factor( - gtsam::make_shared(0, 1, target, source, target_intensity_tree, target_gradients), - "ESTIMATE_PHOTO_AND_GEOM"); - test_factor( - gtsam::make_shared(0, 1, target, source, target_intensity_tree, target_gradients2), - "ESTIMATE_PHOTO_ONLY"); + auto f1 = gtsam::make_shared(0, 1, target, source, target_tree, target_gradients); + f1->set_num_threads(num_threads); + test_factor(f1, "DEFAULT"); + + auto f2 = gtsam::make_shared(0, 1, target, source, target_intensity_tree, target_gradients); + f2->set_num_threads(num_threads); + test_factor(f2, "ESTIMATE_PHOTO_AND_GEOM"); + + auto f3 = gtsam::make_shared(0, 1, target, source, target_intensity_tree, target_gradients2); + f3->set_num_threads(num_threads); + test_factor(f3, "ESTIMATE_PHOTO_ONLY"); } int main(int argc, char** argv) { diff --git a/src/test/test_continuous_time.cpp b/src/test/test_continuous_time.cpp index 264eabe0..f8213258 100644 --- a/src/test/test_continuous_time.cpp +++ b/src/test/test_continuous_time.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,7 @@ TEST_F(ContinuousTimeFactorsTestBase, LoadCheck) { ASSERT_EQ(deskewed_target_frames.size(), 3) << "Failed to load target frames"; } -struct ContinuousTimeFactorTest : public ContinuousTimeFactorsTestBase, public testing::WithParamInterface { +struct ContinuousTimeFactorTest : public ContinuousTimeFactorsTestBase, public testing::WithParamInterface> { public: double pointcloud_distance(const gtsam_points::PointCloud::ConstPtr& frame1, const gtsam_points::PointCloud::ConstPtr& frame2) { gtsam_points::KdTree tree(frame2->points, frame2->size()); @@ -72,9 +73,24 @@ struct ContinuousTimeFactorTest : public ContinuousTimeFactorsTestBase, public t } }; -INSTANTIATE_TEST_SUITE_P(gtsam_points, ContinuousTimeFactorTest, testing::Values("CTICP", "CTGICP"), [](const auto& info) { return info.param; }); +INSTANTIATE_TEST_SUITE_P( + gtsam_points, + ContinuousTimeFactorTest, + testing::Combine(testing::Values("CTICP", "CTGICP"), testing::Values("NONE", "OMP", "TBB")), + [](const auto& info) { return std::get<0>(info.param) + "_" + std::get<1>(info.param); }); TEST_P(ContinuousTimeFactorTest, AlignmentTest) { + const auto param = GetParam(); + const std::string method = std::get<0>(param); + const std::string parallelism = std::get<1>(param); + const int num_threads = parallelism == "NONE" ? 1 : 2; + + if (parallelism == "TBB") { + gtsam_points::set_tbb_as_default(); + } else { + gtsam_points::set_omp_as_default(); + } + for (int i = 0; i < 3; i++) { gtsam::Values values; values.insert(0, gtsam::Pose3::Identity()); @@ -84,10 +100,14 @@ TEST_P(ContinuousTimeFactorTest, AlignmentTest) { const auto source = raw_source_frames[i]; gtsam_points::IntegratedCT_ICPFactor::shared_ptr factor; - if (GetParam() == "CTICP") { - factor.reset(new gtsam_points::IntegratedCT_ICPFactor(0, 1, target, source)); - } else if (GetParam() == "CTGICP") { - factor.reset(new gtsam_points::IntegratedCT_GICPFactor(0, 1, target, source)); + if (method == "CTICP") { + auto f = gtsam::make_shared(0, 1, target, source); + f->set_num_threads(num_threads); + factor = f; + } else if (method == "CTGICP") { + auto f = gtsam::make_shared(0, 1, target, source); + f->set_num_threads(num_threads); + factor = f; } gtsam::NonlinearFactorGraph graph; diff --git a/src/test/test_kdtree.cpp b/src/test/test_kdtree.cpp index 427ab6c5..55439f58 100644 --- a/src/test/test_kdtree.cpp +++ b/src/test/test_kdtree.cpp @@ -8,6 +8,7 @@ #include #include #include +#include class KdTreeTest : public testing::Test, public testing::WithParamInterface { virtual void SetUp() { @@ -65,23 +66,36 @@ TEST_F(KdTreeTest, LoadCheck) { ASSERT_EQ(gt_sq_dists.size(), queries.size()); } -INSTANTIATE_TEST_SUITE_P(gtsam_points, KdTreeTest, testing::Values("KdTree", "KdTreeMT", "KdTree2", "KdTree2MT"), [](const auto& info) { - return info.param; -}); +INSTANTIATE_TEST_SUITE_P( + gtsam_points, + KdTreeTest, + testing::Values("KdTree", "KdTreeMT", "KdTreeTBB", "KdTree2", "KdTree2MT", "KdTree2TBB"), + [](const auto& info) { return info.param; }); TEST_P(KdTreeTest, KdTreeTest) { gtsam_points::NearestNeighborSearch::ConstPtr kdtree; + if (GetParam().find("TBB") != std::string::npos) { + gtsam_points::set_tbb_as_default(); + } else { + gtsam_points::set_omp_as_default(); + } + if (GetParam() == "KdTree") { kdtree = std::make_shared(points.data(), points.size()); } else if (GetParam() == "KdTreeMT") { kdtree = std::make_shared(points.data(), points.size(), 2); + } else if (GetParam() == "KdTreeTBB") { + kdtree = std::make_shared(points.data(), points.size(), 2); } else if (GetParam() == "KdTree2") { auto pts = std::make_shared(points); kdtree = std::make_shared>(pts); } else if (GetParam() == "KdTree2MT") { auto pts = std::make_shared(points); kdtree = std::make_shared>(pts, 2); + } else if (GetParam() == "KdTree2TBB") { + auto pts = std::make_shared(points); + kdtree = std::make_shared>(pts, 2); } else { FAIL() << "Unknown KdTree type: " << GetParam(); } diff --git a/src/test/test_loam_factors.cpp b/src/test/test_loam_factors.cpp index 8d1f4992..e7074e2a 100644 --- a/src/test/test_loam_factors.cpp +++ b/src/test/test_loam_factors.cpp @@ -16,6 +16,7 @@ #include #include #include +#include struct LOAMTestBase : public testing::Test { virtual void SetUp() { @@ -67,7 +68,7 @@ TEST_F(LOAMTestBase, LoadCheck) { EXPECT_EQ(poses_gt.size(), 5) << "Failed to load GT poses"; } -class LOAMFactorTest : public LOAMTestBase, public testing::WithParamInterface { +class LOAMFactorTest : public LOAMTestBase, public testing::WithParamInterface> { public: gtsam::NonlinearFactor::shared_ptr create_factor( gtsam::Key target_key, @@ -76,15 +77,25 @@ class LOAMFactorTest : public LOAMTestBase, public testing::WithParamInterface(param); + const std::string parallelism = std::get<1>(param); + const int num_threads = parallelism == "NONE" ? 1 : 2; gtsam::NonlinearFactor::shared_ptr factor; if (method == "LOAM") { - factor.reset(new gtsam_points::IntegratedLOAMFactor(target_key, source_key, target_edges, target_planes, source_edges, source_planes)); + auto f = + gtsam::make_shared(target_key, source_key, target_edges, target_planes, source_edges, source_planes); + f->set_num_threads(num_threads); + factor = f; } else if (method == "EDGE") { - factor.reset(new gtsam_points::IntegratedPointToEdgeFactor(target_key, source_key, target_edges, source_edges)); + auto f = gtsam::make_shared(target_key, source_key, target_edges, source_edges); + f->set_num_threads(num_threads); + factor = f; } else if (method == "PLANE") { - factor.reset(new gtsam_points::IntegratedPointToPlaneFactor(target_key, source_key, target_planes, source_planes)); + auto f = gtsam::make_shared(target_key, source_key, target_planes, source_planes); + f->set_num_threads(num_threads); + factor = f; } return factor; @@ -120,12 +131,26 @@ class LOAMFactorTest : public LOAMTestBase, public testing::WithParamInterface(info.param) + "_" + std::get<1>(info.param); }); TEST_P(LOAMFactorTest, AlignmentTest) { + const auto param = GetParam(); + const std::string method = std::get<0>(param); + const std::string parallelism = std::get<1>(param); + + if (parallelism == "TBB") { + gtsam_points::set_tbb_as_default(); + } else { + gtsam_points::set_omp_as_default(); + } + auto f = create_factor(0, 1, edge_frames[0], plane_frames[0], edge_frames[1], plane_frames[1]); if (f == nullptr) { - std::cerr << "[ ] SKIP:" << GetParam() << std::endl; + std::cerr << "[ ] SKIP:" << std::get<0>(GetParam()) << std::endl; return; } diff --git a/src/test/test_matching_cost_factors.cpp b/src/test/test_matching_cost_factors.cpp index 43196a7c..29d935af 100644 --- a/src/test/test_matching_cost_factors.cpp +++ b/src/test/test_matching_cost_factors.cpp @@ -12,8 +12,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -75,7 +77,7 @@ struct MatchingCostFactorsTestBase : public testing::Test { frame->add_covs(covs); frames.push_back(frame); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA frames.back() = gtsam_points::PointCloudGPU::clone(*frames.back()); #endif @@ -83,7 +85,7 @@ struct MatchingCostFactorsTestBase : public testing::Test { voxelmap->insert(*frames.back()); voxelmaps.push_back(voxelmap); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto voxelmap_gpu = std::make_shared(1.0); voxelmap_gpu->insert(*frames.back()); voxelmaps_gpu.push_back(voxelmap_gpu); @@ -92,7 +94,7 @@ struct MatchingCostFactorsTestBase : public testing::Test { #endif } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA gtsam_points::LinearizationHook::register_hook([] { return gtsam_points::create_nonlinear_factor_set_gpu(); }); stream_buffer_roundrobin.reset(new gtsam_points::StreamTempBufferRoundRobin(32)); #endif @@ -104,7 +106,7 @@ struct MatchingCostFactorsTestBase : public testing::Test { gtsam::Values poses; gtsam::Values poses_gt; -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA std::unique_ptr stream_buffer_roundrobin; #endif }; @@ -114,7 +116,7 @@ TEST_F(MatchingCostFactorsTestBase, LoadCheck) { ASSERT_EQ(poses_gt.size(), 5) << "Failed to load submap poses"; } -class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testing::WithParamInterface { +class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testing::WithParamInterface> { public: gtsam::NonlinearFactor::shared_ptr create_factor( gtsam::Key target_key, @@ -123,17 +125,27 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin const gtsam_points::GaussianVoxelMap::ConstPtr& target_voxelmap, const gtsam_points::GaussianVoxelMap::ConstPtr& target_voxelmap_gpu, const gtsam_points::PointCloud::ConstPtr& source) { - std::string method = GetParam(); + const auto param = GetParam(); + std::string method = std::get<0>(param); + std::string parallelism = std::get<1>(param); + + const int num_threads = parallelism == "NONE" ? 1 : 2; gtsam::NonlinearFactor::shared_ptr factor; if (method == "ICP") { - factor.reset(new gtsam_points::IntegratedICPFactor(target_key, source_key, target, source)); + auto f = gtsam::make_shared(target_key, source_key, target, source); + f->set_num_threads(num_threads); + factor = f; } else if (method == "GICP") { - factor.reset(new gtsam_points::IntegratedGICPFactor(target_key, source_key, target, source)); + auto f = gtsam::make_shared(target_key, source_key, target, source); + f->set_num_threads(num_threads); + factor = f; } else if (method == "VGICP") { - factor.reset(new gtsam_points::IntegratedVGICPFactor(target_key, source_key, target_voxelmap, source)); + auto f = gtsam::make_shared(target_key, source_key, target_voxelmap, source); + f->set_num_threads(num_threads); + factor = f; } else if (method == "VGICP_CUDA") { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer(); const auto& stream = stream_buffer.first; const auto& buffer = stream_buffer.second; @@ -151,17 +163,26 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin const gtsam_points::GaussianVoxelMap::ConstPtr& target_voxelmap, const gtsam_points::GaussianVoxelMap::ConstPtr& target_voxelmap_gpu, const gtsam_points::PointCloud::ConstPtr& source) { - std::string method = GetParam(); + const auto param = GetParam(); + const std::string method = std::get<0>(param); + const std::string parallelism = std::get<1>(param); + const int num_threads = parallelism == "OMP" ? 2 : 1; gtsam::NonlinearFactor::shared_ptr factor; if (method == "ICP") { - factor.reset(new gtsam_points::IntegratedICPFactor(fixed_target_pose, source_key, target, source)); + auto f = gtsam::make_shared(fixed_target_pose, source_key, target, source); + f->set_num_threads(num_threads); + factor = f; } else if (method == "GICP") { - factor.reset(new gtsam_points::IntegratedGICPFactor(fixed_target_pose, source_key, target, source)); + auto f = gtsam::make_shared(fixed_target_pose, source_key, target, source); + f->set_num_threads(num_threads); + factor = f; } else if (method == "VGICP") { - factor.reset(new gtsam_points::IntegratedVGICPFactor(fixed_target_pose, source_key, target_voxelmap, source)); + auto f = gtsam::make_shared(fixed_target_pose, source_key, target_voxelmap, source); + f->set_num_threads(num_threads); + factor = f; } else if (method == "VGICP_CUDA") { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer(); const auto& stream = stream_buffer.first; const auto& buffer = stream_buffer.second; @@ -209,14 +230,33 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin } }; -INSTANTIATE_TEST_SUITE_P(gtsam_points, MatchingCostFactorTest, testing::Values("ICP", "GICP", "VGICP", "VGICP_CUDA"), [](const auto& info) { - return info.param; -}); +INSTANTIATE_TEST_SUITE_P( + gtsam_points, + MatchingCostFactorTest, + testing::Combine( + testing::Values("ICP", "GICP", "VGICP", "VGICP_CUDA"), +#ifdef GTSAM_POINTS_USE_TBB + testing::Values("NONE", "OMP", "TBB") +#else + testing::Values("NONE", "OMP") +#endif + ), + [](const auto& info) { return std::get<0>(info.param) + "_" + std::get<1>(info.param); }); TEST_P(MatchingCostFactorTest, AlignmentTest) { + const auto param = GetParam(); + const auto method = std::get<0>(param); + const auto parallelism = std::get<1>(param); + + if (parallelism == "TBB") { + gtsam_points::set_tbb_as_default(); + } else { + gtsam_points::set_omp_as_default(); + } + auto f = create_factor(0, 1, frames[0], voxelmaps[0], voxelmaps_gpu[0], frames[1]); if (f == nullptr) { - std::cerr << "[ ] SKIP:" << GetParam() << std::endl; + std::cerr << "[ ] SKIP:" << method + "_" + parallelism << std::endl; return; } diff --git a/src/test/test_types.cpp b/src/test/test_types.cpp index 576e587d..a921f58f 100644 --- a/src/test/test_types.cpp +++ b/src/test/test_types.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -97,7 +98,7 @@ TEST(TestTypes, TestPointCloudCPU) { creation_test(); } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA template void creation_test_gpu() { diff --git a/src/test/test_voxelmap.cpp b/src/test/test_voxelmap.cpp index 8db150c2..70f8263e 100644 --- a/src/test/test_voxelmap.cpp +++ b/src/test/test_voxelmap.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,7 @@ struct VoxelMapTestBase : public testing::Test { frame->add_covs(gtsam_points::estimate_covariances(frame->points, frame->size())); frames.push_back(frame); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA frames.back() = gtsam_points::PointCloudGPU::clone(*frames.back()); #endif @@ -65,7 +66,7 @@ struct VoxelMapTestBase : public testing::Test { voxelmap->insert(*frames.back()); voxelmaps.push_back(voxelmap); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto voxelmap_gpu = std::make_shared(1.0); voxelmap_gpu->insert(*frames.back()); voxelmaps_gpu.push_back(voxelmap_gpu); @@ -163,7 +164,7 @@ TEST_F(VoxelMapTestBase, VoxelMapCPU) { validate_frame(merged2); } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA TEST_F(VoxelMapTestBase, VoxelMapGPU) { for (int i = 0; i < frames.size(); i++) {