diff --git a/CMakeLists.txt b/CMakeLists.txt index a5d0b1f2..78eb3ef0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,9 @@ endif() # GPU-related if(BUILD_WITH_CUDA) - add_definitions(-DBUILD_GTSAM_POINTS_GPU) + if(NOT GTSAM_POINTS_USE_CUDA) + message(FATAL_ERROR "gtsam_points is not built with CUDA!!") + endif() set(GPU_SRCS src/glim/odometry/odometry_estimation_gpu.cpp) endif() diff --git a/include/glim/preprocess/cloud_preprocessor.hpp b/include/glim/preprocess/cloud_preprocessor.hpp index 7fb21f4b..43274a17 100644 --- a/include/glim/preprocess/cloud_preprocessor.hpp +++ b/include/glim/preprocess/cloud_preprocessor.hpp @@ -57,6 +57,7 @@ class CloudPreprocessor { virtual PreprocessedFrame::Ptr preprocess(const RawPoints::ConstPtr& raw_points); private: + PreprocessedFrame::Ptr preprocess_impl(const RawPoints::ConstPtr& raw_points); std::vector find_neighbors(const Eigen::Vector4d* points, const int num_points, const int k) const; private: @@ -64,6 +65,8 @@ class CloudPreprocessor { Params params; mutable std::mt19937 mt; + + std::shared_ptr tbb_task_arena; }; } // namespace glim \ No newline at end of file diff --git a/src/glim/common/cloud_covariance_estimation.cpp b/src/glim/common/cloud_covariance_estimation.cpp index dfe24ba6..6d168c01 100644 --- a/src/glim/common/cloud_covariance_estimation.cpp +++ b/src/glim/common/cloud_covariance_estimation.cpp @@ -8,6 +8,13 @@ #include +#include +#include + +#ifdef GTSAM_POINTS_USE_TBB +#include +#endif + namespace glim { CloudCovarianceEstimation::CloudCovarianceEstimation(const int num_threads) : regularization_method(RegularizationMethod::PLANE), num_threads(num_threads) {} @@ -47,18 +54,30 @@ void CloudCovarianceEstimation::estimate( assert(k_correspondences * points.size() == neighbors.size()); assert(k_neighbors <= k_correspondences); + // Precompute pt * pt.transpose() std::vector pt_cross(points.size()); -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < points.size(); i++) { - pt_cross[i] = points[i] * points[i].transpose(); + if (gtsam_points::is_omp_default()) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 64) + for (int i = 0; i < points.size(); i++) { + pt_cross[i] = points[i] * points[i].transpose(); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, points.size(), 64), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + pt_cross[i] = points[i] * points[i].transpose(); + } + }); +#else + std::cerr << "error : TBB is not enabled" << std::endl; + abort(); +#endif } normals.resize(points.size()); covs.resize(points.size()); - // Calculate covariances -#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) - for (int i = 0; i < points.size(); i++) { + const auto calc_cov = [&](int i) { Eigen::Vector4d sum_points = Eigen::Vector4d::Zero(); Eigen::Matrix4d sum_cross = Eigen::Matrix4d::Zero(); @@ -80,6 +99,25 @@ void CloudCovarianceEstimation::estimate( if (points[i].dot(normals[i]) > 0.0) { normals[i] = -normals[i]; } + }; + + // Calculate covariances + if (gtsam_points::is_omp_default()) { +#pragma omp parallel for num_threads(num_threads) schedule(guided, 8) + for (int i = 0; i < points.size(); i++) { + calc_cov(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, points.size(), 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + calc_cov(i); + } + }); +#else + std::cerr << "error : TBB is not enabled" << std::endl; + abort(); +#endif } } diff --git a/src/glim/mapping/global_mapping.cpp b/src/glim/mapping/global_mapping.cpp index 2529e116..2913ab6c 100644 --- a/src/glim/mapping/global_mapping.cpp +++ b/src/glim/mapping/global_mapping.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ GlobalMappingParams::GlobalMappingParams() { GlobalMappingParams::~GlobalMappingParams() {} GlobalMapping::GlobalMapping(const GlobalMappingParams& params) : params(params) { -#ifndef BUILD_GTSAM_POINTS_GPU +#ifndef GTSAM_POINTS_USE_CUDA if (params.enable_gpu) { logger->error("GPU-based factors cannot be used because GLIM is built without GPU option!!"); } @@ -97,7 +98,7 @@ GlobalMapping::GlobalMapping(const GlobalMappingParams& params) : params(params) isam2.reset(new gtsam_points::ISAM2ExtDummy(isam2_params)); } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA stream_buffer_roundrobin = std::make_shared(64); #endif @@ -233,7 +234,7 @@ void GlobalMapping::insert_submap(int current, const SubMap::Ptr& submap) { subsampled_submap = gtsam_points::random_sampling(submap->frame, params.randomsampling_rate, mt); } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (params.enable_gpu && !submap->frame->points_gpu) { submap->frame = gtsam_points::PointCloudGPU::clone(*submap->frame); } @@ -309,7 +310,7 @@ void GlobalMapping::find_overlapping_submaps(double min_overlap) { if (false) { } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA else if (std::dynamic_pointer_cast(submaps[i]->voxelmaps.back()) && subsampled_submaps[j]->points_gpu) { const auto stream_buffer = std::any_cast>(stream_buffer_roundrobin)->get_stream_buffer(); const auto& stream = stream_buffer.first; @@ -431,7 +432,7 @@ boost::shared_ptr GlobalMapping::create_matching_co factors->emplace_shared(X(i), X(current), voxelmap, subsampled_submaps[current]); } } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA else if (params.registration_error_factor_type == "VGICP_GPU") { const auto stream_buffer = std::any_cast>(stream_buffer_roundrobin)->get_stream_buffer(); const auto& stream = stream_buffer.first; @@ -480,7 +481,7 @@ void GlobalMapping::save(const std::string& path) { for (const auto& factor : isam2->getFactorsUnsafe()) { bool serializable = !boost::dynamic_pointer_cast(factor) -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA && !boost::dynamic_pointer_cast(factor) #endif ; @@ -513,7 +514,7 @@ void GlobalMapping::save(const std::string& path) { } else if (boost::dynamic_pointer_cast(factor.second)) { type = "vgicp"; } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA else if (boost::dynamic_pointer_cast(factor.second)) { type = "vgicp_gpu"; } @@ -612,7 +613,7 @@ bool GlobalMapping::load(const std::string& path) { subsampled_submaps[i] = subsampled_submap; if (params.enable_gpu) { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA subsampled_submaps[i] = gtsam_points::PointCloudGPU::clone(*subsampled_submaps[i]); for (int j = 0; j < params.submap_voxelmap_levels; j++) { @@ -662,7 +663,7 @@ bool GlobalMapping::load(const std::string& path) { if (type == "vgicp" || type == "vgicp_gpu") { if (params.enable_gpu) { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA const auto stream_buffer = std::any_cast>(stream_buffer_roundrobin)->get_stream_buffer(); const auto& stream = stream_buffer.first; const auto& buffer = stream_buffer.second; diff --git a/src/glim/mapping/global_mapping_pose_graph.cpp b/src/glim/mapping/global_mapping_pose_graph.cpp index 15bb4229..47e2e514 100644 --- a/src/glim/mapping/global_mapping_pose_graph.cpp +++ b/src/glim/mapping/global_mapping_pose_graph.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -370,14 +370,12 @@ void GlobalMappingPoseGraph::loop_detection_task() { candidates_buffer.erase(candidates_buffer.begin(), candidates_buffer.begin() + eval_count); } - std::vector inlier_fractions(candidates.size()); + std::vector inlier_fractions(candidates.size(), 0.0); std::vector T_target_source(candidates.size()); - // Evaluate loop candidates in parallel. -#pragma omp parallel for num_threads(params.num_threads) schedule(dynamic) - for (int i = 0; i < candidates.size(); i++) { + const auto evaluate_candidate = [&](int i) { if (kill_switch) { - continue; + return; } const auto candidate = candidates[i]; @@ -399,16 +397,7 @@ void GlobalMappingPoseGraph::loop_detection_task() { gtsam_points::LevenbergMarquardtExtParams lm_params; lm_params.setMaxIterations(10); - -#ifdef GTSAM_USE_TBB - auto arena = static_cast(tbb_task_arena.get()); - arena->execute([&] { -#endif - values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize(); - -#ifdef GTSAM_USE_TBB - }); -#endif + values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize(); error = factor->error(values); inlier_fraction = factor->inlier_fraction(); @@ -421,28 +410,47 @@ void GlobalMappingPoseGraph::loop_detection_task() { gtsam_points::LevenbergMarquardtExtParams lm_params; lm_params.setMaxIterations(10); -#ifdef GTSAM_USE_TBB - auto arena = static_cast(tbb_task_arena.get()); - arena->execute([&] { -#endif - values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize(); - -#ifdef GTSAM_USE_TBB - }); -#endif + values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize(); error = factor->error(values); inlier_fraction = factor->inlier_fraction(); } else { logger->warn("unknown registration type: {}", params.registration_type); - continue; + return; } logger->debug("target={}, source={}, error={}, inlier_fraction={}", target->submap->id, source->submap->id, error, inlier_fraction); inlier_fractions[i] = inlier_fraction; T_target_source[i] = values.at(0); - } + }; + + // Evaluate loop candidates in parallel. +#ifdef GTSAM_USE_TBB + auto arena = static_cast(tbb_task_arena.get()); + arena->execute([&] { +#endif + if (gtsam_points::is_omp_default()) { +#pragma omp parallel for num_threads(params.num_threads) schedule(dynamic) + for (int i = 0; i < candidates.size(); i++) { + evaluate_candidate(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, candidates.size(), 2), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + evaluate_candidate(i); + } + }); +#else + std::cerr << "error : TBB is not enabled" << std::endl; + abort(); +#endif + } + +#ifdef GTSAM_USE_TBB + }); +#endif // Check the matching results. std::vector factors; diff --git a/src/glim/mapping/sub_mapping.cpp b/src/glim/mapping/sub_mapping.cpp index d097f54b..4c8ea840 100644 --- a/src/glim/mapping/sub_mapping.cpp +++ b/src/glim/mapping/sub_mapping.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -81,7 +82,7 @@ SubMapping::SubMapping(const SubMappingParams& params) : params(params) { values.reset(new gtsam::Values); graph.reset(new gtsam::NonlinearFactorGraph); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA stream = std::make_shared(); stream_buffer_roundrobin = std::make_shared(8); #endif @@ -159,7 +160,7 @@ void SubMapping::insert_frame(const EstimationFrame::ConstPtr& odom_frame_) { } } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (params.enable_gpu && !odom_frame->frame->points_gpu) { if (params.enable_gpu) { auto stream = std::static_pointer_cast(this->stream); @@ -289,7 +290,7 @@ void SubMapping::insert_frame(const EstimationFrame::ConstPtr& odom_frame_) { graph->emplace_shared(X(keyframe_indices[i]), X(current), voxelmap, keyframes.back()->frame); } } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA else if (params.registration_error_factor_type == "VGICP_GPU") { auto roundrobin = std::static_pointer_cast(stream_buffer_roundrobin); auto stream_buffer = roundrobin->get_stream_buffer(); @@ -379,7 +380,7 @@ void SubMapping::insert_keyframe(const int current, const EstimationFrame::Const *keyframe = *odom_frame; if (params.enable_gpu) { -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA auto stream = std::static_pointer_cast(this->stream); keyframe->frame = gtsam_points::PointCloudGPU::clone(*subsampled_frame, *stream); keyframe->voxelmaps.clear(); @@ -477,7 +478,7 @@ SubMap::Ptr SubMapping::create_submap(bool force_create) const { } // TODO: improve merging process -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (params.enable_gpu) { // submap->frame = gtsam_points::merge_frames_gpu(poses_to_merge, keyframes_to_merge, submap_downsample_resolution); } diff --git a/src/glim/preprocess/cloud_preprocessor.cpp b/src/glim/preprocess/cloud_preprocessor.cpp index 63a63d16..c1bb891f 100644 --- a/src/glim/preprocess/cloud_preprocessor.cpp +++ b/src/glim/preprocess/cloud_preprocessor.cpp @@ -3,11 +3,18 @@ #include #include #include +#include #include #include +#include #include +#ifdef GTSAM_POINTS_USE_TBB +#include +#include +#endif + namespace glim { CloudPreprocessorParams::CloudPreprocessorParams() { @@ -28,16 +35,38 @@ CloudPreprocessorParams::CloudPreprocessorParams() { k_correspondences = config.param("preprocess", "k_correspondences", 8); - num_threads = config.param("preprocess", "num_threads", 10); + num_threads = config.param("preprocess", "num_threads", 2); } CloudPreprocessorParams::~CloudPreprocessorParams() {} -CloudPreprocessor::CloudPreprocessor(const CloudPreprocessorParams& params) : params(params) {} +CloudPreprocessor::CloudPreprocessor(const CloudPreprocessorParams& params) : params(params) { +#ifdef GTSAM_POINTS_USE_TBB + if (gtsam_points::is_tbb_default()) { + tbb_task_arena.reset(new tbb::task_arena(params.num_threads)); + } +#endif +} CloudPreprocessor::~CloudPreprocessor() {} PreprocessedFrame::Ptr CloudPreprocessor::preprocess(const RawPoints::ConstPtr& raw_points) { + if (gtsam_points::is_omp_default() || params.num_threads == 1 || !tbb_task_arena) { + return preprocess_impl(raw_points); + } + + PreprocessedFrame::Ptr preprocessed; +#ifdef GTSAM_POINTS_USE_TBB + auto arena = static_cast(tbb_task_arena.get()); + arena->execute([&] { preprocessed = preprocess_impl(raw_points); }); +#else + std::cerr << "error : TBB is not enabled" << std::endl; + abort(); +#endif + return preprocessed; +} + +PreprocessedFrame::Ptr CloudPreprocessor::preprocess_impl(const RawPoints::ConstPtr& raw_points) { spdlog::trace("preprocessing input: {} points", raw_points->size()); gtsam_points::PointCloud::Ptr frame(new gtsam_points::PointCloud); @@ -112,12 +141,29 @@ std::vector CloudPreprocessor::find_neighbors(const Eigen::Vector4d* points std::vector neighbors(num_points * k); -#pragma omp parallel for num_threads(params.num_threads) schedule(guided, 8) - for (int i = 0; i < num_points; i++) { + const auto perpoint_task = [&](int i) { std::vector k_indices(k); std::vector k_sq_dists(k); tree.knn_search(points[i].data(), k, k_indices.data(), k_sq_dists.data()); std::copy(k_indices.begin(), k_indices.end(), neighbors.begin() + i * k); + }; + + if (gtsam_points::is_omp_default()) { +#pragma omp parallel for num_threads(params.num_threads) schedule(guided, 8) + for (int i = 0; i < num_points; i++) { + perpoint_task(i); + } + } else { +#ifdef GTSAM_POINTS_USE_TBB + tbb::parallel_for(tbb::blocked_range(0, num_points, 8), [&](const tbb::blocked_range& range) { + for (int i = range.begin(); i < range.end(); i++) { + perpoint_task(i); + } + }); +#else + std::cerr << "error : TBB is not enabled" << std::endl; + abort(); +#endif } return neighbors; diff --git a/src/glim/viewer/interactive_viewer.cpp b/src/glim/viewer/interactive_viewer.cpp index 39201a86..25fe2283 100644 --- a/src/glim/viewer/interactive_viewer.cpp +++ b/src/glim/viewer/interactive_viewer.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -452,7 +453,7 @@ void InteractiveViewer::globalmap_on_smoother_update(gtsam_points::ISAM2Ext& isa if (boost::dynamic_pointer_cast(factor)) { inserted_factors.push_back(std::make_tuple(FactorType::MATCHING_COST, factor->keys()[0], factor->keys()[1])); } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA if (boost::dynamic_pointer_cast(factor)) { inserted_factors.push_back(std::make_tuple(FactorType::MATCHING_COST, factor->keys()[0], factor->keys()[1])); } diff --git a/src/glim/viewer/memory_monitor.cpp b/src/glim/viewer/memory_monitor.cpp index 1cda02f3..019500bb 100644 --- a/src/glim/viewer/memory_monitor.cpp +++ b/src/glim/viewer/memory_monitor.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace glim { @@ -32,7 +33,7 @@ class MemoryMonitor : public ExtensionModule { } last_update_time = now; -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA size_t gpu_free = 0; size_t gpu_total = 0; gtsam_points::cuda_mem_get_info(&gpu_free, &gpu_total); diff --git a/src/glim/viewer/offline_viewer.cpp b/src/glim/viewer/offline_viewer.cpp index ec989bb3..c85631ff 100644 --- a/src/glim/viewer/offline_viewer.cpp +++ b/src/glim/viewer/offline_viewer.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -22,7 +23,7 @@ void OfflineViewer::setup_ui() { progress_modal.reset(new guik::ProgressModal("offline_viewer_progress")); -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA gtsam_points::LinearizationHook::register_hook([] { return gtsam_points::create_nonlinear_factor_set_gpu(); }); #endif } diff --git a/src/glim/viewer/standard_viewer.cpp b/src/glim/viewer/standard_viewer.cpp index 4806d3d6..710b8667 100644 --- a/src/glim/viewer/standard_viewer.cpp +++ b/src/glim/viewer/standard_viewer.cpp @@ -4,12 +4,13 @@ #include #include +#include #include #include #include #include -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA #include #endif @@ -283,7 +284,7 @@ void StandardViewer::set_callbacks() { continue; } -#ifdef BUILD_GTSAM_POINTS_GPU +#ifdef GTSAM_POINTS_USE_CUDA const auto gpu_factor = boost::dynamic_pointer_cast(factor); if (gpu_factor) { const auto l = [this, idx0](const gtsam::NonlinearFactor* factor) -> std::optional {