Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support TBB backend #82

Merged
merged 3 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ endif()

# GPU-related
if(BUILD_WITH_CUDA)
add_definitions(-DBUILD_GTSAM_POINTS_GPU)
if(NOT GTSAM_POINTS_USE_CUDA)
message(FATAL_ERROR "gtsam_points is not built with CUDA!!")
endif()
set(GPU_SRCS src/glim/odometry/odometry_estimation_gpu.cpp)
endif()

Expand Down
3 changes: 3 additions & 0 deletions include/glim/preprocess/cloud_preprocessor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,16 @@ class CloudPreprocessor {
virtual PreprocessedFrame::Ptr preprocess(const RawPoints::ConstPtr& raw_points);

private:
PreprocessedFrame::Ptr preprocess_impl(const RawPoints::ConstPtr& raw_points);
std::vector<int> find_neighbors(const Eigen::Vector4d* points, const int num_points, const int k) const;

private:
using Params = CloudPreprocessorParams;
Params params;

mutable std::mt19937 mt;

std::shared_ptr<void> tbb_task_arena;
};

} // namespace glim
50 changes: 44 additions & 6 deletions src/glim/common/cloud_covariance_estimation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@

#include <spdlog/spdlog.h>

#include <gtsam_points/config.hpp>
#include <gtsam_points/util/parallelism.hpp>

#ifdef GTSAM_POINTS_USE_TBB
#include <tbb/parallel_for.h>
#endif

namespace glim {

CloudCovarianceEstimation::CloudCovarianceEstimation(const int num_threads) : regularization_method(RegularizationMethod::PLANE), num_threads(num_threads) {}
Expand Down Expand Up @@ -47,18 +54,30 @@ void CloudCovarianceEstimation::estimate(
assert(k_correspondences * points.size() == neighbors.size());
assert(k_neighbors <= k_correspondences);

// Precompute pt * pt.transpose()
std::vector<Eigen::Matrix4d> pt_cross(points.size());
#pragma omp parallel for num_threads(num_threads) schedule(guided, 8)
for (int i = 0; i < points.size(); i++) {
pt_cross[i] = points[i] * points[i].transpose();
if (gtsam_points::is_omp_default()) {
#pragma omp parallel for num_threads(num_threads) schedule(guided, 64)
for (int i = 0; i < points.size(); i++) {
pt_cross[i] = points[i] * points[i].transpose();
}
} else {
#ifdef GTSAM_POINTS_USE_TBB
tbb::parallel_for(tbb::blocked_range<int>(0, points.size(), 64), [&](const tbb::blocked_range<int>& range) {
for (int i = range.begin(); i < range.end(); i++) {
pt_cross[i] = points[i] * points[i].transpose();
}
});
#else
std::cerr << "error : TBB is not enabled" << std::endl;
abort();
#endif
}

normals.resize(points.size());
covs.resize(points.size());

// Calculate covariances
#pragma omp parallel for num_threads(num_threads) schedule(guided, 8)
for (int i = 0; i < points.size(); i++) {
const auto calc_cov = [&](int i) {
Eigen::Vector4d sum_points = Eigen::Vector4d::Zero();
Eigen::Matrix4d sum_cross = Eigen::Matrix4d::Zero();

Expand All @@ -80,6 +99,25 @@ void CloudCovarianceEstimation::estimate(
if (points[i].dot(normals[i]) > 0.0) {
normals[i] = -normals[i];
}
};

// Calculate covariances
if (gtsam_points::is_omp_default()) {
#pragma omp parallel for num_threads(num_threads) schedule(guided, 8)
for (int i = 0; i < points.size(); i++) {
calc_cov(i);
}
} else {
#ifdef GTSAM_POINTS_USE_TBB
tbb::parallel_for(tbb::blocked_range<int>(0, points.size(), 8), [&](const tbb::blocked_range<int>& range) {
for (int i = range.begin(); i < range.end(); i++) {
calc_cov(i);
}
});
#else
std::cerr << "error : TBB is not enabled" << std::endl;
abort();
#endif
}
}

Expand Down
19 changes: 10 additions & 9 deletions src/glim/mapping/global_mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <gtsam/nonlinear/Values.h>
#include <gtsam/nonlinear/NonlinearFactorGraph.h>

#include <gtsam_points/config.hpp>
#include <gtsam_points/types/point_cloud_cpu.hpp>
#include <gtsam_points/types/point_cloud_gpu.hpp>
#include <gtsam_points/types/gaussian_voxelmap_cpu.hpp>
Expand Down Expand Up @@ -72,7 +73,7 @@ GlobalMappingParams::GlobalMappingParams() {
GlobalMappingParams::~GlobalMappingParams() {}

GlobalMapping::GlobalMapping(const GlobalMappingParams& params) : params(params) {
#ifndef BUILD_GTSAM_POINTS_GPU
#ifndef GTSAM_POINTS_USE_CUDA
if (params.enable_gpu) {
logger->error("GPU-based factors cannot be used because GLIM is built without GPU option!!");
}
Expand All @@ -97,7 +98,7 @@ GlobalMapping::GlobalMapping(const GlobalMappingParams& params) : params(params)
isam2.reset(new gtsam_points::ISAM2ExtDummy(isam2_params));
}

#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
stream_buffer_roundrobin = std::make_shared<gtsam_points::StreamTempBufferRoundRobin>(64);
#endif

Expand Down Expand Up @@ -233,7 +234,7 @@ void GlobalMapping::insert_submap(int current, const SubMap::Ptr& submap) {
subsampled_submap = gtsam_points::random_sampling(submap->frame, params.randomsampling_rate, mt);
}

#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
if (params.enable_gpu && !submap->frame->points_gpu) {
submap->frame = gtsam_points::PointCloudGPU::clone(*submap->frame);
}
Expand Down Expand Up @@ -309,7 +310,7 @@ void GlobalMapping::find_overlapping_submaps(double min_overlap) {

if (false) {
}
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
else if (std::dynamic_pointer_cast<gtsam_points::GaussianVoxelMapGPU>(submaps[i]->voxelmaps.back()) && subsampled_submaps[j]->points_gpu) {
const auto stream_buffer = std::any_cast<std::shared_ptr<gtsam_points::StreamTempBufferRoundRobin>>(stream_buffer_roundrobin)->get_stream_buffer();
const auto& stream = stream_buffer.first;
Expand Down Expand Up @@ -431,7 +432,7 @@ boost::shared_ptr<gtsam::NonlinearFactorGraph> GlobalMapping::create_matching_co
factors->emplace_shared<gtsam_points::IntegratedVGICPFactor>(X(i), X(current), voxelmap, subsampled_submaps[current]);
}
}
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
else if (params.registration_error_factor_type == "VGICP_GPU") {
const auto stream_buffer = std::any_cast<std::shared_ptr<gtsam_points::StreamTempBufferRoundRobin>>(stream_buffer_roundrobin)->get_stream_buffer();
const auto& stream = stream_buffer.first;
Expand Down Expand Up @@ -480,7 +481,7 @@ void GlobalMapping::save(const std::string& path) {

for (const auto& factor : isam2->getFactorsUnsafe()) {
bool serializable = !boost::dynamic_pointer_cast<gtsam_points::IntegratedMatchingCostFactor>(factor)
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
&& !boost::dynamic_pointer_cast<gtsam_points::IntegratedVGICPFactorGPU>(factor)
#endif
;
Expand Down Expand Up @@ -513,7 +514,7 @@ void GlobalMapping::save(const std::string& path) {
} else if (boost::dynamic_pointer_cast<gtsam_points::IntegratedVGICPFactor>(factor.second)) {
type = "vgicp";
}
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
else if (boost::dynamic_pointer_cast<gtsam_points::IntegratedVGICPFactorGPU>(factor.second)) {
type = "vgicp_gpu";
}
Expand Down Expand Up @@ -612,7 +613,7 @@ bool GlobalMapping::load(const std::string& path) {
subsampled_submaps[i] = subsampled_submap;

if (params.enable_gpu) {
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
subsampled_submaps[i] = gtsam_points::PointCloudGPU::clone(*subsampled_submaps[i]);

for (int j = 0; j < params.submap_voxelmap_levels; j++) {
Expand Down Expand Up @@ -662,7 +663,7 @@ bool GlobalMapping::load(const std::string& path) {

if (type == "vgicp" || type == "vgicp_gpu") {
if (params.enable_gpu) {
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
const auto stream_buffer = std::any_cast<std::shared_ptr<gtsam_points::StreamTempBufferRoundRobin>>(stream_buffer_roundrobin)->get_stream_buffer();
const auto& stream = stream_buffer.first;
const auto& buffer = stream_buffer.second;
Expand Down
62 changes: 35 additions & 27 deletions src/glim/mapping/global_mapping_pose_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <gtsam_points/optimizers/isam2_ext.hpp>
#include <gtsam_points/optimizers/isam2_ext_dummy.hpp>
#include <gtsam_points/optimizers/levenberg_marquardt_ext.hpp>
#include <gtsam_points/util/easy_profiler.hpp>
#include <gtsam_points/util/parallelism.hpp>

#include <glim/util/config.hpp>
#include <glim/util/serialization.hpp>
Expand Down Expand Up @@ -370,14 +370,12 @@ void GlobalMappingPoseGraph::loop_detection_task() {
candidates_buffer.erase(candidates_buffer.begin(), candidates_buffer.begin() + eval_count);
}

std::vector<double> inlier_fractions(candidates.size());
std::vector<double> inlier_fractions(candidates.size(), 0.0);
std::vector<gtsam::Pose3> T_target_source(candidates.size());

// Evaluate loop candidates in parallel.
#pragma omp parallel for num_threads(params.num_threads) schedule(dynamic)
for (int i = 0; i < candidates.size(); i++) {
const auto evaluate_candidate = [&](int i) {
if (kill_switch) {
continue;
return;
}

const auto candidate = candidates[i];
Expand All @@ -399,16 +397,7 @@ void GlobalMappingPoseGraph::loop_detection_task() {

gtsam_points::LevenbergMarquardtExtParams lm_params;
lm_params.setMaxIterations(10);

#ifdef GTSAM_USE_TBB
auto arena = static_cast<tbb::task_arena*>(tbb_task_arena.get());
arena->execute([&] {
#endif
values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize();

#ifdef GTSAM_USE_TBB
});
#endif
values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize();

error = factor->error(values);
inlier_fraction = factor->inlier_fraction();
Expand All @@ -421,28 +410,47 @@ void GlobalMappingPoseGraph::loop_detection_task() {
gtsam_points::LevenbergMarquardtExtParams lm_params;
lm_params.setMaxIterations(10);

#ifdef GTSAM_USE_TBB
auto arena = static_cast<tbb::task_arena*>(tbb_task_arena.get());
arena->execute([&] {
#endif
values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize();

#ifdef GTSAM_USE_TBB
});
#endif
values = gtsam_points::LevenbergMarquardtOptimizerExt(graph, values, lm_params).optimize();

error = factor->error(values);
inlier_fraction = factor->inlier_fraction();
} else {
logger->warn("unknown registration type: {}", params.registration_type);
continue;
return;
}

logger->debug("target={}, source={}, error={}, inlier_fraction={}", target->submap->id, source->submap->id, error, inlier_fraction);

inlier_fractions[i] = inlier_fraction;
T_target_source[i] = values.at<gtsam::Pose3>(0);
}
};

// Evaluate loop candidates in parallel.
#ifdef GTSAM_USE_TBB
auto arena = static_cast<tbb::task_arena*>(tbb_task_arena.get());
arena->execute([&] {
#endif
if (gtsam_points::is_omp_default()) {
#pragma omp parallel for num_threads(params.num_threads) schedule(dynamic)
for (int i = 0; i < candidates.size(); i++) {
evaluate_candidate(i);
}
} else {
#ifdef GTSAM_POINTS_USE_TBB
tbb::parallel_for(tbb::blocked_range<int>(0, candidates.size(), 2), [&](const tbb::blocked_range<int>& range) {
for (int i = range.begin(); i < range.end(); i++) {
evaluate_candidate(i);
}
});
#else
std::cerr << "error : TBB is not enabled" << std::endl;
abort();
#endif
}

#ifdef GTSAM_USE_TBB
});
#endif

// Check the matching results.
std::vector<gtsam::NonlinearFactor::shared_ptr> factors;
Expand Down
11 changes: 6 additions & 5 deletions src/glim/mapping/sub_mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <gtsam/nonlinear/LinearContainerFactor.h>
#include <gtsam/nonlinear/LevenbergMarquardtOptimizer.h>

#include <gtsam_points/config.hpp>
#include <gtsam_points/types/point_cloud_cpu.hpp>
#include <gtsam_points/types/point_cloud_gpu.hpp>
#include <gtsam_points/types/gaussian_voxelmap_cpu.hpp>
Expand Down Expand Up @@ -81,7 +82,7 @@ SubMapping::SubMapping(const SubMappingParams& params) : params(params) {
values.reset(new gtsam::Values);
graph.reset(new gtsam::NonlinearFactorGraph);

#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
stream = std::make_shared<gtsam_points::CUDAStream>();
stream_buffer_roundrobin = std::make_shared<gtsam_points::StreamTempBufferRoundRobin>(8);
#endif
Expand Down Expand Up @@ -159,7 +160,7 @@ void SubMapping::insert_frame(const EstimationFrame::ConstPtr& odom_frame_) {
}
}

#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
if (params.enable_gpu && !odom_frame->frame->points_gpu) {
if (params.enable_gpu) {
auto stream = std::static_pointer_cast<gtsam_points::CUDAStream>(this->stream);
Expand Down Expand Up @@ -289,7 +290,7 @@ void SubMapping::insert_frame(const EstimationFrame::ConstPtr& odom_frame_) {
graph->emplace_shared<gtsam_points::IntegratedVGICPFactor>(X(keyframe_indices[i]), X(current), voxelmap, keyframes.back()->frame);
}
}
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
else if (params.registration_error_factor_type == "VGICP_GPU") {
auto roundrobin = std::static_pointer_cast<gtsam_points::StreamTempBufferRoundRobin>(stream_buffer_roundrobin);
auto stream_buffer = roundrobin->get_stream_buffer();
Expand Down Expand Up @@ -379,7 +380,7 @@ void SubMapping::insert_keyframe(const int current, const EstimationFrame::Const
*keyframe = *odom_frame;

if (params.enable_gpu) {
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
auto stream = std::static_pointer_cast<gtsam_points::CUDAStream>(this->stream);
keyframe->frame = gtsam_points::PointCloudGPU::clone(*subsampled_frame, *stream);
keyframe->voxelmaps.clear();
Expand Down Expand Up @@ -477,7 +478,7 @@ SubMap::Ptr SubMapping::create_submap(bool force_create) const {
}

// TODO: improve merging process
#ifdef BUILD_GTSAM_POINTS_GPU
#ifdef GTSAM_POINTS_USE_CUDA
if (params.enable_gpu) {
// submap->frame = gtsam_points::merge_frames_gpu(poses_to_merge, keyframes_to_merge, submap_downsample_resolution);
}
Expand Down
Loading
Loading