Skip to content

Commit

Permalink
feat: accelerated preprocessing for centerpoint
Browse files Browse the repository at this point in the history
Signed-off-by: Kenzo Lobos-Tsunekawa <[email protected]>
  • Loading branch information
knzo25 committed May 12, 2024
1 parent 569d56c commit ca1370d
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#ifndef LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
#define LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_

#include <lidar_centerpoint/cuda_utils.hpp>

#include <tf2_ros/buffer.h>
#include <tf2_ros/transform_listener.h>
#ifdef ROS_DISTRO_GALACTIC
Expand Down Expand Up @@ -48,7 +50,10 @@ class DensificationParam

struct PointCloudWithTransform
{
sensor_msgs::msg::PointCloud2 pointcloud_msg;
cuda::unique_ptr<float[]> points_d{nullptr};
std_msgs::msg::Header header;
size_t num_points{0};
size_t point_step{0};
Eigen::Affine3f affine_past2world;
};

Expand All @@ -58,7 +63,8 @@ class PointCloudDensification
explicit PointCloudDensification(const DensificationParam & param);

bool enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t streams);

double getCurrentTimestamp() const { return current_timestamp_; }
Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
Expand All @@ -73,7 +79,8 @@ class PointCloudDensification
unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }

private:
void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
void enqueue(
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine, cudaStream_t stream);
void dequeue();

DensificationParam param_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

namespace centerpoint
{
cudaError_t generateSweepPoints_launch(
const float * input_points, size_t points_size, int input_point_step, float time_lag,
const float * transform, int num_features, float * output_points, cudaStream_t stream);

cudaError_t generateVoxels_random_launch(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ class VoxelGeneratorTemplate
explicit VoxelGeneratorTemplate(
const DensificationParam & param, const CenterPointConfig & config);

virtual std::size_t generateSweepPoints(std::vector<float> & points) = 0;
virtual std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) = 0;

bool enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t stream);

protected:
std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
Expand All @@ -50,7 +51,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate
public:
using VoxelGeneratorTemplate::VoxelGeneratorTemplate;

std::size_t generateSweepPoints(std::vector<float> & points) override;
std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) override;
};

} // namespace centerpoint
Expand Down
7 changes: 2 additions & 5 deletions perception/lidar_centerpoint/lib/centerpoint_trt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,11 @@ bool CenterPointTRT::detect(
bool CenterPointTRT::preprocess(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
{
bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream_);
if (!is_success) {
return false;
}
const auto count = vg_ptr_->generateSweepPoints(points_);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
cudaMemcpyHostToDevice, stream_));
const auto count = vg_ptr_->generateSweepPoints(points_d_.get(), stream_);
CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
CHECK_CUDA_ERROR(
cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para
}

bool PointCloudDensification::enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t stream)
{
const auto header = pointcloud_msg.header;

Expand All @@ -73,9 +74,9 @@ bool PointCloudDensification::enqueuePointCloud(
}
auto affine_world2current = transformToEigen(transform_world2current.get());

enqueue(pointcloud_msg, affine_world2current);
enqueue(pointcloud_msg, affine_world2current, stream);
} else {
enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
enqueue(pointcloud_msg, Eigen::Affine3f::Identity(), stream);
}

dequeue();
Expand All @@ -84,12 +85,24 @@ bool PointCloudDensification::enqueuePointCloud(
}

void PointCloudDensification::enqueue(
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current,
cudaStream_t stream)
{
affine_world2current_ = affine_world2current;
current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()};
pointcloud_cache_.push_front(pointcloud);

assert(sizeof(uint8_t) * msg.width * msg.height * msg.point_step % sizeof(float) == 0);
auto points_d = cuda::make_unique<float[]>(
sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(float));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
points_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
cudaMemcpyHostToDevice, stream));

PointCloudWithTransform pointcloud = {
std::move(points_d), msg.header, msg.width * msg.height, msg.point_step,
affine_world2current.inverse()};

pointcloud_cache_.push_front(std::move(pointcloud));
}

void PointCloudDensification::dequeue()
Expand Down
49 changes: 47 additions & 2 deletions perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@
* limitations under the License.
*/

#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"

#include <lidar_centerpoint/cuda_utils.hpp>
#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
#include <lidar_centerpoint/utils.hpp>

#include <cassert>
namespace
{
const std::size_t MAX_POINT_IN_VOXEL_SIZE = 32; // the same as max_point_in_voxel_size_ in config
Expand All @@ -41,6 +42,50 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9; // the same as encoder_in_featur

namespace centerpoint
{

__global__ void generateSweepPoints_kernel(
const float * input_points, size_t points_size, int input_point_step, float time_lag,
const float * transform_array, int num_features, float * output_points)
{
int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (point_idx >= points_size) return;

const float input_x = input_points[point_idx * input_point_step + 0];
const float input_y = input_points[point_idx * input_point_step + 1];
const float input_z = input_points[point_idx * input_point_step + 2];

output_points[point_idx * num_features + 0] = transform_array[0] * input_x +
transform_array[1] * input_y +
transform_array[2] * input_z + transform_array[3];
output_points[point_idx * num_features + 1] = transform_array[4] * input_x +
transform_array[5] * input_y +
transform_array[6] * input_z + transform_array[7];
output_points[point_idx * num_features + 2] = transform_array[8] * input_x +
transform_array[9] * input_y +
transform_array[10] * input_z + transform_array[11];
output_points[point_idx * num_features + 3] = time_lag;
}

cudaError_t generateSweepPoints_launch(
const float * input_points, size_t points_size, int input_point_step, float time_lag,
const float * transform_array, int num_features, float * output_points, cudaStream_t stream)
{
auto transform_d = cuda::make_unique<float[]>(16);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
transform_d.get(), transform_array, 16 * sizeof(float), cudaMemcpyHostToDevice, stream));

dim3 blocks((points_size + 256 - 1) / 256);
dim3 threads(256);
assert(num_features == 4);

generateSweepPoints_kernel<<<blocks, threads, 0, stream>>>(
input_points, points_size, input_point_step, time_lag, transform_d.get(), num_features,
output_points);

cudaError_t err = cudaGetLastError();
return err;
}

__global__ void generateVoxels_random_kernel(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
Expand Down
33 changes: 15 additions & 18 deletions perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include "lidar_centerpoint/preprocess/voxel_generator.hpp"

#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>

#include <sensor_msgs/point_cloud2_iterator.hpp>

namespace centerpoint
Expand All @@ -38,35 +40,30 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate(
}

bool VoxelGeneratorTemplate::enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t stream)
{
return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream);
}

std::size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream)
{
Eigen::Vector3f point_current, point_past;
size_t point_counter{};
size_t point_counter = 0;
for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
pc_cache_iter++) {
auto pc_msg = pc_cache_iter->pointcloud_msg;
auto sweep_num_points = pc_cache_iter->num_points;
auto point_step = pc_cache_iter->point_step;
auto affine_past2current =
pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
float time_lag = static_cast<float>(
pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds());
pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());

for (sensor_msgs::PointCloud2ConstIterator<float> x_iter(pc_msg, "x"), y_iter(pc_msg, "y"),
z_iter(pc_msg, "z");
x_iter != x_iter.end(); ++x_iter, ++y_iter, ++z_iter) {
point_past << *x_iter, *y_iter, *z_iter;
point_current = affine_past2current * point_past;
generateSweepPoints_launch(
pc_cache_iter->points_d.get(), sweep_num_points, point_step / sizeof(float), time_lag,
affine_past2current.matrix().data(), config_.point_feature_size_, points_d + point_counter,
stream);

points.at(point_counter * config_.point_feature_size_) = point_current.x();
points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
++point_counter;
}
point_counter += sweep_num_points;
}
return point_counter;
}
Expand Down

0 comments on commit ca1370d

Please sign in to comment.