perf(image_projection_based_fusion,lidar_centerpoint): add cuda prepr…

…ocess (#5681)
autowarefoundation · Nov 29, 2023 · 5daaf87 · 5daaf87
1 parent b65bfe0
commit 5daaf87
Show file tree

Hide file tree

Showing 14 changed files with 325 additions and 209 deletions.
diff --git a/...d_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/...d_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
@@ -23,6 +23,7 @@
 
 namespace image_projection_based_fusion
 {
+static constexpr size_t CAPACITY_POINT = 1000000;
 class PointPaintingTRT : public centerpoint::CenterPointTRT
 {
 public:
@@ -33,8 +34,6 @@ class PointPaintingTRT : public centerpoint::CenterPointTRT
     const centerpoint::DensificationParam & densification_param,
     const centerpoint::CenterPointConfig & config);
 
-  ~PointPaintingTRT();
-
 protected:
   bool preprocess(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg,

diff --git a/...d_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp b/...d_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp
@@ -20,9 +20,19 @@
 
 namespace image_projection_based_fusion
 {
+cudaError_t generateVoxels_random_launch(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
+  cudaStream_t stream);
+
+cudaError_t generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream);
+
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
+  const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
   const float voxel_size_y, const float voxel_size_z, const float range_min_x,
   const float range_min_y, const float range_min_z, float * features,
   const std::size_t encoder_in_feature_size, cudaStream_t stream);

diff --git a/...sed_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp b/...sed_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
@@ -27,9 +27,7 @@ class VoxelGenerator : public centerpoint::VoxelGenerator
 public:
   using centerpoint::VoxelGenerator::VoxelGenerator;
 
-  std::size_t pointsToVoxels(
-    std::vector<float> & voxels, std::vector<int> & coordinates,
-    std::vector<float> & num_points_per_voxel) override;
+  std::size_t generateSweepPoints(std::vector<float> & points) override;
 };
 }  // namespace image_projection_based_fusion
 

diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp
@@ -43,26 +43,37 @@ bool PointPaintingTRT::preprocess(
   if (!is_success) {
     return false;
   }
-  num_voxels_ = vg_ptr_pp_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_);
-  if (num_voxels_ == 0) {
-    return false;
-  }
-  const auto voxels_size =
-    num_voxels_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_;
-  const auto coordinates_size = num_voxels_ * config_.point_dim_size_;
-  // memcpy from host to device (not copy empty voxels)
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice));
+  const auto count = vg_ptr_pp_->generateSweepPoints(points_);
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int),
-    cudaMemcpyHostToDevice));
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float),
-    cudaMemcpyHostToDevice));
-  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+    points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    voxels_d_.get(), 0,
+    config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_ *
+      sizeof(float),
+    stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    coordinates_d_.get(), 0, config_.max_voxel_size_ * config_.point_dim_size_ * sizeof(int),
+    stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_));
+
+  CHECK_CUDA_ERROR(image_projection_based_fusion::generateVoxels_random_launch(
+    points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_,
+    config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_,
+    config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_,
+    mask_d_.get(), voxels_buffer_d_.get(), stream_));
+
+  CHECK_CUDA_ERROR(image_projection_based_fusion::generateBaseFeatures_launch(
+    mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
+    num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
+    stream_));
 
   CHECK_CUDA_ERROR(image_projection_based_fusion::generateFeatures_launch(
-    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_,
+    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(),
     config_.max_voxel_size_, config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_,
     config_.range_min_x_, config_.range_min_y_, config_.range_min_z_, encoder_in_features_d_.get(),
     config_.encoder_in_feature_size_, stream_));

diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu b/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu
@@ -57,9 +57,105 @@ std::size_t divup(const std::size_t a, const std::size_t b)
 
 namespace image_projection_based_fusion
 {
+__global__ void generateVoxels_random_kernel(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  float x = points[point_idx * POINT_FEATURE_SIZE];
+  float y = points[point_idx * POINT_FEATURE_SIZE + 1];
+  float z = points[point_idx * POINT_FEATURE_SIZE + 2];
+
+  if (
+    x < min_x_range || x >= max_x_range || y < min_y_range || y >= max_y_range || z < min_z_range ||
+    z >= max_z_range)
+    return;
+
+  int voxel_idx = floorf((x - min_x_range) / pillar_x_size);
+  int voxel_idy = floorf((y - min_y_range) / pillar_y_size);
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+
+  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
+
+  if (point_id >= MAX_POINT_IN_VOXEL_SIZE) return;
+  float * address =
+    voxels + (voxel_index * MAX_POINT_IN_VOXEL_SIZE + point_id) * POINT_FEATURE_SIZE;
+  for (unsigned int i = 0; i < POINT_FEATURE_SIZE; ++i) {
+    atomicExch(address + i, points[point_idx * POINT_FEATURE_SIZE + i]);
+  }
+}
+
+cudaError_t generateVoxels_random_launch(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
+  cudaStream_t stream)
+{
+  dim3 blocks((points_size + 256 - 1) / 256);
+  dim3 threads(256);
+  generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>(
+    points, points_size, min_x_range, max_x_range, min_y_range, max_y_range, min_z_range,
+    max_z_range, pillar_x_size, pillar_y_size, pillar_z_size, grid_y_size, grid_x_size, mask,
+    voxels);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+__global__ void generateBaseFeatures_kernel(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs)
+{
+  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;
+
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+  unsigned int count = mask[voxel_index];
+  if (!(count > 0)) return;
+  count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE;
+
+  unsigned int current_pillarId = 0;
+  current_pillarId = atomicAdd(pillar_num, 1);
+
+  voxel_num[current_pillarId] = count;
+
+  uint3 idx = {0, voxel_idy, voxel_idx};
+  ((uint3 *)voxel_idxs)[current_pillarId] = idx;
+
+  for (int i = 0; i < count; i++) {
+    int inIndex = voxel_index * MAX_POINT_IN_VOXEL_SIZE + i;
+    int outIndex = current_pillarId * MAX_POINT_IN_VOXEL_SIZE + i;
+    for (unsigned int j = 0; j < POINT_FEATURE_SIZE; ++j) {
+      voxel_features[outIndex * POINT_FEATURE_SIZE + j] = voxels[inIndex * POINT_FEATURE_SIZE + j];
+    }
+  }
+
+  // clear buffer for next infer
+  atomicExch(mask + voxel_index, 0);
+}
+
+// create 4 channels
+cudaError_t generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream)
+{
+  dim3 threads = {32, 32};
+  dim3 blocks = {
+    (grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y};
+
+  generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
+    mask, voxels, grid_y_size, grid_x_size, pillar_num, voxel_features, voxel_num, voxel_idxs);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
 __global__ void generateFeatures_kernel(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
+  const unsigned int * num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
   const float range_min_x, const float range_min_y, const float range_min_z, float * features,
   const std::size_t encoder_in_feature_size)
 {
@@ -70,7 +166,8 @@ __global__ void generateFeatures_kernel(
   int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE;
   int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE;
 
-  if (pillar_idx >= num_voxels) return;
+  unsigned int num_pillars = num_voxels[0];
+  if (pillar_idx >= num_pillars) return;
 
   // load src
   __shared__ float pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE][POINT_FEATURE_SIZE];
@@ -155,7 +252,7 @@ __global__ void generateFeatures_kernel(
 
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
+  const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
   const float voxel_size_y, const float voxel_size_z, const float range_min_x,
   const float range_min_y, const float range_min_z, float * features,
   const std::size_t encoder_in_feature_size, cudaStream_t stream)

diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
@@ -18,29 +18,10 @@
 
 namespace image_projection_based_fusion
 {
-std::size_t VoxelGenerator::pointsToVoxels(
-  std::vector<float> & voxels, std::vector<int> & coordinates,
-  std::vector<float> & num_points_per_voxel)
+size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
 {
-  // voxels (float): (max_num_voxels * max_num_points_per_voxel * point_feature_size)
-  // coordinates (int): (max_num_voxels * point_dim_size)
-  // num_points_per_voxel (float): (max_num_voxels)
-
-  const std::size_t grid_size = config_.grid_size_z_ * config_.grid_size_y_ * config_.grid_size_x_;
-  std::vector<std::optional<int>> coord_to_voxel_idx(grid_size, -1);
-
-  std::size_t voxel_cnt = 0;  // @return
-  // std::array<float, config_.point_feature_size> point;
-  // std::array<float, config_.point_dim_size> coord_zyx;
-  std::vector<float> point;
-  point.resize(config_.point_feature_size_);
-  std::vector<float> coord_zyx;
-  coord_zyx.resize(config_.point_dim_size_);
-  bool out_of_range;
-  std::size_t point_cnt;
-  int c, coord_idx, voxel_idx;
   Eigen::Vector3f point_current, point_past;
-
+  size_t point_counter{};
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
        pc_cache_iter++) {
     auto pc_msg = pc_cache_iter->pointcloud_msg;
@@ -55,62 +36,25 @@ std::size_t VoxelGenerator::pointsToVoxels(
       point_past << *x_iter, *y_iter, *z_iter;
       point_current = affine_past2current * point_past;
 
-      point[0] = point_current.x();
-      point[1] = point_current.y();
-      point[2] = point_current.z();
-      point[3] = time_lag;
+      points.at(point_counter * config_.point_feature_size_) = point_current.x();
+      points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
+      points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
+      points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
       // decode the class value back to one-hot binary and assign it to point
-      std::fill(point.begin() + 4, point.end(), 0);
+      for (size_t i = 0; i < config_.class_size_; ++i) {
+        points.at(point_counter * config_.point_feature_size_ + 4 + i) = 0.f;
+      }
       auto class_value = static_cast<int>(*class_iter);
-      auto iter = point.begin() + 4;
+      auto iter = points.begin() + point_counter * config_.point_feature_size_ + 4;
       while (class_value > 0) {
         *iter = class_value % 2;
         class_value /= 2;
         ++iter;
       }
-
-      out_of_range = false;
-      for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
-        c = static_cast<int>((point[di] - range_[di]) * recip_voxel_size_[di]);
-        if (c < 0 || c >= grid_size_[di]) {
-          out_of_range = true;
-          break;
-        }
-        coord_zyx[config_.point_dim_size_ - di - 1] = c;
-      }
-      if (out_of_range) {
-        continue;
-      }
-
-      coord_idx = coord_zyx[0] * config_.grid_size_y_ * config_.grid_size_x_ +
-                  coord_zyx[1] * config_.grid_size_x_ + coord_zyx[2];
-      voxel_idx = coord_to_voxel_idx[coord_idx].value();
-      if (voxel_idx == -1) {
-        voxel_idx = voxel_cnt;
-        if (voxel_cnt >= config_.max_voxel_size_) {
-          continue;
-        }
-
-        voxel_cnt++;
-        coord_to_voxel_idx[coord_idx] = voxel_idx;
-        for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
-          coordinates[voxel_idx * config_.point_dim_size_ + di] = coord_zyx[di];
-        }
-      }
-
-      point_cnt = num_points_per_voxel[voxel_idx];
-      if (point_cnt < config_.max_point_in_voxel_size_) {
-        for (std::size_t fi = 0; fi < config_.point_feature_size_; fi++) {
-          voxels
-            [voxel_idx * config_.max_point_in_voxel_size_ * config_.point_feature_size_ +
-             point_cnt * config_.point_feature_size_ + fi] = point[fi];
-        }
-        num_points_per_voxel[voxel_idx]++;
-      }
+      ++point_counter;
     }
   }
-
-  return voxel_cnt;
+  return point_counter;
 }
 
 }  // namespace image_projection_based_fusion
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
@@ -32,6 +32,8 @@
 
 namespace centerpoint
 {
+static constexpr size_t CAPACITY_POINT = 1000000;
+
 class NetworkParam
 {
 public:
@@ -59,7 +61,7 @@ class CenterPointTRT
     const NetworkParam & encoder_param, const NetworkParam & head_param,
     const DensificationParam & densification_param, const CenterPointConfig & config);
 
-  ~CenterPointTRT();
+  virtual ~CenterPointTRT();
 
   bool detect(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
@@ -83,12 +85,13 @@ class CenterPointTRT
 
   std::size_t class_size_{0};
   CenterPointConfig config_;
-  std::size_t num_voxels_{0};
   std::size_t encoder_in_feature_size_{0};
   std::size_t spatial_features_size_{0};
-  std::vector<float> voxels_;
-  std::vector<int> coordinates_;
-  std::vector<float> num_points_per_voxel_;
+  std::size_t voxels_buffer_size_{0};
+  std::size_t mask_size_{0};
+  std::size_t voxels_size_{0};
+  std::size_t coordinates_size_{0};
+  std::vector<float> points_;
   cuda::unique_ptr<float[]> voxels_d_{nullptr};
   cuda::unique_ptr<int[]> coordinates_d_{nullptr};
   cuda::unique_ptr<float[]> num_points_per_voxel_d_{nullptr};
@@ -101,6 +104,10 @@ class CenterPointTRT
   cuda::unique_ptr<float[]> head_out_dim_d_{nullptr};
   cuda::unique_ptr<float[]> head_out_rot_d_{nullptr};
   cuda::unique_ptr<float[]> head_out_vel_d_{nullptr};
+  cuda::unique_ptr<float[]> points_d_{nullptr};
+  cuda::unique_ptr<float[]> voxels_buffer_d_{nullptr};
+  cuda::unique_ptr<unsigned int[]> mask_d_{nullptr};
+  cuda::unique_ptr<unsigned int[]> num_voxels_d_{nullptr};
 };
 
 }  // namespace centerpoint

diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
@@ -21,7 +21,7 @@
 namespace centerpoint
 {
 cudaError_t scatterFeatures_launch(
-  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  const float * pillar_features, const int * coords, const unsigned int * num_pillars,
   const std::size_t max_voxel_size, const std::size_t encoder_out_feature_size,
   const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features,
   cudaStream_t stream);