[Opt] Optimizing the performance of bitmap_to_csr (#2516)

This PR speeds up the `bitmap_to_csr`-related kernels by 14~1000 times. It could also benefit `bitset_to_csr` in the future.

#### After (Updated Dec 08)

```shell
---------------------------------------------------------------------------------------------------
Benchmark                                                  Time       CPU        Iterations
---------------------------------------------------------------------------------------------------
BitmapToCsrBench<uint32_t, int64_t, float>/0/manual_time   0.161 ms   0.197 ms   4350   rows*cols=1*100000000 sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/1/manual_time   0.110 ms   0.147 ms   6363   rows*cols=1*100000000 sparsity=0.99
BitmapToCsrBench<uint32_t, int64_t, float>/2/manual_time   14.2 ms    14.2 ms    50     rows*cols=100*100000000 sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/3/manual_time   8.76 ms    8.80 ms    80     rows*cols=100*100000000 sparsity=0.99
```

#### Before

```shell
---------------------------------------------------------------------------------------------------
Benchmark                                                  Time       CPU        Iterations
---------------------------------------------------------------------------------------------------
BitmapToCsrBench<uint32_t, int64_t, float>/0/manual_time   176 ms     176 ms     4      rows*cols=1*100000000 sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/1/manual_time   146 ms     146 ms     5      rows*cols=1*100000000 sparsity=0.99
BitmapToCsrBench<uint32_t, int64_t, float>/2/manual_time   180 ms     180 ms     4      rows*cols=100*100000000 sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/3/manual_time   148 ms     148 ms     5      rows*cols=100*100000000 sparsity=0.99
```

Authors:
- rhdong (https://github.com/rhdong)

Approvers:
- Corey J. Nolet (https://github.com/cjnolet)

URL: #2516
rapidsai · Dec 11, 2024 · 3720d8e · 3720d8e
1 parent 1e5030d
commit 3720d8e
Show file tree

Hide file tree

Showing 4 changed files with 319 additions and 185 deletions.
diff --git a/cpp/bench/prims/sparse/bitmap_to_csr.cu b/cpp/bench/prims/sparse/bitmap_to_csr.cu
@@ -71,7 +71,7 @@ struct BitmapToCsrBench : public fixture {
   index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
     for (auto& item : bitmap) {
@@ -141,7 +141,27 @@ const std::vector<bench_param<index_t>> getInputs()
   };
 
   const std::vector<TestParams> params_group = raft::util::itertools::product<TestParams>(
-    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.01f, 0.1f, 0.2f, 0.5f});
+    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f});
+
+  param_vec.reserve(params_group.size());
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.sparsity}));
+  }
+  return param_vec;
+}
+
+template <typename index_t = int64_t>
+const std::vector<bench_param<index_t>> getLargeInputs()
+{
+  std::vector<bench_param<index_t>> param_vec;
+  struct TestParams {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+
+  const std::vector<TestParams> params_group = raft::util::itertools::product<TestParams>(
+    {index_t(1), index_t(100)}, {index_t(100 * 1000000)}, {0.95f, 0.99f});
 
   param_vec.reserve(params_group.size());
   for (TestParams params : params_group) {
@@ -153,4 +173,6 @@ const std::vector<bench_param<index_t>> getInputs()
 RAFT_BENCH_REGISTER((BitmapToCsrBench<uint32_t, int, float>), "", getInputs<int>());
 RAFT_BENCH_REGISTER((BitmapToCsrBench<uint64_t, int, double>), "", getInputs<int>());
 
+RAFT_BENCH_REGISTER((BitmapToCsrBench<uint32_t, int64_t, float>), "", getLargeInputs<int64_t>());
+
 }  // namespace raft::bench::sparse