From 40a0415d4cdcb625198d81f6426d7a0505072293 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Fri, 27 Sep 2024 21:00:38 +0100
Subject: [PATCH 1/7] switched ast benchmarks from googlebench to nvbench

---
 cpp/benchmarks/CMakeLists.txt                 |  2 +-
 cpp/benchmarks/ast/transform.cpp              | 51 +++++++------------
 .../synchronization/synchronization.cpp       | 33 ++++++------
 .../synchronization/synchronization.hpp       |  6 +++
 4 files changed, 44 insertions(+), 48 deletions(-)
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 4113e38dcf4..d99689befdc 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)
 
 # ##################################################################################################
 # * ast benchmark ---------------------------------------------------------------------------------
-ConfigureBench(AST_BENCH ast/transform.cpp)
+ConfigureNVBench(AST_BENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 65a44532cf1..a2957a21c01 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,14 +15,17 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
 #include <list>
 #include <memory>
@@ -35,13 +38,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-class AST : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-static void BM_ast_transform(benchmark::State& state)
+static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
@@ -86,38 +86,25 @@ static void BM_ast_transform(benchmark::State& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()});
     cudf::compute_column(table, expression_tree_root);
-  }
+  });
 
   // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
-}
-
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
+  state.add_global_memory_reads(static_cast<int64_t>(state.get_summaries().size()) * table_size *
+                                (tree_levels + 1) * sizeof(key_type));
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable)     \
-  (::benchmark::State & st)                                                                \
+  static void name(::nvbench::state& st)                                                   \
   {                                                                                        \
-    BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                    \
+    ::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                  \
   }                                                                                        \
-  BENCHMARK_REGISTER_F(AST, name)                                                          \
-    ->Apply(CustomRanges)                                                                  \
-    ->Unit(benchmark::kMillisecond)                                                        \
-    ->UseManualTime();
+  NVBENCH_BENCH(name)                                                                      \
+    .set_name(#name)                                                                       \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})           \
+    .add_int64_axis("tree_levels", {1, 5, 10})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
index 5993bb23542..c6419be4cc1 100644
--- a/cpp/benchmarks/synchronization/synchronization.cpp
+++ b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,26 +21,29 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
+void flush_device_L2_cache(rmm::cuda_stream_view stream)
+{
+  int current_device = 0;
+  CUDF_CUDA_TRY(cudaGetDevice(&current_device));
+
+  int l2_cache_bytes = 0;
+  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+
+  if (l2_cache_bytes > 0) {
+    int const memset_value = 0;
+    rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
+    CUDF_CUDA_TRY(
+      cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
+  }
+}
+
 cuda_event_timer::cuda_event_timer(benchmark::State& state,
                                    bool flush_l2_cache,
                                    rmm::cuda_stream_view stream)
   : stream(stream), p_state(&state)
 {
   // flush all of L2$
-  if (flush_l2_cache) {
-    int current_device = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&current_device));
-
-    int l2_cache_bytes = 0;
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
-
-    if (l2_cache_bytes > 0) {
-      int const memset_value = 0;
-      rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-      CUDF_CUDA_TRY(
-        cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
-    }
-  }
+  if (flush_l2_cache) { flush_device_L2_cache(stream); }
 
   CUDF_CUDA_TRY(cudaEventCreate(&start));
   CUDF_CUDA_TRY(cudaEventCreate(&stop));
diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp
index cc3bf828d60..69f3230fa59 100644
--- a/cpp/benchmarks/synchronization/synchronization.hpp
+++ b/cpp/benchmarks/synchronization/synchronization.hpp
@@ -69,6 +69,12 @@
 #include <benchmark/benchmark.h>
 #include <driver_types.h>
 
+/**
+ * @brief clears the L2$ by cudaMemset'ing a buffer of L2$ size
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream());
+
 class cuda_event_timer {
  public:
   /**

From cb8812d778a185d0c29300399883ddd18be811ca Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Sat, 28 Sep 2024 01:04:23 +0100
Subject: [PATCH 2/7] reverted port of device L2 cache flush util

---
 cpp/benchmarks/ast/transform.cpp              |  7 ++---
 .../synchronization/synchronization.cpp       | 31 +++++++++----------
 .../synchronization/synchronization.hpp       |  6 ----
 3 files changed, 16 insertions(+), 28 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index a2957a21c01..66d268f15d1 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
@@ -86,10 +85,8 @@ static void BM_ast_transform(nvbench::state& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
-    flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()});
-    cudf::compute_column(table, expression_tree_root);
-  });
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 
   // Use the number of bytes read from global memory
   state.add_global_memory_reads(static_cast<int64_t>(state.get_summaries().size()) * table_size *
diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
index c6419be4cc1..760668e22ec 100644
--- a/cpp/benchmarks/synchronization/synchronization.cpp
+++ b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -21,29 +21,26 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-void flush_device_L2_cache(rmm::cuda_stream_view stream)
-{
-  int current_device = 0;
-  CUDF_CUDA_TRY(cudaGetDevice(&current_device));
-
-  int l2_cache_bytes = 0;
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
-
-  if (l2_cache_bytes > 0) {
-    int const memset_value = 0;
-    rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-    CUDF_CUDA_TRY(
-      cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
-  }
-}
-
 cuda_event_timer::cuda_event_timer(benchmark::State& state,
                                    bool flush_l2_cache,
                                    rmm::cuda_stream_view stream)
   : stream(stream), p_state(&state)
 {
   // flush all of L2$
-  if (flush_l2_cache) { flush_device_L2_cache(stream); }
+  if (flush_l2_cache) {
+    int current_device = 0;
+    CUDF_CUDA_TRY(cudaGetDevice(&current_device));
+
+    int l2_cache_bytes = 0;
+    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+
+    if (l2_cache_bytes > 0) {
+      int const memset_value = 0;
+      rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
+      CUDF_CUDA_TRY(
+        cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
+    }
+  }
 
   CUDF_CUDA_TRY(cudaEventCreate(&start));
   CUDF_CUDA_TRY(cudaEventCreate(&stop));
diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp
index 69f3230fa59..cc3bf828d60 100644
--- a/cpp/benchmarks/synchronization/synchronization.hpp
+++ b/cpp/benchmarks/synchronization/synchronization.hpp
@@ -69,12 +69,6 @@
 #include <benchmark/benchmark.h>
 #include <driver_types.h>
 
-/**
- * @brief clears the L2$ by cudaMemset'ing a buffer of L2$ size
- * @param stream CUDA stream used for device memory operations and kernel launches
- */
-void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream());
-
 class cuda_event_timer {
  public:
   /**

From 6988454e6f27a2f2ecc4929383701f51012b63b6 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Sat, 28 Sep 2024 01:06:05 +0100
Subject: [PATCH 3/7] renamed AST_BENCH to AST_NVBENCH

---
 cpp/benchmarks/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index d99689befdc..a12002752e7 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)
 
 # ##################################################################################################
 # * ast benchmark ---------------------------------------------------------------------------------
-ConfigureNVBench(AST_BENCH ast/transform.cpp)
+ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------

From 6588b73a5810a538186f8b2a0a7c8fe87878bbe6 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Sat, 28 Sep 2024 01:07:55 +0100
Subject: [PATCH 4/7] reverted license notice change

---
 cpp/benchmarks/synchronization/synchronization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
index 760668e22ec..5993bb23542 100644
--- a/cpp/benchmarks/synchronization/synchronization.cpp
+++ b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From ca9575647a855ca369852023ae9bbf21966b3df7 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Mon, 30 Sep 2024 12:24:13 +0100
Subject: [PATCH 5/7] fixed global memory read calculation for ast benchmark

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 cpp/benchmarks/ast/transform.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 66d268f15d1..0f5787955da 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -89,8 +89,7 @@ static void BM_ast_transform(nvbench::state& state)
              [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads(static_cast<int64_t>(state.get_summaries().size()) * table_size *
-                                (tree_levels + 1) * sizeof(key_type));
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \

From 242a8b438eeba5c1ce16aded99fff4082ee66baa Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Tue, 1 Oct 2024 10:00:55 +0000
Subject: [PATCH 6/7] fixed ast benchmarks' display of throughput measurements

---
 cpp/benchmarks/ast/transform.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 0f5787955da..af14d48d242 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -85,11 +85,11 @@ static void BM_ast_transform(nvbench::state& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  state.exec(nvbench::exec_tag::sync,
-             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
-
   // Use the number of bytes read from global memory
   state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \

From 5cc26c340d97065c23eb1046392dc3b2ac3b9982 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Tue, 1 Oct 2024 16:34:41 +0100
Subject: [PATCH 7/7] reordered ast benchmarks' axes

---
 cpp/benchmarks/ast/transform.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index af14d48d242..f44f26e4d2c 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -99,8 +99,8 @@ static void BM_ast_transform(nvbench::state& state)
   }                                                                                        \
   NVBENCH_BENCH(name)                                                                      \
     .set_name(#name)                                                                       \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})           \
-    .add_int64_axis("tree_levels", {1, 5, 10})
+    .add_int64_axis("tree_levels", {1, 5, 10})                                             \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);