[draft] Add bytes_per_second to groupby nvbench benchmarks

This patch adds memory statistics to the GROUPBY_NVBENCH benchmarks. For this purpose, helper functions are introduced to compute the payload size of columns, tables, and groupby execution results. This patch relates to #13735.
rapidsai · Aug 30, 2023 · 405ca60 · 405ca60
1 parent 7b9f4a1
commit 405ca60
Show file tree

Hide file tree

Showing 7 changed files with 191 additions and 11 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -40,8 +40,9 @@ target_include_directories(
 
 # Use an OBJECT library so we compile these helper source files only once
 add_library(
-  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp"
-                               synchronization/synchronization.cpp io/cuio_common.cpp
+  cudf_benchmark_common OBJECT
+  "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" synchronization/synchronization.cpp
+  io/cuio_common.cpp common/memory_statistics.cpp
 )
 target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
 add_custom_command(

diff --git a/cpp/benchmarks/common/memory_statistics.cpp b/cpp/benchmarks/common/memory_statistics.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "memory_statistics.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/null_mask.hpp>
+
+#include <numeric>
+
+/**
+ * @brief Compute the payload size in bytes of a column.
+ *
+ * Fixed-width columns contribute `size() * size_of(type)` bytes. STRUCT and LIST columns
+ * contribute the sum of `required_bytes` over their child columns. If the column is nullable,
+ * the allocation size of a bitmask covering `size()` rows is added on top.
+ *
+ * STRING and DICTIONARY32 columns are not supported and trigger `CUDF_FAIL`; any other
+ * non-fixed-width type fails the `CUDF_EXPECTS` check.
+ *
+ * @param column View of the input column
+ * @returns Number of payload bytes of the column
+ */
+uint64_t required_bytes(const cudf::column_view& column)
+{
+  uint64_t read_bytes = 0;
+
+  switch (column.type().id()) {
+    case cudf::type_id::STRING:
+      CUDF_FAIL("required bytes not implemented for STRING columns");
+      break;
+    case cudf::type_id::STRUCT:  // fallthrough
+    case cudf::type_id::LIST:
+      // The seed must be a uint64_t: std::accumulate takes its accumulator type from the seed,
+      // so a plain `0` would sum in `int` and truncate totals of 2 GiB or more.
+      read_bytes += std::accumulate(column.child_begin(),
+                                    column.child_end(),
+                                    uint64_t{0},
+                                    [](uint64_t acc, const auto& col) {
+                                      return acc + required_bytes(col);
+                                    });
+      break;
+    case cudf::type_id::DICTIONARY32:
+      CUDF_FAIL("required bytes not implemented for DICTIONARY columns");
+      break;
+    default:
+      CUDF_EXPECTS(cudf::is_fixed_width(column.type()), "Invalid element type");
+      // Widen the row count explicitly so the product is always computed in 64 bits.
+      read_bytes += static_cast<uint64_t>(column.size()) * cudf::size_of(column.type());
+      break;
+  }
+  // The validity bitmask is part of the payload whenever the column can hold nulls.
+  if (column.nullable()) { read_bytes += cudf::bitmask_allocation_size_bytes(column.size()); }
+
+  return read_bytes;
+}
+
+/**
+ * @brief Compute the payload size in bytes of a table.
+ *
+ * The result is the sum of `required_bytes` over every column of the table. A column that
+ * appears several times in the table is counted once per occurrence.
+ *
+ * @param table View of the input table
+ * @returns Number of payload bytes of the table
+ */
+uint64_t required_bytes(const cudf::table_view& table)
+{
+  // The seed must be a uint64_t: std::accumulate takes its accumulator type from the seed,
+  // so a plain `0` would sum in `int` and truncate totals of 2 GiB or more.
+  return std::accumulate(
+    table.begin(), table.end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
+      return acc + required_bytes(col);
+    });
+}
+
+// Total payload size over all result columns of all aggregation results.
+uint64_t required_bytes(
+  const cudf::host_span<cudf::groupby::aggregation_result>& aggregation_results)
+{
+  // Outer fold walks the aggregation results, inner fold walks the columns of one result.
+  return std::accumulate(
+    aggregation_results.begin(),
+    aggregation_results.end(),
+    uint64_t{0},
+    [](uint64_t total, auto const& aggregation) {
+      return std::accumulate(aggregation.results.begin(),
+                             aggregation.results.end(),
+                             total,
+                             [](uint64_t partial, auto const& col) {
+                               return partial + required_bytes(col->view());
+                             });
+    });
+}
diff --git a/cpp/benchmarks/common/memory_statistics.hpp b/cpp/benchmarks/common/memory_statistics.hpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
+
+/**
+ * @brief Calculate the number of bytes needed to completely read/write the provided column.
+ *
+ * The function computes only the size of the payload of the column in bytes; it excludes
+ * any metadata. For nullable columns the size of the null mask allocation is included.
+ * STRUCT and LIST columns are sized as the sum of their child columns.
+ *
+ * STRING and DICTIONARY32 columns are not supported by the implementation.
+ *
+ * @param column View of the input column
+ * @returns Number of bytes needed to read or write the column.
+ */
+uint64_t required_bytes(const cudf::column_view& column);
+
+/**
+ * @brief Calculate the number of bytes needed to completely read/write the provided table.
+ *
+ * The function computes only the size of the payload of the table in bytes; it excludes
+ * any metadata. The result is the sum of the payload sizes of all columns in the table; a
+ * column that occurs more than once is counted once per occurrence.
+ *
+ * @param table View of the input table.
+ * @returns Number of bytes needed to read or write the table.
+ */
+uint64_t required_bytes(const cudf::table_view& table);
+
+/**
+ * @brief Calculate the number of bytes needed to completely read/write the provided sequence of
+ * aggregation results.
+ *
+ * The function computes only the size of the payload of the aggregation results in bytes; it
+ * excludes any metadata. The result is the sum of the payload sizes of every result column of
+ * every aggregation result in the sequence.
+ *
+ * @param aggregation_results Sequence of aggregation results from groupby execution.
+ * @returns Number of bytes needed to read or write the aggregation results.
+ */
+uint64_t required_bytes(
+  const cudf::host_span<cudf::groupby::aggregation_result>& aggregation_results);
diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp
@@ -15,11 +15,14 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/memory_statistics.hpp>
 
 #include <cudf/groupby.hpp>
 
 #include <nvbench/nvbench.cuh>
 
+#include <optional>
+
 template <typename Type>
 void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
 {
@@ -32,24 +35,36 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
   }();
 
   auto const vals = [&] {
-    auto builder = data_profile_builder().cardinality(0).distribution(
+    data_profile profile = data_profile_builder().cardinality(0).distribution(
       cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
     if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
-      builder.null_probability(null_freq);
+      profile.set_null_probability(null_freq);
     } else {
-      builder.no_validity();
+      profile.set_null_probability(std::nullopt);
     }
-    return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
+    return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
   }();
 
-  auto keys_view = keys->view();
-  auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
+  auto const keys_view  = keys->view();
+  auto const keys_table = cudf::table_view({keys_view, keys_view, keys_view});
+  auto gb_obj           = cudf::groupby::groupby(keys_table);
 
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = vals->view();
   requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
 
+  // Add memory statistics
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table));
+
+  // The number of written bytes depends on the random distribution of keys.
+  // For larger sizes it converges to the number of unique elements
+  // in the input distribution (101 elements)
+  auto [res_table, res_agg] = gb_obj.aggregate(requests);
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));
+
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });

diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/memory_statistics.hpp>
 
 #include <cudf/groupby.hpp>
 
@@ -58,11 +59,22 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
     return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
   }();
 
-  auto gb_obj =
-    cudf::groupby::groupby(cudf::table_view({keys->view(), keys->view(), keys->view()}));
-  auto const requests = make_aggregation_request_vector(
+  auto const keys_table = cudf::table_view({keys->view(), keys->view(), keys->view()});
+  auto gb_obj           = cudf::groupby::groupby(keys_table);
+  auto const requests   = make_aggregation_request_vector(
     *vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());
 
+  // Add memory statistics
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table));
+
+  // The number of written bytes depends on the random distribution of keys.
+  // For larger sizes it converges to the number of unique elements
+  // in the input distribution (101 elements)
+  auto [res_table, res_agg] = gb_obj.aggregate(requests);
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));
+
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });

diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/memory_statistics.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/groupby.hpp>
@@ -53,6 +54,17 @@ static void nvbench_groupby_rank(nvbench::state& state,
   cudf::groupby::groupby gb_obj(
     keys, cudf::null_policy::EXCLUDE, is_sorted ? cudf::sorted::YES : cudf::sorted::NO);
 
+  // Add memory statistics
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(order_by));
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys));
+
+  // The number of written bytes depends on the random distribution of keys.
+  // For larger sizes it converges to the number of unique elements
+  // in the input distribution (101 elements)
+  auto [res_table, res_agg] = gb_obj.scan(requests);
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));
+
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     rmm::cuda_stream_view stream_view{launch.get_stream()};
     // groupby scan uses sort implementation

diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/memory_statistics.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
@@ -83,6 +84,17 @@ void bench_groupby_struct_keys(nvbench::state& state)
   auto stream = cudf::get_default_stream();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
 
+  // Add memory statistics
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table.view()));
+  state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
+
+  // The number of written bytes depends on the random distribution of keys.
+  // For larger sizes it converges to the number of unique elements
+  // in the input distribution (101 elements)
+  auto [res_table, res_agg] = gb_obj.aggregate(requests);
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
+  state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));
+
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
 }