Skip to content

Commit

Permalink
[draft] Add bytes_per_second to groupby nvbench benchmarks
Browse files Browse the repository at this point in the history
This patch adds memory statistics for the GROUPBY_NVBENCH benchmarks.

For this purpose helper functions are introduced to compute the payload
size for:
  - Column
  - Table
  - Groupby execution results

This patch relates to #13735.
  • Loading branch information
Martin Marenz committed Aug 30, 2023
1 parent 7b9f4a1 commit 405ca60
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 11 deletions.
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ target_include_directories(

# Use an OBJECT library so we compile these helper source files only once
add_library(
cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp"
synchronization/synchronization.cpp io/cuio_common.cpp
cudf_benchmark_common OBJECT
"${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" synchronization/synchronization.cpp
io/cuio_common.cpp common/memory_statistics.cpp
)
target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
add_custom_command(
Expand Down
71 changes: 71 additions & 0 deletions cpp/benchmarks/common/memory_statistics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "memory_statistics.hpp"

#include <cudf/column/column.hpp>
#include <cudf/null_mask.hpp>

#include <numeric>

// Compute the payload size (data + null mask, excluding metadata) of a column.
uint64_t required_bytes(const cudf::column_view& column)
{
  uint64_t read_bytes = 0;

  switch (column.type().id()) {
    case cudf::type_id::STRING:
      CUDF_FAIL("required bytes not implemented for STRING columns");
      break;
    case cudf::type_id::STRUCT:  // fallthrough
    case cudf::type_id::LIST:
      // Nested types carry their payload in their children; sum them recursively.
      // The init value must be uint64_t: a plain `0` would make std::accumulate
      // carry the sum in int, truncating the lambda's uint64_t result each step.
      read_bytes += std::accumulate(
        column.child_begin(), column.child_end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
          return acc + required_bytes(col);
        });
      break;
    case cudf::type_id::DICTIONARY32:
      CUDF_FAIL("required bytes not implemented for DICTIONARY columns");
      break;
    default:
      CUDF_EXPECTS(cudf::is_fixed_width(column.type()), "Invalid element type");
      // Widen size() (32-bit size_type) before multiplying to avoid overflow.
      read_bytes += static_cast<uint64_t>(column.size()) * cudf::size_of(column.type());
      break;
  }
  // The null mask is part of the payload whenever the column is nullable.
  if (column.nullable()) { read_bytes += cudf::bitmask_allocation_size_bytes(column.size()); }

  return read_bytes;
}

// Compute the payload size of a table as the sum of its columns' payloads.
uint64_t required_bytes(const cudf::table_view& table)
{
  // The init value must be uint64_t: a plain `0` would make std::accumulate
  // carry the running sum in int, truncating totals beyond INT_MAX bytes.
  return std::accumulate(
    table.begin(), table.end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
      return acc + required_bytes(col);
    });
}

// Compute the payload size of all columns produced by a groupby execution.
uint64_t required_bytes(
  const cudf::host_span<cudf::groupby::aggregation_result>& aggregation_results)
{
  uint64_t total = 0;

  // Each aggregation result holds one column per requested aggregation.
  for (auto const& agg_result : aggregation_results) {
    total = std::accumulate(agg_result.results.begin(),
                            agg_result.results.end(),
                            total,
                            [](uint64_t acc, auto const& col) {
                              return acc + required_bytes(col->view());
                            });
  }

  return total;
}
57 changes: 57 additions & 0 deletions cpp/benchmarks/common/memory_statistics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

/**
* @brief Calculate the number of bytes needed to completely read/write the provided column.
*
 * The function computes only the size of the payload of the column in bytes; it excludes
 * any metadata.
*
* @param column View of the input column
* @returns Number of bytes needed to read or write the column.
*/
uint64_t required_bytes(const cudf::column_view& column);

/**
* @brief Calculate the number of bytes needed to completely read/write the provided table.
*
 * The function computes only the size of the payload of the table in bytes; it excludes
 * any metadata.
*
* @param table View of the input table.
* @returns Number of bytes needed to read or write the table.
*/
uint64_t required_bytes(const cudf::table_view& table);

/**
* @brief Calculate the number of bytes needed to completely read/write the provided sequence of
* aggregation results.
*
 * The function computes only the size of the payload of the aggregation results in bytes; it
 * excludes any metadata.
*
* @param aggregation_results Sequence of aggregation results from groupby execution.
* @returns Number of bytes needed to read or write the aggregation results.
*/
uint64_t required_bytes(
const cudf::host_span<cudf::groupby::aggregation_result>& aggregation_results);
27 changes: 21 additions & 6 deletions cpp/benchmarks/groupby/group_max.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>

#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

#include <optional>

template <typename Type>
void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
{
Expand All @@ -32,24 +35,36 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
}();

auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
data_profile profile = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
builder.null_probability(null_freq);
profile.set_null_probability(null_freq);
} else {
builder.no_validity();
profile.set_null_probability(std::nullopt);
}
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
}();

auto keys_view = keys->view();
auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
auto const keys_view = keys->view();
auto const keys_table = cudf::table_view({keys_view, keys_view, keys_view});
auto gb_obj = cudf::groupby::groupby(keys_table);

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table));

// The number of written bytes depends on the random distribution of keys.
// For larger sizes it converges to the number of unique elements
// in the input distribution (101 elements).
auto [res_table, res_agg] = gb_obj.aggregate(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
Expand Down
18 changes: 15 additions & 3 deletions cpp/benchmarks/groupby/group_nunique.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>

#include <cudf/groupby.hpp>

Expand Down Expand Up @@ -58,11 +59,22 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
}();

auto gb_obj =
cudf::groupby::groupby(cudf::table_view({keys->view(), keys->view(), keys->view()}));
auto const requests = make_aggregation_request_vector(
auto const keys_table = cudf::table_view({keys->view(), keys->view(), keys->view()});
auto gb_obj = cudf::groupby::groupby(keys_table);
auto const requests = make_aggregation_request_vector(
*vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table));

// The number of written bytes depends on the random distribution of keys.
// For larger sizes it converges to the number of unique elements
// in the input distribution (101 elements).
auto [res_table, res_agg] = gb_obj.aggregate(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/groupby/group_rank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/groupby.hpp>
Expand Down Expand Up @@ -53,6 +54,17 @@ static void nvbench_groupby_rank(nvbench::state& state,
cudf::groupby::groupby gb_obj(
keys, cudf::null_policy::EXCLUDE, is_sorted ? cudf::sorted::YES : cudf::sorted::NO);

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(order_by));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys));

// The number of written bytes depends on the random distribution of keys.
// For larger sizes it converges to the number of unique elements
// in the input distribution (101 elements).
auto [res_table, res_agg] = gb_obj.scan(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
// groupby scan uses sort implementation
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>

#include <cudf_test/column_wrapper.hpp>

Expand Down Expand Up @@ -83,6 +84,17 @@ void bench_groupby_struct_keys(nvbench::state& state)
auto stream = cudf::get_default_stream();
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table.view()));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));

// The number of written bytes depends on the random distribution of keys.
// For larger sizes it converges to the number of unique elements
// in the input distribution (101 elements).
auto [res_table, res_agg] = gb_obj.aggregate(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
}
Expand Down

0 comments on commit 405ca60

Please sign in to comment.