diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 96e24efac8a..7f0570113f4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -40,8 +40,9 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" - synchronization/synchronization.cpp io/cuio_common.cpp + cudf_benchmark_common OBJECT + "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" synchronization/synchronization.cpp + io/cuio_common.cpp common/memory_statistics.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/memory_statistics.cpp b/cpp/benchmarks/common/memory_statistics.cpp new file mode 100644 index 00000000000..42f13417797 --- /dev/null +++ b/cpp/benchmarks/common/memory_statistics.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "memory_statistics.hpp"
+
+#include
+#include
+
+#include
+
+uint64_t required_bytes(const cudf::column_view& column)
+{
+  uint64_t read_bytes = 0;
+
+  switch (column.type().id()) {
+    case cudf::type_id::STRING:
+      CUDF_FAIL("required bytes not implemented for STRING columns");
+      break;
+    case cudf::type_id::STRUCT:  // fallthrough
+    case cudf::type_id::LIST:
+      read_bytes += std::accumulate(  // uint64_t seed: an int seed would sum (and overflow) in int
+        column.child_begin(), column.child_end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
+          return acc + required_bytes(col);
+        });
+      break;
+    case cudf::type_id::DICTIONARY32:
+      CUDF_FAIL("required bytes not implemented for DICTIONARY columns");
+      break;
+    default:  // only fixed-width types reach the size computation below
+      CUDF_EXPECTS(cudf::is_fixed_width(column.type()), "Invalid element type");
+      read_bytes += column.size() * cudf::size_of(column.type());
+      break;
+  }
+  if (column.nullable()) { read_bytes += cudf::bitmask_allocation_size_bytes(column.size()); }
+
+  return read_bytes;
+}
+
+uint64_t required_bytes(const cudf::table_view& table)
+{
+  // Seed with uint64_t so the running total is never narrowed to int between columns.
+  auto const add = [](uint64_t acc, const auto& col) { return acc + required_bytes(col); };
+  return std::accumulate(table.begin(), table.end(), uint64_t{0}, add);
+}
+
+uint64_t required_bytes(
+  const cudf::host_span& aggregation_results)
+{
+  uint64_t read_bytes = 0;
+
+  for (auto const& aggregation : aggregation_results) {  // vector of aggregation results
+    for (auto const& col : aggregation.results) {        // vector of columns per result
+      read_bytes += required_bytes(col->view());
+    }
+  }
+
+  return read_bytes;
+}
diff --git a/cpp/benchmarks/common/memory_statistics.hpp b/cpp/benchmarks/common/memory_statistics.hpp
new file mode 100644
index 00000000000..3fbf4405e17
--- /dev/null
+++ b/cpp/benchmarks/common/memory_statistics.hpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +/** + * @brief Calculate the number of bytes needed to completely read/write the provided column. + * + * The function computes only the size of the payload of the column in bytes; it excludes + * any metadata. + * + * @param column View of the input column. + * @returns Number of bytes needed to read or write the column. + */ +uint64_t required_bytes(const cudf::column_view& column); + +/** + * @brief Calculate the number of bytes needed to completely read/write the provided table. + * + * The function computes only the size of the payload of the table in bytes; it excludes + * any metadata. + * + * @param table View of the input table. + * @returns Number of bytes needed to read or write the table. + */ +uint64_t required_bytes(const cudf::table_view& table); + +/** + * @brief Calculate the number of bytes needed to completely read/write the provided sequence of + * aggregation results. + * + * The function computes only the size of the payload of the aggregation results in bytes; it + * excludes any metadata. + * + * @param aggregation_results Sequence of aggregation results from groupby execution. + * @returns Number of bytes needed to read or write the aggregation results. 
+ */ +uint64_t required_bytes( + const cudf::host_span& aggregation_results); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index e65c37f001d..f74412809f8 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -15,11 +15,14 @@ */ #include +#include #include #include +#include + template void bench_groupby_max(nvbench::state& state, nvbench::type_list) { @@ -32,24 +35,36 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list) }(); auto const vals = [&] { - auto builder = data_profile_builder().cardinality(0).distribution( + data_profile profile = data_profile_builder().cardinality(0).distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 1000); if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) { - builder.null_probability(null_freq); + profile.set_null_probability(null_freq); } else { - builder.no_validity(); + profile.set_null_probability(std::nullopt); } - return create_random_column(cudf::type_to_id(), row_count{size}, data_profile{builder}); + return create_random_column(cudf::type_to_id(), row_count{size}, profile); }(); - auto keys_view = keys->view(); - auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); + auto const keys_view = keys->view(); + auto const keys_table = cudf::table_view({keys_view, keys_view, keys_view}); + auto gb_obj = cudf::groupby::groupby(keys_table); std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_max_aggregation()); + // Add memory statistics + state.add_global_memory_reads(required_bytes(vals->view())); + state.add_global_memory_reads(required_bytes(keys_table)); + + // The number of written bytes depends on random distribution of keys. 
+ // For larger sizes it converges against the number of unique elements + // in the input distribution (101 elements) + auto [res_table, res_agg] = gb_obj.aggregate(requests); + state.add_global_memory_writes(required_bytes(res_table->view())); + state.add_global_memory_writes(required_bytes(res_agg)); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 63d738b2951..34e1c52b280 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -15,6 +15,7 @@ */ #include +#include #include @@ -58,11 +59,22 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) return create_random_column(cudf::type_to_id(), row_count{size}, profile); }(); - auto gb_obj = - cudf::groupby::groupby(cudf::table_view({keys->view(), keys->view(), keys->view()})); - auto const requests = make_aggregation_request_vector( + auto const keys_table = cudf::table_view({keys->view(), keys->view(), keys->view()}); + auto gb_obj = cudf::groupby::groupby(keys_table); + auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); + // Add memory statistics + state.add_global_memory_reads(required_bytes(vals->view())); + state.add_global_memory_reads(required_bytes(keys_table)); + + // The number of written bytes depends on random distribution of keys. 
+ // For larger sizes it converges against the number of unique elements + // in the input distribution (101 elements) + auto [res_table, res_agg] = gb_obj.aggregate(requests); + state.add_global_memory_writes(required_bytes(res_table->view())); + state.add_global_memory_writes(required_bytes(res_agg)); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index 2122720a421..fff0f945277 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include @@ -53,6 +54,17 @@ static void nvbench_groupby_rank(nvbench::state& state, cudf::groupby::groupby gb_obj( keys, cudf::null_policy::EXCLUDE, is_sorted ? cudf::sorted::YES : cudf::sorted::NO); + // Add memory statistics + state.add_global_memory_reads(required_bytes(order_by)); + state.add_global_memory_reads(required_bytes(keys)); + + // The number of written bytes depends on random distribution of keys. 
+ // For larger sizes it converges against the number of unique elements + // in the input distribution (101 elements) + auto [res_table, res_agg] = gb_obj.scan(requests); + state.add_global_memory_writes(required_bytes(res_table->view())); + state.add_global_memory_writes(required_bytes(res_agg)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; // groupby scan uses sort implementation diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 44a12c1c30e..fad9bf194f4 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -15,6 +15,7 @@ */ #include +#include #include @@ -83,6 +84,17 @@ void bench_groupby_struct_keys(nvbench::state& state) auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + // Add memory statistics + state.add_global_memory_reads(required_bytes(keys_table.view())); + state.add_global_memory_reads(required_bytes(vals->view())); + + // The number of written bytes depends on random distribution of keys. + // For larger sizes it converges against the number of unique elements + // in the input distribution (101 elements) + auto [res_table, res_agg] = gb_obj.aggregate(requests); + state.add_global_memory_writes(required_bytes(res_table->view())); + state.add_global_memory_writes(required_bytes(res_agg)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); }