Skip to content

Commit

Permalink
Switched BINARY_OP Benchmarks from GoogleBench to NVBench (#16963)
Browse files Browse the repository at this point in the history
This merge request switches the benchmarking framework used by the BINARY_OP benchmarks from GoogleBench to NVBench.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: #16963
  • Loading branch information
lamarrr authored Oct 4, 2024
1 parent 2fa2e6a commit a784321
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 74 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ ConfigureNVBench(AST_NVBENCH ast/transform.cpp)

# ##################################################################################################
# * binaryop benchmark ----------------------------------------------------------------------------
ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)

# ##################################################################################################
# * nvtext benchmark -------------------------------------------------------------------
Expand Down
65 changes: 19 additions & 46 deletions cpp/benchmarks/binaryop/binaryop.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,15 +15,14 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <vector>

// This set of benchmarks is designed to be a comparison for the AST benchmarks

Expand All @@ -33,23 +32,21 @@ enum class TreeType {
};

template <typename key_type, TreeType tree_type, bool reuse_columns>
class BINARYOP : public cudf::benchmark {};

template <typename key_type, TreeType tree_type, bool reuse_columns>
static void BM_binaryop_transform(benchmark::State& state)
static void BM_binaryop_transform(nvbench::state& state)
{
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};

// Create table data
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
auto const source_table = create_sequence_table(
cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
cudf::table_view table{*source_table};

// Execute benchmark
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
// Use the number of bytes read from global memory
state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
// Execute tree that chains additions like (((a + b) + c) + d)
auto const op = cudf::binary_operator::ADD;
auto const result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
Expand All @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state)
result = cudf::binary_operation(result->view(), col, op, result_data_type);
});
}
}

// Use the number of bytes read from global memory
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
(tree_levels + 1) * sizeof(key_type));
});
}

#define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \
(::benchmark::State & st) { BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); }
\
static void name(::nvbench::state& st) \
{ \
BM_binaryop_transform<key_type, tree_type, reuse_columns>(st); \
} \
NVBENCH_BENCH(name) \
.add_int64_axis("tree_levels", {1, 2, 5, 10}) \
.add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})

BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
int32_t,
Expand All @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
double,
TreeType::IMBALANCED_LEFT,
false);

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 2, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
}

BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();
47 changes: 20 additions & 27 deletions cpp/benchmarks/binaryop/compiled_binaryop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,18 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/binaryop.hpp>

class COMPILED_BINARYOP : public cudf::benchmark {};
#include <nvbench/nvbench.cuh>

template <typename TypeLhs, typename TypeRhs, typename TypeOut>
void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
{
auto const column_size{static_cast<cudf::size_type>(state.range(0))};
auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));

auto const source_table = create_random_table(
{cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{column_size});
{cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});

auto lhs = cudf::column_view(source_table->get_column(0));
auto rhs = cudf::column_view(source_table->get_column(1));
Expand All @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
// Call once for hot cache.
cudf::binary_operation(lhs, rhs, binop, output_dtype);

for (auto _ : state) {
cuda_event_timer timer(state, true);
cudf::binary_operation(lhs, rhs, binop, output_dtype);
}

// use number of bytes read and written to global memory
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
(sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut)));
state.add_global_memory_reads<TypeLhs>(table_size);
state.add_global_memory_reads<TypeRhs>(table_size);
state.add_global_memory_reads<TypeOut>(table_size);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
}

#define BM_STRINGIFY(a) #a

// TODO tparam boolean for null.
#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \
BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \
(::benchmark::State & st) \
{ \
BM_compiled_binaryop<lhs, rhs, tout>(st, cudf::binary_operator::bop); \
} \
BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */
#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \
static void name(::nvbench::state& st) \
{ \
::BM_compiled_binaryop<lhs, rhs, tout>(st, ::cudf::binary_operator::bop); \
} \
NVBENCH_BENCH(name) \
.set_name("compiled_binary_op_" BM_STRINGIFY(name)) \
.add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})

#define build_name(a, b, c, d) a##_##b##_##c##_##d

Expand Down

0 comments on commit a784321

Please sign in to comment.