Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switched AST benchmarks from GoogleBench to NVBench #16952

Merged
merged 10 commits into from
Oct 4, 2024
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)

# ##################################################################################################
# * ast benchmark ---------------------------------------------------------------------------------
ConfigureBench(AST_BENCH ast/transform.cpp)
ConfigureNVBench(AST_BENCH ast/transform.cpp)
lamarrr marked this conversation as resolved.
Show resolved Hide resolved

# ##################################################################################################
# * binaryop benchmark ----------------------------------------------------------------------------
Expand Down
51 changes: 19 additions & 32 deletions cpp/benchmarks/ast/transform.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,14 +15,17 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/transform.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <list>
#include <memory>
Expand All @@ -35,13 +38,10 @@ enum class TreeType {
};

// Fixture class template for the AST benchmarks: an empty type that derives from
// cudf::benchmark and adds no members of its own.
template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
static void BM_ast_transform(benchmark::State& state)
static void BM_ast_transform(nvbench::state& state)
{
auto const table_size{static_cast<cudf::size_type>(state.range(0))};
auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));

// Create table data
auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
Expand Down Expand Up @@ -86,38 +86,25 @@ static void BM_ast_transform(benchmark::State& state)

auto const& expression_tree_root = expressions.back();

// Execute benchmark
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()});
cudf::compute_column(table, expression_tree_root);
}
});

// Use the number of bytes read from global memory
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
(tree_levels + 1) * sizeof(key_type));
}

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
state.add_global_memory_reads(static_cast<int64_t>(state.get_summaries().size()) * table_size *
(tree_levels + 1) * sizeof(key_type));
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) \
static void name(::nvbench::state& st) \
{ \
BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); \
} \
BENCHMARK_REGISTER_F(AST, name) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) \
.add_int64_axis("tree_levels", {1, 5, 10})

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
Expand Down
33 changes: 18 additions & 15 deletions cpp/benchmarks/synchronization/synchronization.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,26 +21,29 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

/**
 * Overwrites a scratch device buffer as large as the current device's L2 cache.
 *
 * The L2 size is queried from the current device; when it is reported as zero (or
 * negative) nothing is allocated or written. The memset is only enqueued on
 * `stream` -- this function does not synchronize the stream itself.
 *
 * @param stream Stream used both to allocate the scratch buffer and to enqueue the memset
 */
void flush_device_L2_cache(rmm::cuda_stream_view stream)
lamarrr marked this conversation as resolved.
Show resolved Hide resolved
{
// Identify the device whose L2 cache size should be queried.
int current_device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&current_device));

// Ask the runtime for the L2 cache size (in bytes) of that device.
int l2_cache_bytes = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

// Skip the flush entirely when no L2 size is reported.
if (l2_cache_bytes > 0) {
int const memset_value = 0;
// Scratch buffer sized to the whole L2 cache, allocated on `stream`.
// NOTE(review): the buffer goes out of scope right after the async memset is enqueued;
// presumably rmm::device_buffer frees in stream order on `stream` -- confirm.
rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
CUDF_CUDA_TRY(
cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
}
}

cuda_event_timer::cuda_event_timer(benchmark::State& state,
bool flush_l2_cache,
rmm::cuda_stream_view stream)
: stream(stream), p_state(&state)
{
// flush all of L2$
if (flush_l2_cache) {
int current_device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&current_device));

int l2_cache_bytes = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));

if (l2_cache_bytes > 0) {
int const memset_value = 0;
rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
CUDF_CUDA_TRY(
cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
}
}
if (flush_l2_cache) { flush_device_L2_cache(stream); }

CUDF_CUDA_TRY(cudaEventCreate(&start));
CUDF_CUDA_TRY(cudaEventCreate(&stop));
Expand Down
6 changes: 6 additions & 0 deletions cpp/benchmarks/synchronization/synchronization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@
#include <benchmark/benchmark.h>
#include <driver_types.h>

/**
* @brief Flushes the device L2 cache by zero-filling a scratch buffer the size of the L2 cache.
*
* @param stream CUDA stream on which the scratch buffer is allocated and the memset is
* enqueued; defaults to `cudf::get_default_stream()`
*/
void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream());

class cuda_event_timer {
public:
/**
Expand Down
Loading