From 40a0415d4cdcb625198d81f6426d7a0505072293 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Fri, 27 Sep 2024 21:00:38 +0100 Subject: [PATCH 1/7] switched ast benchmarks from googlebench to nvbench --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/ast/transform.cpp | 51 +++++++------------ .../synchronization/synchronization.cpp | 33 ++++++------ .../synchronization/synchronization.hpp | 6 +++ 4 files changed, 44 insertions(+), 48 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..d99689befdc 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_BENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..a2957a21c01 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,17 @@ */ #include -#include #include #include #include +#include + #include +#include + #include #include #include @@ -35,13 +38,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("table_size")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -86,38 +86,25 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()}); cudf::compute_column(table, expression_tree_root); - } + }); // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.add_global_memory_reads(static_cast(state.get_summaries().size()) * table_size * + (tree_levels + 1) * sizeof(key_type)); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + 
static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("tree_levels", {1, 5, 10}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp index 5993bb23542..c6419be4cc1 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,26 +21,29 @@ #include #include +void flush_device_L2_cache(rmm::cuda_stream_view stream) +{ + int current_device = 0; + CUDF_CUDA_TRY(cudaGetDevice(&current_device)); + + int l2_cache_bytes = 0; + CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); + + if (l2_cache_bytes > 0) { + int const memset_value = 0; + rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); + CUDF_CUDA_TRY( + cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); + } +} + cuda_event_timer::cuda_event_timer(benchmark::State& state, bool flush_l2_cache, rmm::cuda_stream_view stream) : stream(stream), p_state(&state) { // flush all of L2$ - if (flush_l2_cache) { - int current_device = 0; - CUDF_CUDA_TRY(cudaGetDevice(&current_device)); - - int l2_cache_bytes = 0; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - int const memset_value = 0; - rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); - CUDF_CUDA_TRY( - cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); - } - } + if (flush_l2_cache) { flush_device_L2_cache(stream); } CUDF_CUDA_TRY(cudaEventCreate(&start)); CUDF_CUDA_TRY(cudaEventCreate(&stop)); diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index cc3bf828d60..69f3230fa59 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -69,6 +69,12 @@ #include #include +/** + * @brief clears the L2$ by cudaMemset'ing a buffer of L2$ size + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream()); + class cuda_event_timer { public: /** From cb8812d778a185d0c29300399883ddd18be811ca Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Sat, 28 Sep 2024 
01:04:23 +0100 Subject: [PATCH 2/7] reverted port of device L2 cache flush util --- cpp/benchmarks/ast/transform.cpp | 7 ++--- .../synchronization/synchronization.cpp | 31 +++++++++---------- .../synchronization/synchronization.hpp | 6 ---- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index a2957a21c01..66d268f15d1 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -86,10 +85,8 @@ static void BM_ast_transform(nvbench::state& state) auto const& expression_tree_root = expressions.back(); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { - flush_device_L2_cache(rmm::cuda_stream_view{state.get_cuda_stream().get_stream()}); cudf::compute_column(table, expression_tree_root); - }); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); // Use the number of bytes read from global memory state.add_global_memory_reads(static_cast(state.get_summaries().size()) * table_size * diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp index c6419be4cc1..760668e22ec 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -21,29 +21,26 @@ #include #include -void flush_device_L2_cache(rmm::cuda_stream_view stream) -{ - int current_device = 0; - CUDF_CUDA_TRY(cudaGetDevice(&current_device)); - - int l2_cache_bytes = 0; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - int const memset_value = 0; - rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); - CUDF_CUDA_TRY( - cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); - } -} - cuda_event_timer::cuda_event_timer(benchmark::State& state, 
bool flush_l2_cache, rmm::cuda_stream_view stream) : stream(stream), p_state(&state) { // flush all of L2$ - if (flush_l2_cache) { flush_device_L2_cache(stream); } + if (flush_l2_cache) { + int current_device = 0; + CUDF_CUDA_TRY(cudaGetDevice(&current_device)); + + int l2_cache_bytes = 0; + CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); + + if (l2_cache_bytes > 0) { + int const memset_value = 0; + rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); + CUDF_CUDA_TRY( + cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); + } + } CUDF_CUDA_TRY(cudaEventCreate(&start)); CUDF_CUDA_TRY(cudaEventCreate(&stop)); diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index 69f3230fa59..cc3bf828d60 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -69,12 +69,6 @@ #include #include -/** - * @brief clears the L2$ by cudaMemset'ing a buffer of L2$ size - * @param stream CUDA stream used for device memory operations and kernel launches - */ -void flush_device_L2_cache(rmm::cuda_stream_view stream = cudf::get_default_stream()); - class cuda_event_timer { public: /** From 6988454e6f27a2f2ecc4929383701f51012b63b6 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Sat, 28 Sep 2024 01:06:05 +0100 Subject: [PATCH 3/7] renamed AST_BENCH to AST_NVBENCH --- cpp/benchmarks/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d99689befdc..a12002752e7 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -330,7 +330,7 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark 
--------------------------------------------------------------------------------- -ConfigureNVBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- From 6588b73a5810a538186f8b2a0a7c8fe87878bbe6 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Sat, 28 Sep 2024 01:07:55 +0100 Subject: [PATCH 4/7] reverted license notice change --- cpp/benchmarks/synchronization/synchronization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp index 760668e22ec..5993bb23542 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From ca9575647a855ca369852023ae9bbf21966b3df7 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Mon, 30 Sep 2024 12:24:13 +0100 Subject: [PATCH 5/7] fixed global memory read calculation for ast benchmark Co-authored-by: Yunsong Wang --- cpp/benchmarks/ast/transform.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 66d268f15d1..0f5787955da 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -89,8 +89,7 @@ static void BM_ast_transform(nvbench::state& state) [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); // Use the number of bytes read from global memory - state.add_global_memory_reads(static_cast(state.get_summaries().size()) * table_size * - (tree_levels + 1) * sizeof(key_type)); + state.add_global_memory_reads(table_size * (tree_levels + 1)); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ From 242a8b438eeba5c1ce16aded99fff4082ee66baa Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Tue, 1 Oct 2024 10:00:55 +0000 Subject: [PATCH 6/7] fixed ast benchmarks display of throughput measurements --- cpp/benchmarks/ast/transform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 0f5787955da..af14d48d242 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -85,11 +85,11 @@ static void BM_ast_transform(nvbench::state& state) auto const& expression_tree_root = expressions.back(); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); - // Use the number of bytes read from global memory state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } 
#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ From 5cc26c340d97065c23eb1046392dc3b2ac3b9982 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Tue, 1 Oct 2024 16:34:41 +0100 Subject: [PATCH 7/7] reordered ast benchmarks's axis --- cpp/benchmarks/ast/transform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index af14d48d242..f44f26e4d2c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -99,8 +99,8 @@ static void BM_ast_transform(nvbench::state& state) } \ NVBENCH_BENCH(name) \ .set_name(#name) \ - .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ - .add_int64_axis("tree_levels", {1, 5, 10}) + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);