Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor histogram reduction using cuco::static_set::insert_and_find #16485

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7abfd2a
Add histogram bench
srinivasyadav18 Aug 2, 2024
e0aa78b
refactor histogram using static_map insert_or_apply
srinivasyadav18 Aug 2, 2024
de74ff8
minor nits
srinivasyadav18 Aug 6, 2024
4a939c4
Merge branch 'branch-24.10' into histogram_insert_or_apply
srinivasyadav18 Aug 6, 2024
11464ba
more minor nits
srinivasyadav18 Aug 6, 2024
9029297
fix cmake format
srinivasyadav18 Aug 6, 2024
c9b9bc2
Merge branch 'branch-24.10' into histogram_insert_or_apply
srinivasyadav18 Aug 7, 2024
5fc6c9a
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Sep 24, 2024
54fba88
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Sep 26, 2024
249e957
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 2, 2024
56a5d59
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 2, 2024
d274003
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 3, 2024
920fd16
Refactor `histogram` with `cuco::static_set`
mhaseeb123 Oct 4, 2024
61e173c
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 4, 2024
c2d8153
Minor comment update
mhaseeb123 Oct 4, 2024
9aca13e
Minor improvements
mhaseeb123 Oct 4, 2024
8bef2b0
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 5, 2024
7ddf0aa
Fix for benchmark
mhaseeb123 Oct 5, 2024
2134f51
Update cardinality axis for reduction
mhaseeb123 Oct 5, 2024
d4dd4d2
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 7, 2024
d35b4fc
Minor style fix
mhaseeb123 Oct 7, 2024
aaf6837
Minor style fix
mhaseeb123 Oct 7, 2024
f63fc4a
Style fix
mhaseeb123 Oct 7, 2024
a5dcc82
Remove hash_reduce_by_row and use rmm nosync policy
mhaseeb123 Oct 7, 2024
79f07dd
Add minor comments
mhaseeb123 Oct 7, 2024
de720e5
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 7, 2024
b48b76b
Add functor to dispatch static set with provided equality comparator
mhaseeb123 Oct 8, 2024
813630f
Revert the dispatcher functor
mhaseeb123 Oct 8, 2024
30772ae
Merge branch 'histogram_insert_or_apply' of https://github.com/sriniv…
mhaseeb123 Oct 8, 2024
34d41ab
Merge branch 'branch-24.12' into histogram_insert_or_apply
mhaseeb123 Oct 8, 2024
a872a97
Revert the functor
mhaseeb123 Oct 9, 2024
ab733d5
Minor
mhaseeb123 Oct 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ ConfigureBench(
reduction/reduce.cpp reduction/scan.cpp
)
ConfigureNVBench(
REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp
REDUCTION_NVBENCH reduction/histogram.cpp reduction/rank.cpp reduction/scan_structs.cpp
reduction/segmented_reduce.cpp
)

# ##################################################################################################
Expand All @@ -231,7 +232,7 @@ ConfigureBench(
)

ConfigureNVBench(
GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp
GROUPBY_NVBENCH groupby/group_histogram.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp
groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
)

Expand Down
86 changes: 86 additions & 0 deletions cpp/benchmarks/groupby/group_histogram.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Runs a single groupby HISTOGRAM benchmark configuration on randomly generated data.
 *
 * Creates an `int32_t` key column and a `Type` value column with `num_rows` rows each, times
 * `cudf::groupby::groupby::aggregate` with one histogram aggregation request, and reports the
 * derived throughput and the peak memory usage through `state`.
 *
 * @tparam Type Element type of the values column
 * @param state nvbench state that runs the measurement and receives the reported metrics
 * @param num_rows Number of rows of the key column and of the value column
 * @param cardinality Cardinality setting forwarded to the key column's data profile
 * @param null_probability Null probability for the value column; when it is not greater than
 *        zero the value profile is built with `no_validity()` instead
 */
template <typename Type>
void groupby_histogram_helper(nvbench::state& state,
                              cudf::size_type num_rows,
                              cudf::size_type cardinality,
                              double null_probability)
{
  // Key column: int32, no validity, uniform distribution with bounds 0 and num_rows.
  auto const key_type_id = cudf::type_to_id<int32_t>();
  data_profile const key_profile =
    data_profile_builder()
      .cardinality(cardinality)
      .no_validity()
      .distribution(key_type_id, distribution_id::UNIFORM, 0, num_rows);
  auto const key_column = create_random_column(key_type_id, row_count{num_rows}, key_profile);

  // Value column: element type `Type`, cardinality setting 0, optionally nullable.
  auto const value_type_id = cudf::type_to_id<Type>();
  auto value_profile_builder =
    data_profile_builder().cardinality(0).distribution(
      value_type_id, distribution_id::UNIFORM, 0, num_rows);
  if (null_probability > 0) {
    value_profile_builder.null_probability(null_probability);
  } else {
    value_profile_builder.no_validity();
  }
  auto const value_column =
    create_random_column(value_type_id, row_count{num_rows}, data_profile{value_profile_builder});

  auto const keys_view = key_column->view();
  auto grouper         = cudf::groupby::groupby(cudf::table_view({keys_view}));

  // A single request: histogram aggregation over the value column.
  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  auto& histogram_request  = requests.back();
  histogram_request.values = value_column->view();
  histogram_request.aggregations.push_back(
    cudf::make_histogram_aggregation<cudf::groupby_aggregation>());

  // The logger is created before the timed region so that the peak usage read afterwards
  // covers the aggregation calls.
  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    auto const histogram = grouper.aggregate(requests);
  });

  // Throughput is derived from the mean GPU time summary recorded by nvbench.
  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  auto const mrows_per_sec = static_cast<double>(num_rows) / mean_gpu_time / 1'000'000.;
  state.add_element_count(mrows_per_sec, "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

/**
 * @brief nvbench entry point for the groupby HISTOGRAM benchmark.
 *
 * Reads the `cardinality`, `num_rows` and `null_probability` axis values from `state` and
 * forwards them to `groupby_histogram_helper`. Configurations whose requested cardinality
 * exceeds the number of rows are skipped.
 *
 * @tparam Type Element type of the values column, selected by the nvbench type axis
 * @param state nvbench state holding the axis values of the current configuration
 */
template <typename Type>
void bench_groupby_histogram(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const null_probability = state.get_float64("null_probability");

  // A column cannot hold more distinct keys than it has rows, so such configurations would
  // only duplicate the measurements of smaller cardinalities. Same guard as the reduction
  // histogram benchmark.
  if (cardinality > num_rows) {
    state.skip("cardinality > num_rows");
    return;
  }

  groupby_histogram_helper<Type>(state, num_rows, cardinality, null_probability);
}

// Registers `bench_groupby_histogram` with nvbench under the name "groupby_histogram".
// The type axis instantiates it for int32_t, int64_t, float and double value columns.
// `num_rows` is registered through the power-of-two axis helper, so {12, 18, 24} are the
// exponents passed to that helper rather than literal row counts.
NVBENCH_BENCH_TYPES(bench_groupby_histogram,
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
.set_name("groupby_histogram")
.add_float64_axis("null_probability", {0, 0.1, 0.9})
.add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
.add_int64_power_of_two_axis("num_rows", {12, 18, 24});

74 changes: 74 additions & 0 deletions cpp/benchmarks/reduction/histogram.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "cudf/aggregation.hpp"
#include "cudf/detail/aggregation/aggregation.hpp"

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/nvbench_utilities.hpp>
#include <benchmarks/common/table_utilities.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/detail/aggregation/aggregation.hpp>
#include <cudf/reduction.hpp>
#include <cudf/reduction/detail/histogram.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief nvbench entry point for the reduction HISTOGRAM benchmark.
 *
 * Generates one random column of element type `type` from the `cardinality`, `num_rows` and
 * `null_probability` axis values, then times `cudf::reduce` with a histogram aggregation on
 * the stream provided by nvbench. Configurations with `cardinality > num_rows` are skipped.
 *
 * @tparam type Element type of the input column, selected by the nvbench type axis
 * @param state nvbench state holding the axis values of the current configuration
 */
template <typename type>
static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list<type>)
{
  auto const column_type_id = cudf::type_to_id<type>();

  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const null_probability = state.get_float64("null_probability");

  // Only run configurations whose requested cardinality fits into the row count.
  bool const cardinality_fits = cardinality <= num_rows;
  if (!cardinality_fits) {
    state.skip("cardinality > num_rows");
    return;
  }

  data_profile const input_profile =
    data_profile_builder()
      .null_probability(null_probability)
      .cardinality(cardinality)
      .distribution(column_type_id, distribution_id::UNIFORM, 0, num_rows);

  auto const input_column =
    create_random_column(column_type_id, row_count{num_rows}, input_profile);
  auto histogram_agg = cudf::make_histogram_aggregation<cudf::reduce_aggregation>();

  // The reduction is issued on the stream nvbench hands to the launch callback.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto const stream = rmm::cuda_stream_view{launch.get_stream()};
    auto histogram =
      cudf::reduce(*input_column, *histogram_agg, input_column->type(), stream);
  });

  state.add_element_count(input_column->size());
}

// Element types the reduction histogram benchmark is instantiated for.
using data_type = nvbench::type_list<int32_t, int64_t>;

// Registers `nvbench_reduction_histogram` with nvbench under the name "histogram".
// Each axis below contributes one dimension to the set of benchmarked configurations.
NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type))
  .set_name("histogram")
  .add_float64_axis("null_probability", {0.1})
  .add_int64_axis("cardinality", {1})
  .add_int64_axis("num_rows",
                  {
                    10'000,       // 10k
                    100'000,      // 100k
                    1'000'000,    // 1M
                    10'000'000,   // 10M
                    100'000'000,  // 100M
                  });
2 changes: 1 addition & 1 deletion cpp/include/cudf/reduction/detail/histogram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace cudf::reduction::detail {
* @param partial_counts An optional column containing count for each row
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate memory of the returned objects
* @return A pair of array contains the (stable-order) indices of the distinct rows in the input
* @return A pair of arrays containing the indices of the distinct rows in the input
* table, and their corresponding distinct counts
*/
[[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/groupby/sort/group_histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <rmm/resource_ref.hpp>

#include <thrust/gather.h>
#include <thrust/sort.h>

namespace cudf::groupby::detail {

Expand Down Expand Up @@ -57,6 +58,12 @@ std::unique_ptr<column> build_histogram(column_view const& values,
auto [distinct_indices, distinct_counts] =
cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr);

// compute_row_frequencies does not provide stable ordering
thrust::sort_by_key(rmm::exec_policy(stream),
distinct_indices->begin(),
distinct_indices->end(),
distinct_counts->mutable_view().begin<int64_t>());
mhaseeb123 marked this conversation as resolved.
Show resolved Hide resolved

// Gather the distinct rows for the output histogram.
auto out_table = cudf::detail::gather(labeled_values,
*distinct_indices,
Expand Down
Loading
Loading