Skip to content

Commit

Permalink
Add a shortcut for when the input clusters are all empty for the tdig…
Browse files Browse the repository at this point in the history
…est merge (#16897)

Fixes #16881. This is a new attempt to fix it.

Previously in #16675, I flipped the `has_nulls` flag to true as I thought that empty clusters should be explicitly stored in the offsets and handled properly. It turns out that it was not a good idea. After a long debugging process, I am convinced now that the existing logic is valid and should work well except for one case, where all input tdigests to the tdigest merge are empty. So, I have decided to add a [shortcut to handle that particular edge case](https://github.com/rapidsai/cudf/pull/16897/files#diff-c03df2b421f7a51b28007d575fd32ba2530970351ba7e7e0f7fad8057350870cR1349-R1354) in `group_merge_tdigest()` in this PR. This shortcut is executed only when all clusters are empty in all groups. This PR does not change any other logic. Other changes in this PR are:

- New unit tests to cover the edge case.
- `make_empty_tdigest_column` has been renamed to `make_tdigest_column_of_empty_clusters` and expanded to take `num_rows`.
- Some new documentation based on my understanding for the `merge_tdigests()` function.

Before making this PR, I have run the integration tests of the spark-rapids that were previously reported in NVIDIA/spark-rapids#11463 that my first attempt had caused them failing. They have all passed with this PR change.

Authors:
  - Jihoon Son (https://github.com/jihoonson)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - https://github.com/nvdbaranec

URL: #16897
  • Loading branch information
jihoonson authored Oct 1, 2024
1 parent 04baa22 commit 194d5f4
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 98 deletions.
18 changes: 10 additions & 8 deletions cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,28 +143,30 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
rmm::device_async_resource_ref mr);

/**
* @brief Create an empty tdigest column.
* @brief Create a tdigest column of empty tdigests.
*
* An empty tdigest column contains a single row of length 0
* The column created contains the specified number of rows of empty tdigests.
*
* @param num_rows The number of rows in the output column.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns An empty tdigest column.
* @returns A tdigest column of empty clusters.
*/
CUDF_EXPORT
std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Create an empty tdigest scalar.
* @brief Create a scalar of an empty tdigest cluster.
*
* An empty tdigest scalar is a struct_scalar that contains a single row of length 0
* The returned scalar is a struct_scalar that contains a single row of an empty cluster.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns An empty tdigest scalar.
* @returns A scalar of an empty tdigest cluster.
*/
std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
Expand Down
20 changes: 10 additions & 10 deletions cpp/include/cudf_test/tdigest_utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);

// NOTE: an empty tdigest column still has 1 row.
auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
}
Expand Down Expand Up @@ -562,12 +562,12 @@ template <typename MergeFunc>
void tdigest_merge_empty(MergeFunc merge_op)
{
// 3 empty tdigests all in the same group
auto a = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto b = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto c = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto a = cudf::tdigest::detail::make_empty_tdigests_column(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto b = cudf::tdigest::detail::make_empty_tdigests_column(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto c = cudf::tdigest::detail::make_empty_tdigests_column(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
std::vector<column_view> cols;
cols.push_back(*a);
cols.push_back(*b);
Expand All @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
auto const delta = 1000;
auto result = merge_op(*values, delta);

auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto expected = cudf::tdigest::detail::make_empty_tdigests_column(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
}
Expand Down
23 changes: 12 additions & 11 deletions cpp/src/quantiles/tdigest/tdigest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -292,32 +292,33 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
}

std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
std::unique_ptr<column> make_empty_tdigests_column(size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto offsets = cudf::make_fixed_width_column(
data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
thrust::fill(rmm::exec_policy(stream),
offsets->mutable_view().begin<size_type>(),
offsets->mutable_view().end<size_type>(),
0);

auto min_col =
cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
auto min_col = cudf::make_numeric_column(
data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
thrust::fill(rmm::exec_policy(stream),
min_col->mutable_view().begin<double>(),
min_col->mutable_view().end<double>(),
0);
auto max_col =
cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
auto max_col = cudf::make_numeric_column(
data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
thrust::fill(rmm::exec_policy(stream),
max_col->mutable_view().begin<double>(),
max_col->mutable_view().end<double>(),
0);

return make_tdigest_column(1,
make_empty_column(type_id::FLOAT64),
make_empty_column(type_id::FLOAT64),
return make_tdigest_column(num_rows,
cudf::make_empty_column(type_id::FLOAT64),
cudf::make_empty_column(type_id::FLOAT64),
std::move(offsets),
std::move(min_col),
std::move(max_col),
Expand All @@ -338,7 +339,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto contents = make_empty_tdigest_column(stream, mr)->release();
auto contents = make_empty_tdigests_column(1, stream, mr)->release();
return std::make_unique<struct_scalar>(
std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
}
Expand Down
Loading

0 comments on commit 194d5f4

Please sign in to comment.