From 182843ac16d9f4a7202d3156261b70fe0326dced Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 10 Sep 2024 14:19:27 -0700 Subject: [PATCH] Fix empty cluster handling in tdigest merge --- cpp/include/cudf/detail/tdigest/tdigest.hpp | 17 +-- cpp/include/cudf_test/tdigest_utilities.cuh | 20 +-- cpp/src/quantiles/tdigest/tdigest.cu | 23 +-- .../quantiles/tdigest/tdigest_aggregation.cu | 84 +++++++++-- cpp/tests/groupby/tdigest_tests.cu | 135 +++++++++++++++++- .../quantiles/percentile_approx_test.cpp | 4 +- 6 files changed, 233 insertions(+), 50 deletions(-) diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 80a4460023f..672b95e2d01 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -143,28 +143,29 @@ std::unique_ptr make_tdigest_column(size_type num_rows, rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest column. + * @brief Create a tdigest column of empty clusters. * - * An empty tdigest column contains a single row of length 0 + * The column created contains the specified number of rows of empty clusters. * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest column. + * @returns A tdigest column of empty clusters. */ CUDF_EXPORT -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest scalar. + * @brief Create a scalar of an empty tdigest cluster. * - * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 + * The returned scalar is a struct_scalar that contains a single row of an empty cluster. * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest scalar. + * @returns A scalar of an empty tdigest cluster. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 1758790cd64..be7d19b2227 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 0d017cf1f13..76cd55bf994 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,32 +292,33 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto min_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(1, - make_empty_column(type_id::FLOAT64), - make_empty_column(type_id::FLOAT64), + return make_tdigest_column(num_rows, + cudf::make_empty_column(type_id::FLOAT64), + cudf::make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -338,7 +339,7 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_empty_tdigest_column(stream, mr)->release(); + auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release(); return std::make_unique( std::move(*std::make_unique(std::move(contents.children))), true, stream, mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 2dd25a7b890..04e66318e71 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -367,7 +367,6 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_num_clusters Output. The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters * @param has_nulls Whether or not the input contains nulls - * */ template @@ -661,6 +660,10 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } +/** + * @brief A functor which returns the cluster index within a group that the value at + * the given value index falls into. + */ template struct compute_tdigests_keys_fn { int const delta; @@ -706,8 +709,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param values_begin Beginning of the range of input values. - * @param values_end End of the range of input values. + * @param centroids_begin Beginning of the range of centroids. + * @param centroids_end End of the range of centroids. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -750,7 +753,9 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (total_clusters == 0) { + return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); + } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -1021,6 +1026,38 @@ struct group_key_func { } }; +/** + * @brief Perform a merge aggregation of tdigests. This function usually takes the input as the + * outputs of multiple `typed_group_tdigest` calls, and merges them. + * + * A tdigest cluster can be empty in the input, which means that there was no valid input data to + * generate that cluster. These empty clusters are currently stored differently in different parts + * of the tdigest column. They are "compressed" in the `means`, `weights`, and `offsets` columns, + * but not in the `min` and `max` columns. + * - The `means` and `weights` columns do not contain values for empty clusters. + * - The empty clusters are stored as two consecutive same values. For example, given an offsets + * column of (0, 1, 1, 2), the second cluster where its offset is 1 is empty. Note that the offsets + * are the offsets for the means and the weights. + * - The `min` and `max` columns contain 0s for the empty clusters. + * + * @param tdv input tdigests. These should have been sorted by the key, but may have not by the + * centroid mean within each group. + * @param h_outer_offsets a host iterator of the offsets to the start of each group. A group is + * counted as one even when the cluster is empty in it. The offsets should have the same values as + * the ones in `group_offsets`. + * @param group_offsets a device iterator of the offsets to the start of each group. A group is + * counted as one even when the cluster is empty in it. The offsets should have the same values as + * the ones in `h_outer_offsets`. + * @param group_labels a device iterator of the the group label for each tdigest cluster including + * empty clusters. + * @param num_group_labels the number of unique group labels. + * @param num_groups the number of groups. + * @param max_centroids the maximum number of centroids (clusters) in the tdigest. + * @param stream CUDA stream + * @param mr device memory resource + * + * @return A column containing the merged tdigests. + */ template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, HGroupOffsetIter h_outer_offsets, @@ -1032,10 +1069,17 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an - // algorithm like a super-merge that takes two layers of keys: one which identifies the outer - // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the - // outer groups. + // Sort the tdigests by the centroid mean within each group and then pass them to + // `compute_tdigests()`. Note that the input has been sorted by the key, but has not by the mean + // yet within each group. For sorting by the key, see + // `store_result_functor::get_grouped_values()`. + // + // NOTE: the current implementation is quite complex and involves offset copy from the device to + // the host. `thrust::merge` and `thrust::merge_by_key` don't provide what we need. What we would + // need is an algorithm like a super-merge that takes two layers of keys: one which identifies the + // outer grouping of tdigests, and one which identifies the inner groupings of the tdigests within + // the outer groups. + // // TODO: investigate replacing the iterative merge with a single stable_sort_by_key. // bring tdigest offsets back to the host @@ -1076,7 +1120,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); }); - // merge + // merge sorted return cudf::detail::merge(unmerged_tdigests, {0}, {order::ASCENDING}, @@ -1161,6 +1205,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto const delta = max_centroids; + // TDigest merge takes the output of typed_group_tdigest as its input, which must not have + // any nulls. + auto const has_nulls = false; + // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, @@ -1177,7 +1225,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_labels, group_offsets, {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, - false, + has_nulls, stream, mr); @@ -1202,7 +1250,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - false, + has_nulls, stream, mr); } @@ -1267,7 +1315,9 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (col.size() == 0) { + return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); + } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1293,7 +1343,15 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); + return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); + } + + if (tdv.means().size() == 0) { + // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes + // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters + // are empty in the input. Let's skip all complex computation in the below, but just return + // an empty tdigest per group. + return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr); } // bring group offsets back to the host diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index baa59026b07..c2da87911d6 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } + +std::unique_ptr do_agg( + cudf::column_view key, + cudf::column_view val, + std::function()> make_agg) +{ + std::vector keys; + keys.push_back(key); + cudf::table_view const key_table(keys); + + cudf::groupby::groupby gb(key_table); + std::vector requests; + cudf::groupby::aggregation_request req; + req.values = val; + req.aggregations.push_back(make_agg()); + requests.push_back(std::move(req)); + + auto result = gb.aggregate(std::move(requests)); + + std::vector> result_columns; + for (auto&& c : result.first->release()) { + result_columns.push_back(std::move(c)); + } + + EXPECT_EQ(result.second.size(), 1); + EXPECT_EQ(result.second[0].results.size(), 1); + result_columns.push_back(std::move(result.second[0].results[0])); + + return std::make_unique(std::move(result_columns)); +} + +TEST_F(TDigestMergeTest, AllValuesAreNull) +{ + // The input must be sorted by the key. + // See `aggregate_result_functor::operator()` for details. + auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; + auto const keys_view = cudf::column_view(keys); + auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + // All values are null + return false; + }); + auto const vals = cudf::test::fixed_width_column_wrapper{ + val_elems, val_elems + keys_view.size(), val_valids}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 3 empty clusters, one per group. + auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_computed_keys_view{expected_computed_keys}; + auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + expected_computed_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), + compute_result->get_column(1).view()); + + // Merge tdigest. The result should have 3 empty clusters, one per group. + auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_merged_keys_view{expected_merged_keys}; + auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + expected_merged_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); +} + +TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull) +{ + cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper vals{{10.0, 20.0, {}, {}, 30.0}, + {true, true, false, false, true}}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 3 empty clusters, one per group. + auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2, 3}}; + + cudf::test::fixed_width_column_wrapper expected_means{10, 20, 30}; + cudf::test::fixed_width_column_wrapper expected_weights{1, 1, 1}; + cudf::test::fixed_width_column_wrapper expected_offsets{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper expected_mins{10.0, 20.0, 0.0, 30.0}; + cudf::test::fixed_width_column_wrapper expected_maxes{10.0, 20.0, 0.0, 30.0}; + auto const expected_values = + cudf::tdigest::detail::make_tdigest_column(4, + std::make_unique(expected_means), + std::make_unique(expected_weights), + std::make_unique(expected_offsets), + std::make_unique(expected_mins), + std::make_unique(expected_maxes), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view()); + + // Merge tdigest. The result should have 3 empty clusters, one per group. + auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view()); +} diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 915717713df..7359f0406fc 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input;