Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove UNKNOWN_NULL_COUNT #13372

Merged
merged 12 commits into from
May 24, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 1 addition & 18 deletions cpp/include/cudf/column/column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,6 @@ class column_view_base {
/**
* @brief Returns the count of null elements
*
* @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any
* point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the
* first invocation of `null_count()` will compute and store the count of null
* elements indicated by the `null_mask` (if it exists).
*
* @return The count of null elements
*/
[[nodiscard]] size_type null_count() const;
Expand Down Expand Up @@ -263,10 +258,6 @@ class column_view_base {
*
* If `null_count()` is zero, `null_mask` is optional.
*
* If the null count of the `null_mask` is not specified, it defaults to
* `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then
* compute the null count if `null_mask` exists.
*
* If `type` is `EMPTY`, the specified `null_count` will be ignored and
* `null_count()` will always return the same value as `size()`
*
Expand Down Expand Up @@ -357,10 +348,6 @@ class column_view : public detail::column_view_base {
*
* If `null_count()` is zero, `null_mask` is optional.
*
* If the null count of the `null_mask` is not specified, it defaults to
* `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then
* compute the null count if `null_mask` exists.
*
* If `type` is `EMPTY`, the specified `null_count` will be ignored and
* `null_count()` will always return the same value as `size()`
*
Expand Down Expand Up @@ -509,12 +496,8 @@ class mutable_column_view : public detail::column_view_base {

/**
* @brief Construct a `mutable_column_view` from pointers to device memory for
*the elements and bitmask of the column.
* the elements and bitmask of the column.

* If the null count of the `null_mask` is not specified, it defaults to
* `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then
* compute the null count.
*
* If `type` is `EMPTY`, the specified `null_count` will be ignored and
* `null_count()` will always return the same value as `size()`
*
Expand Down
5 changes: 0 additions & 5 deletions cpp/include/cudf/detail/copy_range.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,6 @@ void copy_range(SourceValueIterator source_value_begin,
auto grid = cudf::detail::grid_1d{num_items, block_size, 1};

if (target.nullable()) {
// TODO: if null_count is UNKNOWN_NULL_COUNT, no need to update null
// count (if null_count is UNKNOWN_NULL_COUNT, invoking null_count()
// will scan the entire bitmask array, and this can be surprising
// in performance if the copy range is small and the column size is
// large).
rmm::device_scalar<size_type> null_count(target.null_count(), stream);

auto kernel =
Expand Down
8 changes: 0 additions & 8 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,6 @@ size_type distance(T f, T l)
return static_cast<size_type>(std::distance(f, l));
}

/**
* @brief Indicates an unknown null count.
*
* Use this value when constructing any column-like object to indicate that
* the null count should be computed on the first invocation of `null_count()`.
*/
static constexpr size_type UNKNOWN_NULL_COUNT{-1};

/**
* @brief Indicates the order in which elements should be sorted.
*/
Expand Down
25 changes: 2 additions & 23 deletions cpp/src/column/column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -117,37 +117,16 @@ mutable_column_view column::mutable_view()
child_views.emplace_back(*c);
}

// Store the old null count before resetting it. By accessing the value
// directly instead of calling `this->null_count()`, we can avoid a potential
// invocation of `cudf::detail::null_count()`. This does however mean that
// calling `this->null_count()` on the resulting mutable view could still
// potentially invoke `cudf::detail::null_count()`.
auto current_null_count = _null_count;

// The elements of a column could be changed through a `mutable_column_view`, therefore the
// existing `null_count` is no longer valid. Reset it to `UNKNOWN_NULL_COUNT` forcing it to be
// recomputed on the next invocation of `this->null_count()`.
set_null_count(cudf::UNKNOWN_NULL_COUNT);

return mutable_column_view{type(),
size(),
_data.data(),
static_cast<bitmask_type*>(_null_mask.data()),
current_null_count,
_null_count,
0,
child_views};
}

// If the null count is known, return it. Else, compute and return it
size_type column::null_count() const
{
CUDF_FUNC_RANGE();
if (_null_count <= cudf::UNKNOWN_NULL_COUNT) {
_null_count = cudf::detail::null_count(
static_cast<bitmask_type const*>(_null_mask.data()), 0, size(), cudf::get_default_stream());
}
return _null_count;
}
size_type column::null_count() const { return _null_count; }
vyasr marked this conversation as resolved.
Show resolved Hide resolved

void column::set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count)
{
Expand Down
12 changes: 2 additions & 10 deletions cpp/src/column/column_view.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,15 +63,7 @@ column_view_base::column_view_base(data_type type,
}
}

// If null count is known, returns it. Else, compute and return it
size_type column_view_base::null_count() const
{
if (_null_count <= cudf::UNKNOWN_NULL_COUNT) {
_null_count = cudf::detail::null_count(
null_mask(), offset(), offset() + size(), cudf::get_default_stream());
}
return _null_count;
}
size_type column_view_base::null_count() const { return _null_count; }
vyasr marked this conversation as resolved.
Show resolved Hide resolved

size_type column_view_base::null_count(size_type begin, size_type end) const
{
Expand Down
38 changes: 26 additions & 12 deletions cpp/src/groupby/sort/group_std.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

Expand Down Expand Up @@ -131,18 +132,31 @@ struct var_functor {
}

// set nulls
auto result_view = mutable_column_device_view::create(*result, stream);
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
group_sizes.size(),
[d_result = *result_view, d_group_sizes, ddof] __device__(size_type i) {
size_type group_size = d_group_sizes[i];
if (group_size == 0 or group_size - ddof <= 0)
d_result.set_null(i);
else
d_result.set_valid(i);
});

auto result_view = mutable_column_device_view::create(*result, stream);
auto null_count = rmm::device_scalar<cudf::size_type>(0, stream, mr);
auto d_null_count = null_count.data();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
group_sizes.size(),
[d_result = *result_view, d_group_sizes, ddof, d_null_count] __device__(size_type i) {
size_type group_size = d_group_sizes[i];
if (group_size == 0 or group_size - ddof <= 0) {
d_result.set_null(i);
// Assuming that typical data does not have too many nulls this
// atomic shouldn't serialize the code too much. The alternatives
// would be 1) writing a more complex kernel using cub/shmem to
// increase parallelism, or 2) calling `cudf::count_nulls` after the
// fact. (1) is more work than it's worth without benchmarking, and
// this approach should outperform (2) unless large amounts of the
// data is null.
atomicAdd(d_null_count, 1);
} else {
d_result.set_valid(i);
}
});

result->set_null_count(null_count.value(stream));
return result;
}

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/transform/compute_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/ast/expressions.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/transform.hpp>
#include <cudf/detail/utilities/cuda.cuh>
Expand Down Expand Up @@ -128,6 +129,8 @@ std::unique_ptr<column> compute_column(table_view const& table,
*table_device, device_expression_data, *mutable_output_device);
}
CUDF_CHECK_CUDA(stream.value());
output_column->set_null_count(
cudf::detail::null_count(mutable_output_device->null_mask(), 0, output_column->size(), stream));
return output_column;
}

Expand Down
18 changes: 4 additions & 14 deletions cpp/tests/copying/utility_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,15 +194,7 @@ TYPED_TEST(AllocateLikeTest, ColumnNumericTestSameSize)
input = make_numeric_column(
cudf::data_type{cudf::type_to_id<TypeParam>()}, size, cudf::mask_state::ALL_VALID);
got = cudf::allocate_like(input->view());
EXPECT_EQ(input->type(), got->type());
EXPECT_EQ(input->size(), got->size());
EXPECT_EQ(input->nullable(), got->nullable());
EXPECT_EQ(input->num_children(), got->num_children());
// CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL includes checking the null-count property.
// This value will be incorrect since the null mask will contain uninitialized bits
// and the null-count set to UNKNOWN_NULL_COUNT on return from allocate_like().
// This means any subsequent call to null_count() will try to compute the null-count
// using the uninitialized null-mask.
CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(*input, *got);
}

TYPED_TEST(AllocateLikeTest, ColumnNumericTestSpecifiedSize)
Expand All @@ -221,15 +213,13 @@ TYPED_TEST(AllocateLikeTest, ColumnNumericTestSpecifiedSize)
input = make_numeric_column(
cudf::data_type{cudf::type_to_id<TypeParam>()}, size, cudf::mask_state::ALL_VALID);
got = cudf::allocate_like(input->view(), specified_size);
// Can't use CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL because the sizes of
// the two columns are different.
EXPECT_EQ(input->type(), got->type());
EXPECT_EQ(specified_size, got->size());
EXPECT_EQ(0, got->null_count());
EXPECT_EQ(input->nullable(), got->nullable());
EXPECT_EQ(input->num_children(), got->num_children());
// CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL includes checking the null-count property.
// This value will be incorrect since the null mask will contain uninitialized bits
// and the null-count set to UNKNOWN_NULL_COUNT on return from allocate_like().
// This means any subsequent call to null_count() will try to compute the null-count
// using the uninitialized null-mask.
}

CUDF_TEST_PROGRAM_MAIN()
30 changes: 30 additions & 0 deletions cpp/tests/groupby/var_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,33 @@ TYPED_TEST(groupby_var_test, dictionary)
expect_vals,
cudf::make_variance_aggregation<cudf::groupby_aggregation>());
}

// This test ensures that the same results are produced by the sort-based and
// hash-based implementations of groupby-variance.
TYPED_TEST(groupby_var_test, sort_vs_hash)
{
using K = int32_t;
using V = double;

cudf::test::fixed_width_column_wrapper<K> keys{50, 30, 90, 80};
cudf::test::fixed_width_column_wrapper<V> vals{380.0, 370.0, 24.0, 26.0};

cudf::groupby::groupby gb_obj(cudf::table_view({keys}));

auto agg1 = cudf::make_variance_aggregation<cudf::groupby_aggregation>();

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back();
requests[0].values = vals;
requests[0].aggregations.push_back(std::move(agg1));

auto result1 = gb_obj.aggregate(requests);

// This agg forces a sort groupby.
auto agg2 = cudf::make_quantile_aggregation<cudf::groupby_aggregation>({0.25});
requests[0].aggregations.push_back(std::move(agg2));

auto result2 = gb_obj.aggregate(requests);

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result1.second[0].results[0], *result2.second[0].results[0]);
vyasr marked this conversation as resolved.
Show resolved Hide resolved
}
6 changes: 6 additions & 0 deletions java/src/main/native/src/row_conversion.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2269,6 +2269,9 @@ std::unique_ptr<table> convert_from_rows(lists_column_view const &input,
}
}

for (auto &col : output_columns) {
col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size()));
}
return std::make_unique<table>(std::move(output_columns));
}

Expand Down Expand Up @@ -2324,6 +2327,9 @@ std::unique_ptr<table> convert_from_rows_fixed_width_optimized(
num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(),
dev_output_data.data(), dev_output_nm.data(), child.data<int8_t>());

for (auto &col : output_columns) {
col->set_null_count(cudf::null_count(col->view().null_mask(), 0, col->size()));
}
return std::make_unique<table>(std::move(output_columns));
} else {
CUDF_FAIL("Only fixed width types are currently supported");
Expand Down