Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change indices for dictionary column to signed integer type #17390

Merged
merged 11 commits into from
Dec 4, 2024
6 changes: 3 additions & 3 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -460,18 +460,18 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*/
struct index_element_fn {
template <typename IndexType,
CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_unsigned_v<IndexType>)>
CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_signed_v<IndexType>)>
__device__ size_type operator()(column_device_view const& indices, size_type index)
{
return static_cast<size_type>(indices.element<IndexType>(index));
}

template <typename IndexType,
typename... Args,
CUDF_ENABLE_IF(not(is_index_type<IndexType>() and std::is_unsigned_v<IndexType>))>
CUDF_ENABLE_IF(not(is_index_type<IndexType>() and std::is_signed_v<IndexType>))>
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
__device__ size_type operator()(Args&&... args)
{
CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type");
CUDF_UNREACHABLE("dictionary indices must be a signed integral type");
}
};

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/dictionary/encode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace dictionary {
*
* The null mask and null count are copied from the input column to the output column.
*
* @throw cudf::logic_error if indices type is not an unsigned integer type
* @throw cudf::logic_error if indices type is not a signed integer type
* @throw cudf::logic_error if the column to encode is already a DICTIONARY type
*
* @code{.pseudo}
Expand All @@ -58,7 +58,7 @@ namespace dictionary {
*/
std::unique_ptr<column> encode(
column_view const& column,
data_type indices_type = data_type{type_id::UINT32},
data_type indices_type = data_type{type_id::INT32},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

Expand Down
8 changes: 4 additions & 4 deletions cpp/include/cudf_test/column_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -974,7 +974,7 @@ class dictionary_column_wrapper : public detail::column_wrapper {
{
wrapped =
cudf::dictionary::encode(fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down Expand Up @@ -1009,7 +1009,7 @@ class dictionary_column_wrapper : public detail::column_wrapper {
{
wrapped = cudf::dictionary::encode(
fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end, v),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down Expand Up @@ -1173,7 +1173,7 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
{
wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down Expand Up @@ -1210,7 +1210,7 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
: column_wrapper{}
{
wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/column/column_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ std::unique_ptr<column> make_dictionary_from_scalar(scalar const& s,
CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key");
return make_dictionary_column(
make_column_from_scalar(s, 1, stream, mr),
make_column_from_scalar(numeric_scalar<uint32_t>(0, true, stream), size, stream, mr),
make_column_from_scalar(numeric_scalar<int32_t>(0, true, stream), size, stream, mr),
rmm::device_buffer{0, stream, mr},
0);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/dictionary/add_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
auto indices_column = [&] {
column_view gather_result = table_indices.front()->view();
auto const indices_size = gather_result.size();
// we can just use the lower-bound/gather data directly for UINT32 case
if (indices_type.id() == type_id::UINT32) {
// we can just use the lower-bound/gather data directly for INT32 case
if (indices_type.id() == type_id::INT32) {
auto contents = table_indices.front()->release();
return std::make_unique<column>(data_type{type_id::UINT32},
return std::make_unique<column>(data_type{type_id::INT32},
indices_size,
std::move(*(contents.data.release())),
rmm::device_buffer{0, stream, mr},
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::transform(columns.begin(), columns.end(), indices_views.begin(), [](auto cv) {
auto dict_view = dictionary_column_view(cv);
if (dict_view.is_empty()) {
return column_view{data_type{type_id::UINT32}, 0, nullptr, nullptr, 0};
return column_view{data_type{type_id::INT32}, 0, nullptr, nullptr, 0};
}
return dict_view.get_indices_annotated(); // nicely includes validity mask and view offset
});
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/dictionary_factories.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ struct dispatch_create_indices {
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(std::is_unsigned<IndexType>(), "indices must be an unsigned type");
CUDF_EXPECTS(std::is_signed<IndexType>(), "indices must be an signed type");
column_view indices_view{
indices.type(), indices.size(), indices.data<IndexType>(), nullptr, 0, indices.offset()};
return std::make_unique<column>(indices_view, stream, mr);
Expand Down Expand Up @@ -83,7 +83,7 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_colu
{
CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls");
CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls");
CUDF_EXPECTS(is_unsigned(indices_column->type()), "indices must be type unsigned integer");
CUDF_EXPECTS(!is_unsigned(indices_column->type()), "indices must be type unsigned integer");
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

auto count = indices_column->size();
std::vector<std::unique_ptr<column>> children;
Expand Down
12 changes: 4 additions & 8 deletions cpp/src/dictionary/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ std::unique_ptr<column> encode(column_view const& input_column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer");
CUDF_EXPECTS(!is_unsigned(indices_type), "indices must be type signed integer");
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32,
"cannot encode a dictionary from a dictionary");

Expand All @@ -63,10 +63,6 @@ std::unique_ptr<column> encode(column_view const& input_column,
keys_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // remove the null-mask
}

// the encode() returns INT32 for indices
if (indices_column->type().id() != indices_type.id())
indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr);

// create column with keys_column and indices_column
return make_dictionary_column(std::move(keys_column),
std::move(indices_column),
Expand All @@ -79,9 +75,9 @@ std::unique_ptr<column> encode(column_view const& input_column,
*/
data_type get_indices_type_for_size(size_type keys_size)
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
{
if (keys_size <= std::numeric_limits<uint8_t>::max()) return data_type{type_id::UINT8};
if (keys_size <= std::numeric_limits<uint16_t>::max()) return data_type{type_id::UINT16};
return data_type{type_id::UINT32};
if (keys_size <= std::numeric_limits<int8_t>::max()) return data_type{type_id::INT8};
if (keys_size <= std::numeric_limits<int16_t>::max()) return data_type{type_id::INT16};
return data_type{type_id::INT32};
}

} // namespace detail
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/remove_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,11 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
// search the indices values with key indices to look for any holes
auto const matches = [&] {
// build keys index to verify against indices values
rmm::device_uvector<uint32_t> keys_positions(keys_size, stream);
rmm::device_uvector<int32_t> keys_positions(keys_size, stream);
thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end());
// wrap the indices for comparison in contains()
column_view keys_positions_view(
data_type{type_id::UINT32}, keys_size, keys_positions.data(), nullptr, 0);
data_type{type_id::INT32}, keys_size, keys_positions.data(), nullptr, 0);
return cudf::detail::contains(indices_view, keys_positions_view, stream, mr);
}();
auto d_matches = matches->view().data<bool>();
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/dictionary/search.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,9 @@ std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
if (dictionary.is_empty())
return std::make_unique<numeric_scalar<uint32_t>>(0, false, stream, mr);
if (dictionary.is_empty()) {
return std::make_unique<numeric_scalar<int32_t>>(0, false, stream, mr);
}
return type_dispatcher<dispatch_storage_type>(
dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr);
}
Expand All @@ -169,8 +170,9 @@ std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionar
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
if (dictionary.is_empty())
return std::make_unique<numeric_scalar<uint32_t>>(0, false, stream, mr);
if (dictionary.is_empty()) {
return std::make_unique<numeric_scalar<int32_t>>(0, false, stream, mr);
}
return type_dispatcher<dispatch_storage_type>(
dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr);
}
Expand Down
17 changes: 5 additions & 12 deletions cpp/src/interop/from_arrow_device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -194,19 +194,12 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::dictionary32>(
get_column(&keys_schema_view, input->dictionary, keys_type, true, stream, mr);

auto const dict_indices_type = [&schema]() -> data_type {
// cudf dictionary requires an unsigned type for the indices,
// since it is invalid for an arrow dictionary to contain negative
// indices, we can safely use the unsigned equivalent without having
// to modify the buffers.
// cudf dictionary requires a signed type for the indices
switch (schema->storage_type) {
case NANOARROW_TYPE_INT8:
case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
case NANOARROW_TYPE_INT16:
case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
case NANOARROW_TYPE_INT32:
case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
case NANOARROW_TYPE_INT64:
case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
case NANOARROW_TYPE_INT8: return data_type(type_id::INT8);
case NANOARROW_TYPE_INT16: return data_type(type_id::INT16);
case NANOARROW_TYPE_INT32: return data_type(type_id::INT32);
case NANOARROW_TYPE_INT64: return data_type(type_id::INT64);
default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
}
}();
Expand Down
17 changes: 5 additions & 12 deletions cpp/src/interop/from_arrow_host.cu
Original file line number Diff line number Diff line change
Expand Up @@ -267,19 +267,12 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::dictiona
get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr);

auto const dict_indices_type = [&schema]() -> data_type {
// cudf dictionary requires an unsigned type for the indices,
// since it is invalid for an arrow dictionary to contain negative
// indices, we can safely use the unsigned equivalent without having
// to modify the buffers.
// cudf dictionary requires a signed type for the indices
switch (schema->storage_type) {
case NANOARROW_TYPE_INT8:
case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
case NANOARROW_TYPE_INT16:
case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
case NANOARROW_TYPE_INT32:
case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
case NANOARROW_TYPE_INT64:
case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
case NANOARROW_TYPE_INT8: return data_type(type_id::INT8);
case NANOARROW_TYPE_INT16: return data_type(type_id::INT16);
case NANOARROW_TYPE_INT32: return data_type(type_id::INT32);
case NANOARROW_TYPE_INT64: return data_type(type_id::INT64);
default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
}
}();
Expand Down
6 changes: 3 additions & 3 deletions cpp/tests/copying/get_value_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ TYPED_TEST_SUITE(DictionaryGetValueTest, cudf::test::FixedWidthTypesWithoutFixed
TYPED_TEST(DictionaryGetValueTest, BasicGet)
{
cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
cudf::test::fixed_width_column_wrapper<uint32_t> indices{0, 0, 1, 2, 1, 3, 3, 2};
cudf::test::fixed_width_column_wrapper<int32_t> indices{0, 0, 1, 2, 1, 3, 3, 2};
auto col = cudf::make_dictionary_column(keys, indices);

auto s = cudf::get_element(*col, 2);
Expand All @@ -147,7 +147,7 @@ TYPED_TEST(DictionaryGetValueTest, BasicGet)
TYPED_TEST(DictionaryGetValueTest, GetFromNullable)
{
cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
cudf::test::fixed_width_column_wrapper<uint32_t> indices(
cudf::test::fixed_width_column_wrapper<int32_t> indices(
{0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false});
auto col = cudf::make_dictionary_column(keys, indices);

Expand All @@ -163,7 +163,7 @@ TYPED_TEST(DictionaryGetValueTest, GetFromNullable)
TYPED_TEST(DictionaryGetValueTest, GetNull)
{
cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
cudf::test::fixed_width_column_wrapper<uint32_t> indices(
cudf::test::fixed_width_column_wrapper<int32_t> indices(
{0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false});
auto col = cudf::make_dictionary_column(keys, indices);

Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/dictionary/add_keys_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ TEST_F(DictionaryAddKeysTest, StringsColumn)
cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee", "fff"});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint8_t> indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0});
cudf::test::fixed_width_column_wrapper<int8_t> indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected);
}

Expand All @@ -58,7 +58,7 @@ TEST_F(DictionaryAddKeysTest, FloatColumn)
cudf::test::fixed_width_column_wrapper<float> keys_expected{-11.75, 0.5, 4.25, 5.0, 7.125};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint8_t> expected{2, 4, 1, 0, 4, 1};
cudf::test::fixed_width_column_wrapper<int8_t> expected{2, 4, 1, 0, 4, 1};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected);
}

Expand Down
8 changes: 4 additions & 4 deletions cpp/tests/dictionary/encode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ TEST_F(DictionaryEncodeTest, EncodeStringColumn)
cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee"});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint32_t> indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0});
cudf::test::fixed_width_column_wrapper<int32_t> indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected);
}

Expand All @@ -48,7 +48,7 @@ TEST_F(DictionaryEncodeTest, EncodeFloat)
cudf::test::fixed_width_column_wrapper<float> keys_expected{-11.75, 0.5, 4.25, 7.125};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint32_t> expected{2, 3, 1, 0, 3, 1};
cudf::test::fixed_width_column_wrapper<int32_t> expected{2, 3, 1, 0, 3, 1};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected);
}

Expand All @@ -64,14 +64,14 @@ TEST_F(DictionaryEncodeTest, EncodeWithNull)
cudf::test::fixed_width_column_wrapper<int64_t> keys_expected{0, 111, 222, 333, 444};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint32_t> expected{4, 0, 3, 1, 2, 5, 2, 4, 0};
cudf::test::fixed_width_column_wrapper<int32_t> expected{4, 0, 3, 1, 2, 5, 2, 4, 0};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected);
}

TEST_F(DictionaryEncodeTest, InvalidEncode)
{
cudf::test::fixed_width_column_wrapper<int16_t> input{0, 1, 2, 3, -1, -2, -3};

EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::INT16}),
EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16}),
cudf::logic_error);
}
Loading
Loading