Skip to content

Commit

Permalink
Change indices for dictionary column to signed integer type
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Nov 20, 2024
1 parent d01f332 commit 532bb77
Show file tree
Hide file tree
Showing 23 changed files with 93 additions and 100 deletions.
6 changes: 3 additions & 3 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -460,18 +460,18 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*/
struct index_element_fn {
template <typename IndexType,
CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_unsigned_v<IndexType>)>
CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_signed_v<IndexType>)>
__device__ size_type operator()(column_device_view const& indices, size_type index)
{
return static_cast<size_type>(indices.element<IndexType>(index));
}

template <typename IndexType,
typename... Args,
CUDF_ENABLE_IF(not(is_index_type<IndexType>() and std::is_unsigned_v<IndexType>))>
CUDF_ENABLE_IF(not(is_index_type<IndexType>() and std::is_signed_v<IndexType>))>
__device__ size_type operator()(Args&&... args)
{
CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type");
CUDF_UNREACHABLE("dictionary indices must be a signed integral type");
}
};

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/dictionary/encode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace dictionary {
*
* The null mask and null count are copied from the input column to the output column.
*
* @throw cudf::logic_error if indices type is not an unsigned integer type
* @throw cudf::logic_error if indices type is not a signed integer type
* @throw cudf::logic_error if the column to encode is already a DICTIONARY type
*
* @code{.pseudo}
Expand All @@ -58,7 +58,7 @@ namespace dictionary {
*/
std::unique_ptr<column> encode(
column_view const& column,
data_type indices_type = data_type{type_id::UINT32},
data_type indices_type = data_type{type_id::INT32},
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

Expand Down
8 changes: 4 additions & 4 deletions cpp/include/cudf_test/column_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -974,7 +974,7 @@ class dictionary_column_wrapper : public detail::column_wrapper {
{
wrapped =
cudf::dictionary::encode(fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down Expand Up @@ -1009,7 +1009,7 @@ class dictionary_column_wrapper : public detail::column_wrapper {
{
wrapped = cudf::dictionary::encode(
fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end, v),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down Expand Up @@ -1173,7 +1173,7 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
{
wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down Expand Up @@ -1210,7 +1210,7 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
: column_wrapper{}
{
wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v),
cudf::data_type{type_id::UINT32},
cudf::data_type{type_id::INT32},
cudf::test::get_default_stream());
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/column/column_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ std::unique_ptr<column> make_dictionary_from_scalar(scalar const& s,
CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key");
return make_dictionary_column(
make_column_from_scalar(s, 1, stream, mr),
make_column_from_scalar(numeric_scalar<uint32_t>(0, true, stream), size, stream, mr),
make_column_from_scalar(numeric_scalar<int32_t>(0, true, stream), size, stream, mr),
rmm::device_buffer{0, stream, mr},
0);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/dictionary/add_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
auto indices_column = [&] {
column_view gather_result = table_indices.front()->view();
auto const indices_size = gather_result.size();
// we can just use the lower-bound/gather data directly for UINT32 case
if (indices_type.id() == type_id::UINT32) {
// we can just use the lower-bound/gather data directly for INT32 case
if (indices_type.id() == type_id::INT32) {
auto contents = table_indices.front()->release();
return std::make_unique<column>(data_type{type_id::UINT32},
return std::make_unique<column>(data_type{type_id::INT32},
indices_size,
std::move(*(contents.data.release())),
rmm::device_buffer{0, stream, mr},
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::transform(columns.begin(), columns.end(), indices_views.begin(), [](auto cv) {
auto dict_view = dictionary_column_view(cv);
if (dict_view.is_empty()) {
return column_view{data_type{type_id::UINT32}, 0, nullptr, nullptr, 0};
return column_view{data_type{type_id::INT32}, 0, nullptr, nullptr, 0};
}
return dict_view.get_indices_annotated(); // nicely includes validity mask and view offset
});
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/dictionary_factories.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ struct dispatch_create_indices {
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(std::is_unsigned<IndexType>(), "indices must be an unsigned type");
CUDF_EXPECTS(std::is_signed<IndexType>(), "indices must be an signed type");
column_view indices_view{
indices.type(), indices.size(), indices.data<IndexType>(), nullptr, 0, indices.offset()};
return std::make_unique<column>(indices_view, stream, mr);
Expand Down Expand Up @@ -83,7 +83,7 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_colu
{
CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls");
CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls");
CUDF_EXPECTS(is_unsigned(indices_column->type()), "indices must be type unsigned integer");
CUDF_EXPECTS(!is_unsigned(indices_column->type()), "indices must be type unsigned integer");

auto count = indices_column->size();
std::vector<std::unique_ptr<column>> children;
Expand Down
12 changes: 4 additions & 8 deletions cpp/src/dictionary/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ std::unique_ptr<column> encode(column_view const& input_column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer");
CUDF_EXPECTS(!is_unsigned(indices_type), "indices must be type signed integer");
CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32,
"cannot encode a dictionary from a dictionary");

Expand All @@ -63,10 +63,6 @@ std::unique_ptr<column> encode(column_view const& input_column,
keys_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // remove the null-mask
}

// the encode() returns INT32 for indices
if (indices_column->type().id() != indices_type.id())
indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr);

// create column with keys_column and indices_column
return make_dictionary_column(std::move(keys_column),
std::move(indices_column),
Expand All @@ -79,9 +75,9 @@ std::unique_ptr<column> encode(column_view const& input_column,
*/
data_type get_indices_type_for_size(size_type keys_size)
{
if (keys_size <= std::numeric_limits<uint8_t>::max()) return data_type{type_id::UINT8};
if (keys_size <= std::numeric_limits<uint16_t>::max()) return data_type{type_id::UINT16};
return data_type{type_id::UINT32};
if (keys_size <= std::numeric_limits<int8_t>::max()) return data_type{type_id::INT8};
if (keys_size <= std::numeric_limits<int16_t>::max()) return data_type{type_id::INT16};
return data_type{type_id::INT32};
}

} // namespace detail
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/remove_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,11 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
// search the indices values with key indices to look for any holes
auto const matches = [&] {
// build keys index to verify against indices values
rmm::device_uvector<uint32_t> keys_positions(keys_size, stream);
rmm::device_uvector<int32_t> keys_positions(keys_size, stream);
thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end());
// wrap the indices for comparison in contains()
column_view keys_positions_view(
data_type{type_id::UINT32}, keys_size, keys_positions.data(), nullptr, 0);
data_type{type_id::INT32}, keys_size, keys_positions.data(), nullptr, 0);
return cudf::detail::contains(indices_view, keys_positions_view, stream, mr);
}();
auto d_matches = matches->view().data<bool>();
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/dictionary/search.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,9 @@ std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
if (dictionary.is_empty())
return std::make_unique<numeric_scalar<uint32_t>>(0, false, stream, mr);
if (dictionary.is_empty()) {
return std::make_unique<numeric_scalar<int32_t>>(0, false, stream, mr);
}
return type_dispatcher<dispatch_storage_type>(
dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr);
}
Expand All @@ -169,8 +170,9 @@ std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionar
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
if (dictionary.is_empty())
return std::make_unique<numeric_scalar<uint32_t>>(0, false, stream, mr);
if (dictionary.is_empty()) {
return std::make_unique<numeric_scalar<int32_t>>(0, false, stream, mr);
}
return type_dispatcher<dispatch_storage_type>(
dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr);
}
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/interop/from_arrow_device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -200,13 +200,13 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::dictionary32>(
// to modify the buffers.
switch (schema->storage_type) {
case NANOARROW_TYPE_INT8:
case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
case NANOARROW_TYPE_UINT8: return data_type(type_id::INT8);
case NANOARROW_TYPE_INT16:
case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
case NANOARROW_TYPE_UINT16: return data_type(type_id::INT16);
case NANOARROW_TYPE_INT32:
case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
case NANOARROW_TYPE_UINT32: return data_type(type_id::INT32);
case NANOARROW_TYPE_INT64:
case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
case NANOARROW_TYPE_UINT64: return data_type(type_id::INT64);
default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
}
}();
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/interop/from_arrow_host.cu
Original file line number Diff line number Diff line change
Expand Up @@ -273,13 +273,13 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::dictiona
// to modify the buffers.
switch (schema->storage_type) {
case NANOARROW_TYPE_INT8:
case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
case NANOARROW_TYPE_UINT8: return data_type(type_id::INT8);
case NANOARROW_TYPE_INT16:
case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
case NANOARROW_TYPE_UINT16: return data_type(type_id::INT16);
case NANOARROW_TYPE_INT32:
case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
case NANOARROW_TYPE_UINT32: return data_type(type_id::INT32);
case NANOARROW_TYPE_INT64:
case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
case NANOARROW_TYPE_UINT64: return data_type(type_id::INT64);
default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
}
}();
Expand Down
6 changes: 3 additions & 3 deletions cpp/tests/copying/get_value_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ TYPED_TEST_SUITE(DictionaryGetValueTest, cudf::test::FixedWidthTypesWithoutFixed
TYPED_TEST(DictionaryGetValueTest, BasicGet)
{
cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
cudf::test::fixed_width_column_wrapper<uint32_t> indices{0, 0, 1, 2, 1, 3, 3, 2};
cudf::test::fixed_width_column_wrapper<int32_t> indices{0, 0, 1, 2, 1, 3, 3, 2};
auto col = cudf::make_dictionary_column(keys, indices);

auto s = cudf::get_element(*col, 2);
Expand All @@ -147,7 +147,7 @@ TYPED_TEST(DictionaryGetValueTest, BasicGet)
TYPED_TEST(DictionaryGetValueTest, GetFromNullable)
{
cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
cudf::test::fixed_width_column_wrapper<uint32_t> indices(
cudf::test::fixed_width_column_wrapper<int32_t> indices(
{0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false});
auto col = cudf::make_dictionary_column(keys, indices);

Expand All @@ -163,7 +163,7 @@ TYPED_TEST(DictionaryGetValueTest, GetFromNullable)
TYPED_TEST(DictionaryGetValueTest, GetNull)
{
cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
cudf::test::fixed_width_column_wrapper<uint32_t> indices(
cudf::test::fixed_width_column_wrapper<int32_t> indices(
{0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false});
auto col = cudf::make_dictionary_column(keys, indices);

Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/dictionary/add_keys_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ TEST_F(DictionaryAddKeysTest, StringsColumn)
cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee", "fff"});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint8_t> indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0});
cudf::test::fixed_width_column_wrapper<int8_t> indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected);
}

Expand All @@ -58,7 +58,7 @@ TEST_F(DictionaryAddKeysTest, FloatColumn)
cudf::test::fixed_width_column_wrapper<float> keys_expected{-11.75, 0.5, 4.25, 5.0, 7.125};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint8_t> expected{2, 4, 1, 0, 4, 1};
cudf::test::fixed_width_column_wrapper<int8_t> expected{2, 4, 1, 0, 4, 1};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected);
}

Expand Down
8 changes: 4 additions & 4 deletions cpp/tests/dictionary/encode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ TEST_F(DictionaryEncodeTest, EncodeStringColumn)
cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee"});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint32_t> indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0});
cudf::test::fixed_width_column_wrapper<int32_t> indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected);
}

Expand All @@ -48,7 +48,7 @@ TEST_F(DictionaryEncodeTest, EncodeFloat)
cudf::test::fixed_width_column_wrapper<float> keys_expected{-11.75, 0.5, 4.25, 7.125};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint32_t> expected{2, 3, 1, 0, 3, 1};
cudf::test::fixed_width_column_wrapper<int32_t> expected{2, 3, 1, 0, 3, 1};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected);
}

Expand All @@ -64,14 +64,14 @@ TEST_F(DictionaryEncodeTest, EncodeWithNull)
cudf::test::fixed_width_column_wrapper<int64_t> keys_expected{0, 111, 222, 333, 444};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);

cudf::test::fixed_width_column_wrapper<uint32_t> expected{4, 0, 3, 1, 2, 5, 2, 4, 0};
cudf::test::fixed_width_column_wrapper<int32_t> expected{4, 0, 3, 1, 2, 5, 2, 4, 0};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected);
}

TEST_F(DictionaryEncodeTest, InvalidEncode)
{
cudf::test::fixed_width_column_wrapper<int16_t> input{0, 1, 2, 3, -1, -2, -3};

EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::INT16}),
EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16}),
cudf::logic_error);
}
Loading

0 comments on commit 532bb77

Please sign in to comment.