From 532bb7766e23144d2e271b541cf9bed46827a2d0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 20 Nov 2024 16:18:23 -0500 Subject: [PATCH 1/5] Change indices for dictionary column to signed integer type --- .../cudf/column/column_device_view.cuh | 6 ++-- cpp/include/cudf/dictionary/encode.hpp | 4 +-- cpp/include/cudf_test/column_wrapper.hpp | 8 ++--- cpp/src/column/column_factories.cpp | 2 +- cpp/src/dictionary/add_keys.cu | 6 ++-- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 4 +-- cpp/src/dictionary/encode.cu | 12 +++---- cpp/src/dictionary/remove_keys.cu | 4 +-- cpp/src/dictionary/search.cu | 10 +++--- cpp/src/interop/from_arrow_device.cu | 8 ++--- cpp/src/interop/from_arrow_host.cu | 8 ++--- cpp/tests/copying/get_value_tests.cpp | 6 ++-- cpp/tests/dictionary/add_keys_test.cpp | 4 +-- cpp/tests/dictionary/encode_test.cpp | 8 ++--- cpp/tests/dictionary/factories_test.cpp | 31 +++++++++---------- cpp/tests/dictionary/search_test.cpp | 16 +++++----- cpp/tests/interop/from_arrow_host_test.cpp | 20 ++++++------ cpp/tests/interop/nanoarrow_utils.hpp | 2 ++ cpp/tests/interop/to_arrow_device_test.cpp | 5 ++- cpp/tests/interop/to_arrow_test.cpp | 7 ++--- cpp/tests/rolling/lead_lag_test.cpp | 4 +-- cpp/tests/streams/dictionary_test.cpp | 16 +++++----- 23 files changed, 93 insertions(+), 100 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35a39ef9758..db6d5255616 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -460,7 +460,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { */ struct index_element_fn { template () and std::is_unsigned_v)> + CUDF_ENABLE_IF(is_index_type() and std::is_signed_v)> __device__ size_type operator()(column_device_view const& indices, size_type index) { return static_cast(indices.element(index)); @@ -468,10 +468,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template () and std::is_unsigned_v))> + CUDF_ENABLE_IF(not(is_index_type() and std::is_signed_v))> __device__ size_type operator()(Args&&... args) { - CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type"); + CUDF_UNREACHABLE("dictionary indices must be a signed integral type"); } }; diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index dc81fd74992..ced6bd2afa4 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -41,7 +41,7 @@ namespace dictionary { * * The null mask and null count are copied from the input column to the output column. * - * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if indices type is not a signed integer type * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} @@ -58,7 +58,7 @@ namespace dictionary { */ std::unique_ptr encode( column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, + data_type indices_type = data_type{type_id::INT32}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 6206c1311d2..6300bb87572 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -974,7 +974,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { { wrapped = cudf::dictionary::encode(fixed_width_column_wrapper(begin, end), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1009,7 +1009,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { { wrapped = cudf::dictionary::encode( fixed_width_column_wrapper(begin, end, v), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1173,7 +1173,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1210,7 +1210,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { : column_wrapper{} { wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 972f97e8668..050c23b0a3d 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -178,7 +178,7 @@ std::unique_ptr make_dictionary_from_scalar(scalar const& s, CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key"); return make_dictionary_column( make_column_from_scalar(s, 1, stream, mr), - make_column_from_scalar(numeric_scalar(0, true, stream), size, stream, mr), + make_column_from_scalar(numeric_scalar(0, true, stream), size, stream, mr), rmm::device_buffer{0, stream, mr}, 0); } diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 565055009ba..a851fc6069d 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -106,10 +106,10 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column auto indices_column = [&] { column_view gather_result = table_indices.front()->view(); auto const indices_size = gather_result.size(); - // we can just use the lower-bound/gather data directly for UINT32 case - if (indices_type.id() == type_id::UINT32) { + // we can just use the lower-bound/gather data directly for INT32 case + if (indices_type.id() == type_id::INT32) { auto contents = table_indices.front()->release(); - return std::make_unique(data_type{type_id::UINT32}, + return std::make_unique(data_type{type_id::INT32}, indices_size, std::move(*(contents.data.release())), rmm::device_buffer{0, stream, mr}, diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index b3a8bb4cd20..0f17858094b 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -252,7 +252,7 @@ std::unique_ptr concatenate(host_span columns, std::transform(columns.begin(), columns.end(), indices_views.begin(), [](auto cv) { auto dict_view = dictionary_column_view(cv); if (dict_view.is_empty()) { - return column_view{data_type{type_id::UINT32}, 0, nullptr, nullptr, 0}; + return column_view{data_type{type_id::INT32}, 0, nullptr, nullptr, 0}; } return dict_view.get_indices_annotated(); // nicely includes validity mask and view offset }); diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 3e0c98d36ea..de9550f012d 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -33,7 +33,7 @@ struct dispatch_create_indices { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(std::is_unsigned(), "indices must be an unsigned type"); + CUDF_EXPECTS(std::is_signed(), "indices must be an signed type"); column_view indices_view{ indices.type(), indices.size(), indices.data(), nullptr, 0, indices.offset()}; return std::make_unique(indices_view, stream, mr); @@ -83,7 +83,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu { CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls"); CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls"); - CUDF_EXPECTS(is_unsigned(indices_column->type()), "indices must be type unsigned integer"); + CUDF_EXPECTS(!is_unsigned(indices_column->type()), "indices must be type unsigned integer"); auto count = indices_column->size(); std::vector> children; diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index c8ccb511e8f..9e934341787 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -44,7 +44,7 @@ std::unique_ptr encode(column_view const& input_column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer"); + CUDF_EXPECTS(!is_unsigned(indices_type), "indices must be type signed integer"); CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, "cannot encode a dictionary from a dictionary"); @@ -63,10 +63,6 @@ std::unique_ptr encode(column_view const& input_column, keys_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // remove the null-mask } - // the encode() returns INT32 for indices - if (indices_column->type().id() != indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); - // create column with keys_column and indices_column return make_dictionary_column(std::move(keys_column), std::move(indices_column), @@ -79,9 +75,9 @@ std::unique_ptr encode(column_view const& input_column, */ data_type get_indices_type_for_size(size_type keys_size) { - if (keys_size <= std::numeric_limits::max()) return data_type{type_id::UINT8}; - if (keys_size <= std::numeric_limits::max()) return data_type{type_id::UINT16}; - return data_type{type_id::UINT32}; + if (keys_size <= std::numeric_limits::max()) return data_type{type_id::INT8}; + if (keys_size <= std::numeric_limits::max()) return data_type{type_id::INT16}; + return data_type{type_id::INT32}; } } // namespace detail diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 119f43a4ae9..59c8453cf33 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -180,11 +180,11 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction // search the indices values with key indices to look for any holes auto const matches = [&] { // build keys index to verify against indices values - rmm::device_uvector keys_positions(keys_size, stream); + rmm::device_uvector keys_positions(keys_size, stream); thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view( - data_type{type_id::UINT32}, keys_size, keys_positions.data(), nullptr, 0); + data_type{type_id::INT32}, keys_size, keys_positions.data(), nullptr, 0); return cudf::detail::contains(indices_view, keys_positions_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 04e2c17635d..286b1a87df2 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -158,8 +158,9 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (dictionary.is_empty()) - return std::make_unique>(0, false, stream, mr); + if (dictionary.is_empty()) { + return std::make_unique>(0, false, stream, mr); + } return type_dispatcher( dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr); } @@ -169,8 +170,9 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (dictionary.is_empty()) - return std::make_unique>(0, false, stream, mr); + if (dictionary.is_empty()) { + return std::make_unique>(0, false, stream, mr); + } return type_dispatcher( dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr); } diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 057e563c86e..5e7f3ecf0be 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -200,13 +200,13 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( // to modify the buffers. switch (schema->storage_type) { case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_UINT8: return data_type(type_id::INT8); case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_UINT16: return data_type(type_id::INT16); case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_UINT32: return data_type(type_id::INT32); case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_UINT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 2e9504a6726..893256b53f8 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -273,13 +273,13 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()storage_type) { case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_UINT8: return data_type(type_id::INT8); case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_UINT16: return data_type(type_id::INT16); case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_UINT32: return data_type(type_id::INT32); case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_UINT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index b2d64dac7c8..9e8525cd96b 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -132,7 +132,7 @@ TYPED_TEST_SUITE(DictionaryGetValueTest, cudf::test::FixedWidthTypesWithoutFixed TYPED_TEST(DictionaryGetValueTest, BasicGet) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices{0, 0, 1, 2, 1, 3, 3, 2}; + cudf::test::fixed_width_column_wrapper indices{0, 0, 1, 2, 1, 3, 3, 2}; auto col = cudf::make_dictionary_column(keys, indices); auto s = cudf::get_element(*col, 2); @@ -147,7 +147,7 @@ TYPED_TEST(DictionaryGetValueTest, BasicGet) TYPED_TEST(DictionaryGetValueTest, GetFromNullable) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices( + cudf::test::fixed_width_column_wrapper indices( {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false}); auto col = cudf::make_dictionary_column(keys, indices); @@ -163,7 +163,7 @@ TYPED_TEST(DictionaryGetValueTest, GetFromNullable) TYPED_TEST(DictionaryGetValueTest, GetNull) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices( + cudf::test::fixed_width_column_wrapper indices( {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false}); auto col = cudf::make_dictionary_column(keys, indices); diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index ebc8c11e86c..da8231fb8be 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -41,7 +41,7 @@ TEST_F(DictionaryAddKeysTest, StringsColumn) cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee", "fff"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0}); + cudf::test::fixed_width_column_wrapper indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected); } @@ -58,7 +58,7 @@ TEST_F(DictionaryAddKeysTest, FloatColumn) cudf::test::fixed_width_column_wrapper keys_expected{-11.75, 0.5, 4.25, 5.0, 7.125}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{2, 4, 1, 0, 4, 1}; + cudf::test::fixed_width_column_wrapper expected{2, 4, 1, 0, 4, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index dfa3ede5d46..46319bb376d 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -34,7 +34,7 @@ TEST_F(DictionaryEncodeTest, EncodeStringColumn) cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0}); + cudf::test::fixed_width_column_wrapper indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected); } @@ -48,7 +48,7 @@ TEST_F(DictionaryEncodeTest, EncodeFloat) cudf::test::fixed_width_column_wrapper keys_expected{-11.75, 0.5, 4.25, 7.125}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{2, 3, 1, 0, 3, 1}; + cudf::test::fixed_width_column_wrapper expected{2, 3, 1, 0, 3, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } @@ -64,7 +64,7 @@ TEST_F(DictionaryEncodeTest, EncodeWithNull) cudf::test::fixed_width_column_wrapper keys_expected{0, 111, 222, 333, 444}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{4, 0, 3, 1, 2, 5, 2, 4, 0}; + cudf::test::fixed_width_column_wrapper expected{4, 0, 3, 1, 2, 5, 2, 4, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } @@ -72,6 +72,6 @@ TEST_F(DictionaryEncodeTest, InvalidEncode) { cudf::test::fixed_width_column_wrapper input{0, 1, 2, 3, -1, -2, -3}; - EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::INT16}), + EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16}), cudf::logic_error); } diff --git a/cpp/tests/dictionary/factories_test.cpp b/cpp/tests/dictionary/factories_test.cpp index 051ea45aed6..30e3984d66d 100644 --- a/cpp/tests/dictionary/factories_test.cpp +++ b/cpp/tests/dictionary/factories_test.cpp @@ -29,7 +29,7 @@ struct DictionaryFactoriesTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFactoriesTest, CreateFromColumnViews) { cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); - cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; auto dictionary = cudf::make_dictionary_column(keys, values); cudf::dictionary_column_view view(dictionary->view()); @@ -41,8 +41,8 @@ TEST_F(DictionaryFactoriesTest, CreateFromColumnViews) TEST_F(DictionaryFactoriesTest, ColumnViewsWithNulls) { cudf::test::fixed_width_column_wrapper keys{-11.75, 4.25, 7.125, 0.5, 12.0}; - std::vector h_values{1, 3, 2, 0, 1, 4, 1}; - cudf::test::fixed_width_column_wrapper indices( + std::vector h_values{1, 3, 2, 0, 1, 4, 1}; + cudf::test::fixed_width_column_wrapper indices( h_values.begin(), h_values.end(), thrust::make_transform_iterator(h_values.begin(), [](auto v) { return v > 0; })); @@ -50,8 +50,7 @@ TEST_F(DictionaryFactoriesTest, ColumnViewsWithNulls) cudf::dictionary_column_view view(dictionary->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -59,16 +58,15 @@ TEST_F(DictionaryFactoriesTest, CreateFromColumns) { std::vector h_keys{"pear", "apple", "fruit", "macintosh"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{1, 2, 3, 1, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{1, 2, 3, 1, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column(keys.release(), values.release(), rmm::device_buffer{}, 0); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -77,8 +75,8 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls) { std::vector h_keys{-1234567890, -987654321, 0, 19283714}; cudf::test::fixed_width_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{1, 2, 3, 1, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{1, 2, 3, 1, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto size = static_cast(h_values.size()); rmm::device_buffer null_mask = create_null_mask(size, cudf::mask_state::ALL_NULL); auto dictionary = @@ -88,8 +86,7 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls) EXPECT_EQ(size, view.null_count()); cudf::test::fixed_width_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -98,15 +95,15 @@ TEST_F(DictionaryFactoriesTest, KeysWithNulls) { cudf::test::fixed_width_column_wrapper keys{{0, 1, 2, 3, 4}, {true, true, true, false, true}}; - cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; + cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error); } TEST_F(DictionaryFactoriesTest, IndicesWithNulls) { cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 3, 4}; - cudf::test::fixed_width_column_wrapper indices{{5, 4, 3, 2, 1, 0}, - {true, true, true, false, true, false}}; + cudf::test::fixed_width_column_wrapper indices{{5, 4, 3, 2, 1, 0}, + {true, true, true, false, true, false}}; EXPECT_THROW( cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0), cudf::logic_error); @@ -115,7 +112,7 @@ TEST_F(DictionaryFactoriesTest, IndicesWithNulls) TEST_F(DictionaryFactoriesTest, InvalidIndices) { cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 3, 4}; - cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; + cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error); EXPECT_THROW( cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0), diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 2774173b80a..d5877f12184 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -31,8 +31,8 @@ TEST_F(DictionarySearchTest, StringsColumn) auto result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("ccc")); EXPECT_TRUE(result->is_valid()); - auto n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{3}, n_result->value()); + auto n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{3}, n_result->value()); result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("eee")); EXPECT_FALSE(result->is_valid()); @@ -40,8 +40,8 @@ TEST_F(DictionarySearchTest, StringsColumn) cudf::string_scalar("eee"), cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{5}, n_result->value()); + n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{5}, n_result->value()); } TEST_F(DictionarySearchTest, WithNulls) @@ -51,8 +51,8 @@ TEST_F(DictionarySearchTest, WithNulls) auto result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar(4)); EXPECT_TRUE(result->is_valid()); - auto n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{0}, n_result->value()); + auto n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{0}, n_result->value()); result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar(5)); EXPECT_FALSE(result->is_valid()); @@ -60,8 +60,8 @@ TEST_F(DictionarySearchTest, WithNulls) cudf::numeric_scalar(5), cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{1}, n_result->value()); + n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{1}, n_result->value()); } TEST_F(DictionarySearchTest, EmptyColumn) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index d93ef28aab8..1ab11b374b6 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -460,19 +460,17 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) // test dictionary arrays with different index types // cudf asserts that the index type must be unsigned auto array1 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array2 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array3 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); // create equivalent cudf dictionary columns auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); - auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); - auto ind2_col = - cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); - auto ind3_col = - cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind2_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind3_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); vector_of_columns columns; columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col)); @@ -485,19 +483,19 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) ArrowSchemaInit(input_schema.get()); NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2])); NANOARROW_THROW_NOT_OK( diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 8be7e087b6d..7d655cdbaba 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -212,6 +212,8 @@ DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); +DEFINE_NANOARROW_STORAGE(int8_t, INT8); +DEFINE_NANOARROW_STORAGE(int16_t, INT16); #undef DEFINE_NANOARROW_STORAGE diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 29aa928c277..112b3e1d8e2 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -48,7 +48,6 @@ get_nanoarrow_cudf_table(cudf::size_type length) .release()); auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), @@ -103,7 +102,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) schema->children[1]->flags = 0; } - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_INT32)); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); @@ -181,7 +180,7 @@ get_nanoarrow_tables(cudf::size_type length) populate_from_col(arrow->children[0], table->get_column(0).view()); populate_from_col(arrow->children[1], table->get_column(1).view()); - populate_dict_from_col( + populate_dict_from_col( arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view())); populate_from_col(arrow->children[3], table->get_column(3).view()); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 86295d8efb1..aebdc11d445 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -63,7 +63,6 @@ std::pair, std::shared_ptr> get_table auto validity_generator = []() { return rand() % 7 != 0; }; std::generate( list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - // cudf::size_type n = 0; std::generate( list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { return (n++) * length_of_individual_list; @@ -87,7 +86,6 @@ std::pair, std::shared_ptr> get_table .release()); auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) @@ -120,11 +118,12 @@ std::pair, std::shared_ptr> get_table auto int64array = get_arrow_array(int64_data, validity); auto string_array = get_arrow_array(string_data, validity); + auto dict_col = cudf::dictionary::encode(col4); cudf::dictionary_column_view view(dict_col->view()); auto keys = cudf::test::to_host(view.keys()).first; - auto indices = cudf::test::to_host(view.indices()).first; + auto indices = cudf::test::to_host(view.indices()).first; auto dict_array = get_arrow_dict_array(std::vector(keys.begin(), keys.end()), - std::vector(indices.begin(), indices.end()), + std::vector(indices.begin(), indices.end()), validity); auto boolarray = get_arrow_array(bool_data, bool_validity); auto list_array = get_arrow_list_array( diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index 6519b0ed4ee..d82f512329f 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1098,7 +1098,7 @@ TEST_F(LeadLagNonFixedWidthTest, Dictionary) auto expected_keys = cudf::test::strings_column_wrapper{input_strings}.release(); auto expected_values = - cudf::test::fixed_width_column_wrapper{ + cudf::test::fixed_width_column_wrapper{ {2, 3, 4, 5, 0, 0, 7, 8, 9, 10, 0, 0}, cudf::test::iterators::nulls_at(std::vector{4, 5, 10, 11})} .release(); @@ -1118,7 +1118,7 @@ TEST_F(LeadLagNonFixedWidthTest, Dictionary) auto expected_keys = cudf::test::strings_column_wrapper{input_strings}.release(); auto expected_values = - cudf::test::fixed_width_column_wrapper{ + cudf::test::fixed_width_column_wrapper{ {0, 0, 1, 2, 3, 4, 0, 6, 0, 7, 8, 9}, cudf::test::iterators::nulls_at(std::vector{0, 6})} .release(); auto expected_output = diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp index 03e4cf47470..498504ef212 100644 --- a/cpp/tests/streams/dictionary_test.cpp +++ b/cpp/tests/streams/dictionary_test.cpp @@ -29,7 +29,7 @@ class DictionaryTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryTest, FactoryColumnViews) { cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); - cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; auto dictionary = cudf::make_dictionary_column(keys, values, cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); @@ -42,15 +42,15 @@ TEST_F(DictionaryTest, FactoryColumns) { std::vector h_keys{"aaa", "ccc", "ddd", "www"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column( keys.release(), values.release(), cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -59,15 +59,15 @@ TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) { std::vector h_keys{"aaa", "ccc", "ddd", "www"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column( keys.release(), values.release(), rmm::device_buffer{}, 0, cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -75,7 +75,7 @@ TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) TEST_F(DictionaryTest, Encode) { cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); - cudf::data_type int32_type(cudf::type_id::UINT32); + cudf::data_type int32_type(cudf::type_id::INT32); cudf::column_view col_view = col; cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream()); } From 628a17059377d96c2d3f6229e7deb54273b3ea5e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 25 Nov 2024 10:46:45 -0500 Subject: [PATCH 2/5] support only signed-to-signed in arrow interop --- cpp/src/interop/from_arrow_device.cu | 17 +++++------------ cpp/src/interop/from_arrow_host.cu | 17 +++++------------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 5e7f3ecf0be..cb3c4c55a61 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -194,19 +194,12 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( get_column(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. + // cudf dictionary requires a signed type for the indices switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::INT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::INT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::INT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::INT64); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 893256b53f8..b5d2427e288 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -267,19 +267,12 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()dictionary, keys_type, true, stream, mr); auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. + // cudf dictionary requires a signed type for the indices switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::INT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::INT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::INT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::INT64); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); From 29297804c2128e878d96eeedcb5d7346806e34e0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 25 Nov 2024 10:52:26 -0500 Subject: [PATCH 3/5] sort DEFINE_NANOARROW_STORAGE items --- cpp/tests/interop/nanoarrow_utils.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 7d655cdbaba..b7b8202a3c2 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -200,20 +200,20 @@ struct nanoarrow_storage_type {}; static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \ } -DEFINE_NANOARROW_STORAGE(bool, BOOL); +DEFINE_NANOARROW_STORAGE(int8_t, INT8); +DEFINE_NANOARROW_STORAGE(int16_t, INT16); +DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(int64_t, INT64); +DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(uint16_t, UINT16); DEFINE_NANOARROW_STORAGE(uint64_t, UINT64); +DEFINE_NANOARROW_STORAGE(bool, BOOL); DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32); DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); -DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); -DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); -DEFINE_NANOARROW_STORAGE(int8_t, INT8); -DEFINE_NANOARROW_STORAGE(int16_t, INT16); #undef DEFINE_NANOARROW_STORAGE From 6e0c139905c4f681f8383b7f935add5a5841cef5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 3 Dec 2024 07:58:33 -0500 Subject: [PATCH 4/5] add is_signed trait --- cpp/include/cudf/utilities/traits.hpp | 23 ++++++++++++++++++++++ cpp/src/dictionary/dictionary_factories.cu | 6 ++++-- cpp/src/dictionary/encode.cu | 3 ++- cpp/src/utilities/traits.cpp | 16 +++++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 22a67ca049a..dae1cd38832 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -217,6 +217,29 @@ constexpr inline bool is_index_type() */ bool is_index_type(data_type type); +/** + * @brief Indicates whether the type `T` is a signed numeric type. + * + * @tparam T The type to verify + * @return true `T` is signed numeric + */ +template +constexpr inline bool is_signed() +{ + return std::is_signed_v; +} + +/** + * @brief Indicates whether `type` is a signed numeric `data_type`. + * + * "Signed Numeric" types include fundamental integral types such as `INT*` + * but can also be `FLOAT*` types. + * + * @param type The `data_type` to verify + * @return true `type` is signed numeric + */ +bool is_signed(data_type type); + /** * @brief Indicates whether the type `T` is a unsigned numeric type. * diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index de9550f012d..9db0a45c86b 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -33,7 +33,8 @@ struct dispatch_create_indices { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(std::is_signed(), "indices must be an signed type"); + CUDF_EXPECTS(std::is_signed() && std::is_integral(), + "indices must be an signed type"); column_view indices_view{ indices.type(), indices.size(), indices.data(), nullptr, 0, indices.offset()}; return std::make_unique(indices_view, stream, mr); @@ -83,7 +84,8 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu { CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls"); CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls"); - CUDF_EXPECTS(!is_unsigned(indices_column->type()), "indices must be type unsigned integer"); + CUDF_EXPECTS(is_signed(indices_column->type()) && is_index_type(indices_column->type()), + "indices must be type unsigned integer"); auto count = indices_column->size(); std::vector> children; diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 9e934341787..5935b4f13e8 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -44,7 +44,8 @@ std::unique_ptr encode(column_view const& input_column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(!is_unsigned(indices_type), "indices must be type signed integer"); + CUDF_EXPECTS(is_signed(indices_type) && is_index_type(indices_type), + "indices must be type signed integer"); CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, "cannot encode a dictionary from a dictionary"); diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index 41ee4e960b6..86b4db02f54 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -127,6 +127,22 @@ struct is_index_type_impl { */ bool is_index_type(data_type type) { return cudf::type_dispatcher(type, is_index_type_impl{}); } +struct is_signed_impl { + template + constexpr bool operator()() + { + return is_signed(); + } +}; + +/** + * @brief Indicates whether `type` is a signed numeric `data_type`. + * + * @param type The `data_type` to verify + * @return true `type` is signed numeric + */ +bool is_signed(data_type type) { return cudf::type_dispatcher(type, is_signed_impl{}); } + struct is_unsigned_impl { template constexpr bool operator()() From d5970e7b212aeea37ec2e577b1be72b8ceb97c4a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 3 Dec 2024 19:37:34 -0500 Subject: [PATCH 5/5] fix if-signed check in dictionary factory functor --- cpp/src/dictionary/dictionary_factories.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 9db0a45c86b..9f81c852a30 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -33,8 +33,7 @@ struct dispatch_create_indices { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(std::is_signed() && std::is_integral(), - "indices must be an signed type"); + CUDF_EXPECTS(cudf::is_signed(), "indices must be a signed type"); column_view indices_view{ indices.type(), indices.size(), indices.data(), nullptr, 0, indices.offset()}; return std::make_unique(indices_view, stream, mr);