From 1719cda0b18bf3f15426f827fc49e23f0ec3bd40 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:41:11 -0500 Subject: [PATCH] Remove calls to strings_column_view::offsets_begin() (#15112) Removes calls to `cudf::strings_column_view::offsets_begin()` since the result cannot have a hardcoded integer type. The goal is to deprecate this member function in this release. Follow on changes may be required to further enable large strings support to these functions. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15112 --- cpp/examples/strings/custom_prealloc.cu | 2 +- cpp/src/transform/row_conversion.cu | 33 +++++++++++++------------ cpp/tests/io/json_type_cast_test.cu | 33 +++++++++++-------------- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index 93194899fe1..27b553731f8 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -98,7 +98,7 @@ std::unique_ptr redact_strings(cudf::column_view const& names, nvtxRangePushA("redact_strings"); auto const scv = cudf::strings_column_view(names); - auto const offsets = scv.offsets_begin(); + auto const offsets = scv.offsets().begin(); // create working memory to hold the output of each string auto working_memory = rmm::device_uvector(scv.chars_size(stream), stream); diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index 361a3610afa..32faa097d0e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,7 @@ struct batch_data { * @return pair of device vector of size_types of the row sizes of the table and a device vector of * offsets into the string column */ -std::pair, rmm::device_uvector> +std::pair, rmm::device_uvector> build_string_row_offsets(table_view const& tbl, size_type fixed_width_and_validity_size, rmm::cuda_stream_view stream) @@ -222,20 +223,20 @@ build_string_row_offsets(table_view const& tbl, thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); auto d_offsets_iterators = [&]() { - std::vector offsets_iterators; - auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } + std::vector offsets_iterators; + auto itr = thrust::make_transform_iterator( + tbl.begin(), [](auto const& col) -> cudf::detail::input_offsetalator { + return cudf::detail::offsetalator_factory::make_input_iterator( + strings_column_view(col).offsets(), col.offset()); }); - std::copy_if(offsets_iter, - offsets_iter + tbl.num_columns(), - std::back_inserter(offsets_iterators), - [](auto const& offset_ptr) { return offset_ptr != nullptr; }); + auto stencil = thrust::make_transform_iterator( + tbl.begin(), [](auto const& col) -> bool { return !is_fixed_width(col.type()); }); + thrust::copy_if(thrust::host, + itr, + itr + tbl.num_columns(), + stencil, + std::back_inserter(offsets_iterators), + thrust::identity{}); return make_device_uvector_sync( offsets_iterators, stream, rmm::mr::get_current_device_resource()); }(); @@ -858,7 +859,7 @@ CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, size_type const num_variable_columns, int8_t const** variable_input_data, size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, + cudf::detail::input_offsetalator* variable_col_offsets, size_type fixed_width_row_size, RowOffsetFunctor row_offsets, size_type const batch_row_offset, @@ -1844,7 +1845,7 @@ std::vector> convert_to_rows( batch_data& batch_info, offsetFunctor offset_functor, column_info_s const& column_info, - std::optional> variable_width_offsets, + std::optional> variable_width_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 8a541022ab0..fe430010f4b 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -34,6 +35,8 @@ #include +#include + #include #include #include @@ -43,25 +46,15 @@ using namespace cudf::test::iterators; struct JSONTypeCastTest : public cudf::test::BaseFixture {}; namespace { -struct offsets_to_length { - __device__ cudf::size_type operator()(thrust::tuple const& p) - { - return thrust::get<1>(p) - thrust::get<0>(p); - } -}; /// Returns length of each string in the column auto string_offset_to_length(cudf::strings_column_view const& column, rmm::cuda_stream_view stream) { - auto offsets_begin = column.offsets_begin(); - auto offsets_pair = - thrust::make_zip_iterator(thrust::make_tuple(offsets_begin, thrust::next(offsets_begin))); rmm::device_uvector svs_length(column.size(), stream); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - offsets_pair, - offsets_pair + column.size(), - svs_length.begin(), - offsets_to_length{}); + auto itr = + cudf::detail::offsetalator_factory::make_input_iterator(column.offsets(), column.offset()); + thrust::adjacent_difference( + rmm::exec_policy(stream), itr + 1, itr + column.size() + 1, svs_length.begin()); return svs_length; } } // namespace @@ -96,7 +89,8 @@ TEST_F(JSONTypeCastTest, String) auto str_col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), @@ -129,7 +123,8 @@ TEST_F(JSONTypeCastTest, Int) auto col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), @@ -169,7 +164,8 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask), @@ -238,7 +234,8 @@ TEST_F(JSONTypeCastTest, ErrorNulls) auto str_col = cudf::io::json::detail::parse_data( column.chars_begin(stream), - thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(column.offsets().begin(), svs_length.begin())), column.size(), type, std::move(null_mask),