From e252fb4b1e5b09d2fff96935cc5cd6a667606138 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 9 Oct 2024 15:01:06 -0700 Subject: [PATCH] Complete `make_strings_column_batch` Signed-off-by: Nghia Truong --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/make_strings_column.cu | 117 +++++++++++ cpp/include/cudf/column/column_factories.hpp | 19 ++ cpp/include/cudf/detail/valid_if.cuh | 68 ++++++ .../detail/strings_column_factories.cuh | 92 +++++--- cpp/src/strings/strings_column_factories.cu | 196 ++++++++++++++++-- cpp/tests/strings/factories_test.cu | 107 ++++++++++ 7 files changed, 551 insertions(+), 49 deletions(-) create mode 100644 cpp/benchmarks/string/make_strings_column.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b0f75b25975..a49e196cb52 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -383,6 +383,7 @@ ConfigureNVBench( string/join_strings.cpp string/lengths.cpp string/like.cpp + string/make_strings_column.cu string/replace_re.cpp string/reverse.cpp string/slice.cpp diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu new file mode 100644 index 00000000000..10c7eb65bdd --- /dev/null +++ b/cpp/benchmarks/string/make_strings_column.cu @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +constexpr int min_row_width = 0; +constexpr int max_row_width = 50; + +static void BM_make_strings_column(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const has_nulls = static_cast(state.get_int64("has_nulls")); + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width) + .null_probability(has_nulls ? 
std::optional{0.1} : std::nullopt); + auto const data_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + using string_index_pair = thrust::pair; + auto const stream = cudf::get_default_stream(); + auto input = rmm::device_uvector(data_table->num_rows(), stream); + auto const d_data_ptr = + cudf::column_device_view::create(data_table->get_column(0).view(), stream); + thrust::tabulate(rmm::exec_policy(stream), + input.begin(), + input.end(), + [data_col = *d_data_ptr] __device__(auto const idx) { + if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const row = data_col.element(idx); + return string_index_pair{row.data(), row.size_bytes()}; + }); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto const output = cudf::make_strings_column(input, stream); + }); +} + +static void BM_make_strings_column_batch(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const has_nulls = static_cast(state.get_int64("has_nulls")); + auto const batch_size = static_cast(state.get_int64("batch_size")); + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width) + .null_probability(has_nulls ? std::optional{0.1} : std::nullopt); + auto const data_table = create_random_table( + cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile); + + using string_index_pair = thrust::pair; + auto const stream = cudf::get_default_stream(); + auto input_data = std::vector>{}; + auto input = std::vector>{}; + input_data.reserve(batch_size); + input.reserve(batch_size); + for (auto i = 0; i < batch_size; ++i) { + auto const d_data_ptr = + cudf::column_device_view::create(data_table->get_column(i).view(), stream); + auto batch_input = rmm::device_uvector(data_table->num_rows(), stream); + thrust::tabulate(rmm::exec_policy(stream), + batch_input.begin(), + batch_input.end(), + [data_col = *d_data_ptr] __device__(auto const idx) { + if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const row = data_col.element(idx); + return string_index_pair{row.data(), row.size_bytes()}; + }); + input_data.emplace_back(std::move(batch_input)); + input.emplace_back(input_data.back()); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto const output = cudf::make_strings_column_batch(input, stream); + }); +} + +NVBENCH_BENCH(BM_make_strings_column) + .set_name("make_strings_column") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000, 200'000'000}) + .add_int64_axis("has_nulls", {0, 1}); + +NVBENCH_BENCH(BM_make_strings_column_batch) + .set_name("make_strings_column_batch") + .add_int64_axis("num_rows", {1'000'000, 10'000'000, 20'000'000}) + .add_int64_axis("has_nulls", {0, 1}) + .add_int64_axis("batch_size", {10, 50, 100, 200}); diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index c3b68b52c36..723768bb068 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -378,6 +378,25 @@ std::unique_ptr make_strings_column( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = 
cudf::get_current_device_resource_ref());

+/**
+ * @brief Construct a batch of STRING type columns given an array of device spans of
+ * pointer/size pairs
+ *
+ * This function has input/output expectations similar to the `make_strings_column()` API that
+ * accepts only one device span of pointer/size pairs. The difference is that this variant is
+ * designed to create many strings columns at once with minimal overhead from multiple kernel
+ * launches and stream synchronizations.
+ *
+ * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory
+ * address or `nullptr` (indicating a null string), and size is string length (in bytes)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for memory allocation of the output columns
+ * @return Array of constructed strings columns
+ */
+std::vector<std::unique_ptr<column>> make_strings_column_batch(
+  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Construct a STRING type column given a device span of string_view.
  *
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index cfb2e70bfed..0dca0cce85b 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -18,6 +18,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -188,5 +189,72 @@ CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1,
   }
 }
 
+/**
+ * @brief Populates a set of bitmasks by applying a predicate to the elements of the
+ * corresponding input spans
+ *
+ * For output mask `i`, the state of bit `j` is determined by `p(input[i][j])`. If the predicate
+ * evaluates to true, the bit is set to `1`; otherwise it is set to `0`. The number of valid (set)
+ * bits of each mask is accumulated into the corresponding element of `valid_counts`.
+ *
+ * Example Arguments:
+ * input: [[(ptr, 5), (nullptr, 0)], [(ptr, 3)]]
+ * p: [](auto const& pair){ return pair.first != nullptr; }
+ * masks: [[b00...], [b0...]]
+ * valid_counts: [0, 0]
+ *
+ * Example Results:
+ * masks: [[b10...], [b1...]]
+ * valid_counts: [1, 1]
+ *
+ * @param input Array of device spans, one per output mask, providing the elements to which the
+ * predicate is applied
+ * @param p Predicate: `bit = p(input[mask_idx][bit_idx])`
+ * @param masks Device array of pointers to the output bitmasks, one mask per input span
+ * @param valid_counts Device array (expected to be zero-initialized by the caller) used to
+ * accumulate the number of valid bits of each mask.
+ */ +template +CUDF_KERNEL void valid_if_batch_kernel(device_span const> input, + BinaryPredicate p, + bitmask_type* const* masks, + size_type* valid_counts) +{ + for (std::size_t mask_idx = 0; mask_idx < input.size(); ++mask_idx) { + auto const mask_input = input[mask_idx]; + auto const mask_num_bits = mask_input.size(); + auto const out_mask = masks[mask_idx]; + + std::size_t block_offset{blockIdx.x * blockDim.x}; + size_type warp_valid_count{0}; + while (block_offset < mask_num_bits) { + auto const thread_idx = block_offset + threadIdx.x; + auto const thread_active = thread_idx < mask_num_bits; + auto const bit_is_valid = thread_active && p(mask_input[thread_idx]); + auto const warp_validity = __ballot_sync(0xffff'ffffu, bit_is_valid); + + if (thread_active && threadIdx.x % warp_size == 0) { + out_mask[word_index(thread_idx)] = warp_validity; + } + + warp_valid_count += __popc(warp_validity); + block_offset += blockDim.x * gridDim.x; + } + + auto const block_valid_count = single_lane_block_sum_reduce(warp_valid_count); + if (threadIdx.x == 0) { atomicAdd(valid_counts + mask_idx, block_valid_count); } + } +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6b1b453a752..f18cf78578e 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -59,6 +59,61 @@ using string_index_pair = thrust::pair; */ constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; +/** + * @brief Gather characters to create a strings column using the given string_index_pair iterator + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param offsets The offsets for the output strings column + * @param chars_size The size (in bytes) of the chars data + * @param avg_bytes_per_row The average bytes per row + * @param begin Iterator to the first string_index_pair + * @param strings_count The number of strings + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return An array of chars gathered from the input string_index_pair iterator + */ +template +rmm::device_uvector make_chars_column(column_view const& offsets, + int64_t chars_size, + int64_t avg_bytes_per_row, + IndexPairIterator begin, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + // use a character-parallel kernel for long string lengths + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const str_begin = thrust::make_transform_iterator( + begin, cuda::proclaim_return_type([] __device__(auto ip) { + return string_view{ip.first, ip.second}; + })); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + chars_size, + stream, + mr); + } + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto d_chars = chars_data.data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + int64_t const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + 
thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), + strings_count, + copy_chars); + return chars_data; +} + /** * @brief Create a strings-type column from iterators of pointer/size pairs * @@ -88,8 +143,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -99,38 +152,9 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { - auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); - // use a character-parallel kernel for long string lengths - if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const str_begin = thrust::make_transform_iterator( - begin, cuda::proclaim_return_type([] __device__(auto ip) { - return string_view{ip.first, ip.second}; - })); - - return gather_chars(str_begin, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_offsets, - bytes, - stream, - mr); - } else { - // this approach is 2-3x faster for a large number of smaller string lengths - auto chars_data = rmm::device_uvector(bytes, stream, mr); - auto d_chars = chars_data.data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair const str = thrust::get<0>(item); - int64_t const offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), - strings_count, - copy_chars); - return chars_data; - } - }(); + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + auto chars_data = make_chars_column( + offsets_column->view(), bytes, avg_bytes_per_row, begin, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 07516f91dcf..972e689f151 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -17,35 +17,177 @@ #include #include #include +#include +#include #include -#include -#include #include -#include #include #include +#include #include -#include -#include - namespace cudf { +namespace strings::detail { + namespace { -struct string_view_to_pair { - string_view null_placeholder; - string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(string_view const& i) - { - return (i.data() == null_placeholder.data()) - ? 
thrust::pair{nullptr, 0} - : thrust::pair{i.data(), i.size_bytes()}; + +using column_string_pairs = cudf::device_span const>; + +template +std::pair>, rmm::device_uvector> +make_offsets_child_column_batch_async(std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + std::vector> offsets_columns(input.size()); + rmm::device_uvector chars_sizes(input.size(), stream); + for (std::size_t idx = 0; idx < input.size(); ++idx) { + auto const string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + auto offsets = make_numeric_column( + data_type{type_to_id()}, string_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto const offsets_transformer = cuda::proclaim_return_type( + [string_count, string_pairs = string_pairs.data()] __device__(size_type idx) -> size_type { + return idx < string_count ? string_pairs[idx].second : size_type{0}; + }); + auto const input_it = cudf::detail::make_counting_transform_iterator(0, offsets_transformer); + auto const d_offsets = offsets->mutable_view().template data(); + auto const output_it = cudf::detail::make_sizes_to_offsets_iterator( + d_offsets, d_offsets + string_count + 1, chars_sizes.data() + idx); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + input_it, + input_it + string_count + 1, + output_it, + int64_t{0}); + offsets_columns[idx] = std::move(offsets); } -}; + + return {std::move(offsets_columns), std::move(chars_sizes)}; +} + +std::pair, rmm::device_uvector> valid_if_batch_async( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const d_input = + cudf::detail::make_device_uvector_async(input, stream, cudf::get_current_device_resource_ref()); + auto const predicate = [] __device__(thrust::pair pair) -> bool { + return pair.first != nullptr; + }; + + std::vector null_masks(input.size()); + for (std::size_t idx = 0; idx < input.size(); ++idx) { + null_masks[idx] = cudf::create_null_mask( + static_cast(input[idx].size()), mask_state::UNINITIALIZED, stream, mr); + } + std::vector h_masks(input.size()); + std::transform(null_masks.begin(), null_masks.end(), h_masks.begin(), [](auto& mask) { + return reinterpret_cast(mask.data()); + }); + auto const d_masks = cudf::detail::make_device_uvector_async( + h_masks, stream, cudf::get_current_device_resource_ref()); + + rmm::device_uvector valid_counts(input.size(), stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), valid_counts.begin(), valid_counts.end(), 0); + + constexpr size_type block_size{256}; + auto const grid = cudf::detail::grid_1d{static_cast(input.size()), block_size}; + cudf::detail::valid_if_batch_kernel + <<>>( + device_span{d_input.data(), d_input.size()}, + // d_input.data(), + predicate, + d_masks.data(), + valid_counts.data()); + + return {std::move(null_masks), std::move(valid_counts)}; +} } // namespace +std::vector> make_strings_column_batch( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto [offsets_cols, d_chars_sizes] = + make_offsets_child_column_batch_async(input, stream, mr); + auto [null_masks, d_valid_counts] = valid_if_batch_async(input, stream, mr); + + auto const chars_sizes = cudf::detail::make_host_vector_async(d_chars_sizes, stream); + auto const valid_counts = cudf::detail::make_host_vector_async(d_valid_counts, stream); + + // This should be the only stream sync in the entire API. 
+ stream.synchronize(); + + auto const threshold = cudf::strings::get_offset64_threshold(); + auto const overflow_count = + std::count_if(chars_sizes.begin(), chars_sizes.end(), [threshold](auto const chars_size) { + return chars_size >= threshold; + }); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || overflow_count == 0, + "Size of output exceeds the column size limit", + std::overflow_error); + + if (overflow_count > 0) { + std::vector long_string_input; + std::vector long_string_col_idx; + long_string_input.reserve(overflow_count); + long_string_col_idx.reserve(overflow_count); + for (std::size_t idx = 0; idx < input.size(); ++idx) { + auto const chars_size = chars_sizes[idx]; + if (chars_size >= threshold) { + long_string_input.push_back(input[idx]); + long_string_col_idx.push_back(idx); + } + } + + [[maybe_unused]] auto [new_offsets_cols, d_new_chars_sizes] = + make_offsets_child_column_batch_async(long_string_input, stream, mr); + + // Update the new offsets columns. + // The new chars sizes should be the same as before, thus we don't need to update them. + for (std::size_t idx = 0; idx < long_string_col_idx.size(); ++idx) { + offsets_cols[long_string_col_idx[idx]] = std::move(new_offsets_cols[idx]); + } + } + + std::vector> chars_cols(input.size()); + std::vector> output(input.size()); + for (std::size_t idx = 0; idx < input.size(); ++idx) { + auto const strings_count = static_cast(input[idx].size()); + if (strings_count == 0) { + output[idx] = make_empty_column(type_id::STRING); + continue; + } + + auto const chars_size = chars_sizes[idx]; + auto const valid_count = valid_counts[idx]; + auto const avg_bytes_per_row = chars_size / std::max(valid_count, 1); + auto chars_data = make_chars_column(offsets_cols[idx]->view(), + chars_size, + avg_bytes_per_row, + input[idx].data(), + strings_count, + stream, + mr); + output[idx] = make_strings_column(strings_count, + std::move(offsets_cols[idx]), + chars_data.release(), + strings_count - valid_count, + std::move(null_masks[idx])); + } + + return output; +} + +} // namespace strings::detail + // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( device_span const> strings, @@ -57,6 +199,30 @@ std::unique_ptr make_strings_column( return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } +std::vector> make_strings_column_batch( + std::vector const>> const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + return cudf::strings::detail::make_strings_column_batch(input, stream, mr); +} + +namespace { +struct string_view_to_pair { + string_view null_placeholder; + string_view_to_pair(string_view n) : null_placeholder(n) {} + __device__ thrust::pair operator()(string_view const& i) + { + return (i.data() == null_placeholder.data()) + ? 
thrust::pair{nullptr, 0} + : thrust::pair{i.data(), i.size_bytes()}; + } +}; + +} // namespace + std::unique_ptr make_strings_column(device_span string_views, string_view null_placeholder, rmm::cuda_stream_view stream, diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 90054e41d36..8b84c9938b8 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -209,6 +209,7 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) namespace { using string_pair = thrust::pair; + struct string_view_to_pair { __device__ string_pair operator()(thrust::pair const& p) { @@ -234,3 +235,109 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) auto result = cudf::make_strings_column(pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); } + +struct StringsBatchConstructionTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsBatchConstructionTest, EmptyColumns) +{ + auto constexpr num_columns = 10; + auto const stream = cudf::get_default_stream(); + + auto const d_string_pairs = + rmm::device_uvector>{0, stream}; + auto const input = + std::vector const>>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +TEST_F(StringsBatchConstructionTest, AllNullsColumns) +{ + auto constexpr num_columns = 10; + auto constexpr num_rows = 100; + auto const stream = cudf::get_default_stream(); + + auto d_string_pairs = + rmm::device_uvector>{num_rows, stream}; + thrust::uninitialized_fill_n(rmm::exec_policy(stream), + d_string_pairs.data(), + d_string_pairs.size(), + thrust::pair{nullptr, 0}); + auto const input = + std::vector const>>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_strings_column(d_string_pairs); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +TEST_F(StringsBatchConstructionTest, CreateColumnsFromPairs) +{ + auto constexpr num_columns = 10; + auto constexpr num_rows = 1000; + auto const stream = cudf::get_default_stream(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + + std::vector offsets(h_test_strings.size() + 1, 0); + for (std::size_t i = 0; i < h_test_strings.size(); ++i) { + offsets[i + 1] = offsets[i] + (h_test_strings[i] ? 
strlen(h_test_strings[i]) : 0);
+  }
+
+  std::vector<char> h_buffer(offsets.back());
+  for (std::size_t i = 0; i < h_test_strings.size(); ++i) {
+    if (h_test_strings[i]) {
+      memcpy(h_buffer.data() + offsets[i], h_test_strings[i], strlen(h_test_strings[i]));
+    }
+  }
+  auto const d_test_strings = cudf::detail::make_device_uvector_sync(
+    h_buffer, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+
+  // Build the pointer/size pairs on the host, then copy each column's pairs to the device so
+  // that the spans passed to the batch API reference device memory.
+  std::vector<std::vector<thrust::pair<char const*, cudf::size_type>>> h_input(num_columns);
+  for (auto& col : h_input) {
+    col.resize(num_rows);
+    for (int idx = 0; idx < num_rows; ++idx) {
+      auto const data_idx = idx % static_cast<int>(h_test_strings.size());
+      col[idx]            = {d_test_strings.data() + offsets[data_idx],
+                             static_cast<cudf::size_type>(
+                               h_test_strings[data_idx] ? strlen(h_test_strings[data_idx]) : 0)};
+    }
+  }
+
+  std::vector<rmm::device_uvector<thrust::pair<char const*, cudf::size_type>>> input_data;
+  input_data.reserve(num_columns);
+  for (auto const& col : h_input) {
+    input_data.emplace_back(
+      cudf::detail::make_device_uvector_sync(col, stream, cudf::get_current_device_resource_ref()));
+  }
+
+  std::vector<cudf::device_span<thrust::pair<char const*, cudf::size_type> const>> input(
+    num_columns);
+  std::transform(input_data.begin(), input_data.end(), input.begin(), [](auto const& col) {
+    return cudf::device_span<thrust::pair<char const*, cudf::size_type> const>{col.data(),
+                                                                               col.size()};
+  });
+  auto const output = cudf::make_strings_column_batch(input, stream);
+
+  std::vector<std::unique_ptr<cudf::column>> expected;
+  expected.reserve(num_columns);
+  for (auto const& string_pairs : input) {
+    expected.emplace_back(cudf::make_strings_column(string_pairs));
+  }
+
+  for (std::size_t i = 0; i < num_columns; ++i) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected[i]->view(), output[i]->view());
+  }
+}
+
+TEST_F(StringsBatchConstructionTest, CreateLongStringsColumns)
+{
+  //
+}
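// ---------------------------------------------------------------------------
// Editor's usage sketch (not part of the patch): a minimal caller of the new
// cudf::make_strings_column_batch() API declared above. It builds two small columns whose
// pointer/size pairs reference one device character buffer and creates both strings columns with
// a single call. The function name `build_two_columns` and the literal data are illustrative
// assumptions; the cudf/rmm/thrust calls mirror the ones used in the patch itself.
#include <cudf/column/column_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>

#include <thrust/pair.h>

#include <cuda_runtime.h>

#include <memory>
#include <string>
#include <vector>

std::vector<std::unique_ptr<cudf::column>> build_two_columns()
{
  using string_index_pair = thrust::pair<char const*, cudf::size_type>;
  auto const stream       = cudf::get_default_stream();

  // Device buffer holding the concatenated characters "helloworld".
  std::string const h_chars = "helloworld";
  rmm::device_uvector<char> d_chars(h_chars.size(), stream);
  cudaMemcpyAsync(
    d_chars.data(), h_chars.data(), h_chars.size(), cudaMemcpyHostToDevice, stream.value());

  // Column 0 is ["hello", <null>], column 1 is ["world"]; a null row is (nullptr, 0).
  std::vector<string_index_pair> const h_col0{{d_chars.data(), 5}, {nullptr, 0}};
  std::vector<string_index_pair> const h_col1{{d_chars.data() + 5, 5}};

  // The pair arrays themselves must also live in device memory.
  rmm::device_uvector<string_index_pair> d_col0(h_col0.size(), stream);
  rmm::device_uvector<string_index_pair> d_col1(h_col1.size(), stream);
  cudaMemcpyAsync(d_col0.data(),
                  h_col0.data(),
                  h_col0.size() * sizeof(string_index_pair),
                  cudaMemcpyHostToDevice,
                  stream.value());
  cudaMemcpyAsync(d_col1.data(),
                  h_col1.data(),
                  h_col1.size() * sizeof(string_index_pair),
                  cudaMemcpyHostToDevice,
                  stream.value());
  stream.synchronize();  // the host-side pair vectors are no longer needed after this point

  // One device span per output column; a single call builds the whole batch. The returned
  // columns own their own copies of the character data.
  std::vector<cudf::device_span<string_index_pair const>> input;
  input.emplace_back(d_col0.data(), d_col0.size());
  input.emplace_back(d_col1.data(), d_col1.size());
  return cudf::make_strings_column_batch(input, stream);
}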
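// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the CreateLongStringsColumns test above is still an
// empty stub. Based only on the overflow check visible in make_strings_column_batch(), one way to
// exercise it without allocating gigabytes of character data is to let every row point at the same
// small device buffer, so the accumulated chars size of a single batch entry reaches
// cudf::strings::get_offset64_threshold(); when large strings are disabled the call must then
// throw std::overflow_error. The test name is illustrative, and the sketch assumes the functions
// the patch itself calls (get_offset64_threshold, is_large_strings_enabled, typically declared in
// <cudf/strings/utilities.hpp>) plus the includes already present in factories_test.cu.
TEST_F(StringsBatchConstructionTest, CreateLongStringsColumnsSketch)
{
  if (cudf::strings::is_large_strings_enabled()) { return; }  // only the throwing path is sketched

  auto const stream = cudf::get_default_stream();

  // A single 1000-byte buffer; every row references it, so only the *total* size is huge.
  auto const d_chars = rmm::device_uvector<char>(1000, stream);
  auto const row     = thrust::pair<char const*, cudf::size_type>{d_chars.data(), 1000};

  auto const num_rows =
    static_cast<std::size_t>(cudf::strings::get_offset64_threshold() / 1000) + 1;
  auto d_pairs = rmm::device_uvector<thrust::pair<char const*, cudf::size_type>>(num_rows, stream);
  thrust::uninitialized_fill(rmm::exec_policy(stream), d_pairs.begin(), d_pairs.end(), row);

  std::vector<cudf::device_span<thrust::pair<char const*, cudf::size_type> const>> input;
  input.emplace_back(d_pairs.data(), d_pairs.size());
  EXPECT_THROW(cudf::make_strings_column_batch(input, stream), std::overflow_error);
}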