From 789cbfdd69648fd7ec553922e64accb763ca3c57 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 30 May 2024 15:02:37 -0400 Subject: [PATCH] Use offsetalator in nvtext::tokenize_with_vocabulary (#15878) Updates the `token_counts_fn` kernel in `nvtext::tokenize_with_vocabulary` to use the offsetalator instead of a hardcoded `size_type` when accessing strings offsets. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15878 --- cpp/src/text/vocabulary_tokenize.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 8913ce22da8..f012f7ce09a 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -240,10 +240,10 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, return; } - auto const offsets = - d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>(); - auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const chars_begin = d_strings.data<char>() + offsets[d_strings.offset()]; + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = offsets_itr[str_idx + d_strings.offset()] - offsets_itr[d_strings.offset()]; + auto const chars_begin = d_strings.data<char>() + offsets_itr[d_strings.offset()]; auto const begin = d_str.data(); auto const end = begin + d_str.size_bytes();