Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.04' into feat/pylibcudf_scan_reduce
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr committed Feb 6, 2024
2 parents c3b414a + c7e3dc5 commit 68a0a76
Show file tree
Hide file tree
Showing 45 changed files with 804 additions and 638 deletions.
2 changes: 1 addition & 1 deletion ci/check_style.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

set -euo pipefail

Expand Down
2 changes: 1 addition & 1 deletion cpp/examples/fetch_dependencies.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down
2 changes: 1 addition & 1 deletion cpp/libcudf_kafka/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down
77 changes: 36 additions & 41 deletions cpp/src/text/bpe/byte_pair_encoding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

Expand Down Expand Up @@ -76,17 +79,17 @@ constexpr int block_size = 512;
template <typename MapRefType>
struct bpe_unpairable_offsets_fn {
cudf::device_span<char const> d_chars;
cudf::size_type offset;
int64_t offset;
MapRefType const d_map;
__device__ cudf::size_type operator()(cudf::size_type idx)
__device__ int64_t operator()(int64_t idx)
{
if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; }

auto const itr = d_chars.data() + idx;
auto const end = d_chars.end();
auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr));
auto const next = itr + lhs.size_bytes();
auto output = 0;
auto output = 0L;
if (next < end) {
auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next));
// see if both halves exist anywhere in the table, if not these are unpairable
Expand Down Expand Up @@ -123,6 +126,7 @@ struct bpe_unpairable_offsets_fn {
*/
template <typename MapRefType>
CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings,
char const* d_input_chars,
MapRefType const d_map,
int8_t* d_spaces_data, // working memory
cudf::size_type* d_ranks_data, // more working memory
Expand All @@ -134,10 +138,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings,
static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);

auto const d_str = d_strings.element<cudf::string_view>(str_idx);
auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const d_str = d_strings.element<cudf::string_view>(str_idx);
auto const offset = thrust::distance(d_input_chars, d_str.data());

auto const d_spaces = d_spaces_data + offset;
auto const end_spaces = d_spaces + d_str.size_bytes();
Expand Down Expand Up @@ -292,6 +294,7 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings,
* @param d_sizes Output sizes of each row
*/
CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings,
char const* d_input_chars,
int8_t* d_spaces_data, // where separators are inserted
cudf::size_type* d_sizes // output sizes of encoded strings
)
Expand All @@ -311,9 +314,7 @@ CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings,
return;
}

auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const offset = thrust::distance(d_input_chars, d_str.data());

auto const d_spaces = d_spaces_data + offset;
auto const end_spaces = d_spaces + d_str.size_bytes();
Expand Down Expand Up @@ -352,27 +353,22 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const

auto const d_strings = cudf::column_device_view::create(input.parent(), stream);

auto const first_offset = (input.offset() == 0) ? 0
: cudf::detail::get_value<cudf::size_type>(
auto const first_offset = (input.offset() == 0) ? 0L
: cudf::strings::detail::get_offset_value(
input.offsets(), input.offset(), stream);
auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
? input.chars_size(stream)
: cudf::detail::get_value<cudf::size_type>(
? static_cast<int64_t>(input.chars_size(stream))
: cudf::strings::detail::get_offset_value(
input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const d_input_chars = input.chars_begin(stream) + first_offset;

auto const offset_data_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
auto offsets = cudf::make_numeric_column(
offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets->mutable_view().data<cudf::size_type>();

rmm::device_uvector<int8_t> d_spaces(chars_size, stream); // identifies non-merged pairs
// used for various purposes below: unpairable-offsets, pair ranks, separator insert positions
rmm::device_uvector<cudf::size_type> d_working(chars_size, stream);
rmm::device_uvector<int64_t> d_working(chars_size, stream);

auto const chars_begin = thrust::counting_iterator<cudf::size_type>(0);
auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
auto const chars_begin = thrust::counting_iterator<int64_t>(0);
auto const chars_end = thrust::counting_iterator<int64_t>(chars_size);

{
// this kernel locates unpairable sections of strings to create artificial string row
Expand All @@ -383,14 +379,16 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
auto up_fn = bpe_unpairable_offsets_fn<decltype(mp_map)>{d_chars_span, first_offset, mp_map};
thrust::transform(rmm::exec_policy_nosync(stream), chars_begin, chars_end, d_up_offsets, up_fn);
auto const up_end = // remove all but the unpairable offsets
thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0);
thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0L);
auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables

// new string boundaries created by combining unpairable offsets with the existing offsets
auto tmp_offsets = rmm::device_uvector<cudf::size_type>(unpairables + input.size() + 1, stream);
auto tmp_offsets = rmm::device_uvector<int64_t>(unpairables + input.size() + 1, stream);
auto input_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
thrust::merge(rmm::exec_policy_nosync(stream),
input.offsets_begin(),
input.offsets_end(),
input_offsets,
input_offsets + input.size() + 1,
d_up_offsets,
up_end,
tmp_offsets.begin());
Expand All @@ -402,31 +400,28 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
tmp_offsets.resize(offsets_total, stream);

// temp column created with the merged offsets and the original chars data
auto const col_offsets =
cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
auto const tmp_size = offsets_total - 1;
auto const tmp_input = cudf::column_view(
auto const col_offsets = cudf::column_view(cudf::device_span<int64_t const>(tmp_offsets));
auto const tmp_size = offsets_total - 1;
auto const tmp_input = cudf::column_view(
input.parent().type(), tmp_size, input.chars_begin(stream), nullptr, 0, 0, {col_offsets});
auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);

// launch the byte-pair-encoding kernel on the temp column
rmm::device_uvector<int8_t> d_rerank(chars_size, stream); // more working memory;
auto const d_ranks = d_working.data(); // store pair ranks here
rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);
auto const pair_map = get_bpe_merge_pairs_impl(merge_pairs)->get_merge_pairs_ref();
bpe_parallel_fn<decltype(pair_map)><<<tmp_size, block_size, 0, stream.value()>>>(
*d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data());
*d_tmp_strings, d_input_chars, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data());
}

// compute the output sizes and store them in the d_offsets vector
// compute the output sizes
auto output_sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
*d_strings, d_spaces.data(), d_offsets);
*d_strings, d_input_chars, d_spaces.data(), output_sizes.data());

// convert sizes to offsets in-place
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds the column size limit",
std::overflow_error);
auto [offsets, bytes] = cudf::strings::detail::make_offsets_child_column(
output_sizes.begin(), output_sizes.end(), stream, mr);

// build the output: inserting separators to the input character data
rmm::device_uvector<char> chars(bytes, stream, mr);
Expand All @@ -436,8 +431,8 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) {
return d_spaces[idx] > 0; // separator to be inserted here
};
auto const copy_end = thrust::copy_if(
rmm::exec_policy_nosync(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero);
auto const copy_end =
cudf::detail::copy_if_safe(chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero, stream);

// this will insert the single-byte separator into positions specified in d_inserts
auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);
Expand Down
86 changes: 34 additions & 52 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand All @@ -37,12 +38,9 @@
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <cuda/functional>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand All @@ -60,10 +58,10 @@ namespace {
* member.
*/
struct string_tokens_positions_fn {
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::size_type const* d_token_offsets; // offsets into the d_token_positions for each string
position_pair* d_token_positions; // token positions in each string
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::detail::input_offsetalator d_token_offsets; // offsets of d_token_positions for each string
position_pair* d_token_positions; // token positions in each string

__device__ void operator()(cudf::size_type idx)
{
Expand Down Expand Up @@ -95,12 +93,12 @@ struct ngram_builder_fn {
cudf::column_device_view const d_strings; // strings to generate ngrams from
cudf::string_view const d_separator; // separator to place between them 'grams
cudf::size_type const ngrams; // ngram number to generate (2=bi-gram, 3=tri-gram)
cudf::size_type const* d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::size_type const* d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here
cudf::detail::input_offsetalator d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::detail::input_offsetalator d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here

__device__ cudf::size_type operator()(cudf::size_type idx)
{
Expand Down Expand Up @@ -165,16 +163,12 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// first, get the number of tokens per string to get the token-offsets
// Ex. token-counts = [3,2]; token-offsets = [0,3,5]
rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
auto d_token_offsets = token_offsets.data();
thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_token_offsets + 1,
strings_tokenizer{d_strings, d_delimiter},
thrust::plus<cudf::size_type>());
token_offsets.set_element_to_zero_async(0, stream);
auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens
auto const count_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
auto [token_offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_token_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());

// get the token positions (in bytes) per string
// Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
Expand All @@ -188,21 +182,17 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// compute the number of ngrams per string to get the total number of ngrams to generate
// Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
auto d_ngram_offsets = ngram_offsets.data();
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_ngram_offsets + 1,
auto const ngram_counts = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[d_token_offsets, ngrams] __device__(cudf::size_type idx) {
auto token_count = d_token_offsets[idx + 1] - d_token_offsets[idx];
auto token_count =
static_cast<cudf::size_type>(d_token_offsets[idx + 1] - d_token_offsets[idx]);
return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
}),
thrust::plus{});
ngram_offsets.set_element_to_zero_async(0, stream);
auto const total_ngrams = ngram_offsets.back_element(stream);
}));
auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_ngram_offsets = ngram_offsets->view().begin<cudf::size_type>();

// Compute the total size of the ngrams for each string (not for each ngram)
// Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
Expand All @@ -212,21 +202,14 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
// ngrams for each string.
// Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
// is built in memory like this: "a_bbbb_ccc"
rmm::device_uvector<cudf::size_type> chars_offsets(strings_count + 1, stream);
// First compute the output sizes for each string (this not the final output result)
thrust::transform(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
chars_offsets.begin(),
ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
// Convert the sizes to offsets
auto const output_chars_size = cudf::detail::sizes_to_offsets(
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds the column size limit",
std::overflow_error);

// First compute the output sizes for each string (this not the final output result)
auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
auto [chars_offsets, output_chars_size] = cudf::strings::detail::make_offsets_child_column(
sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_chars_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
Expand All @@ -245,14 +228,13 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
ngrams,
d_token_offsets,
d_token_positions,
chars_offsets.data(),
d_chars_offsets,
d_chars,
d_ngram_offsets,
ngram_sizes.data()});
// build the offsets column -- converting the ngram sizes into offsets
auto offsets_column = std::get<0>(
cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr));
offsets_column->set_null_count(0);
// create the output strings column
return make_strings_column(
total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
Expand Down
Loading

0 comments on commit 68a0a76

Please sign in to comment.