Skip to content

Commit

Permalink
Word-based nvtext::minhash function (#15368)
Browse files Browse the repository at this point in the history
Experimental implementation for #15055 
The input is a lists column of strings where each string in each row is expected as a word to be hashed. The minimum hash for that row is returned in a lists column where each row contains a minhash per input hash seed.
Here the caller is expected to produce the words to be hashed.

```
std::unique_ptr<cudf::column> word_minhash(
  cudf::lists_column_view const& input,
  cudf::device_span<uint32_t const> seeds,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15368
  • Loading branch information
davidwendt authored Sep 17, 2024
1 parent 27c29eb commit 23351aa
Show file tree
Hide file tree
Showing 10 changed files with 498 additions and 11 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
Expand Down
77 changes: 77 additions & 0 deletions cpp/benchmarks/text/word_minhash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/minhash.hpp>

#include <rmm/device_buffer.hpp>

#include <nvbench/nvbench.cuh>

static void bench_word_minhash(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
auto const base64 = state.get_int64("hash_type") == 64;

data_profile const strings_profile =
data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
auto strings_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);

auto const num_offsets = (num_rows / row_width) + 1;
auto offsets = cudf::sequence(num_offsets,
cudf::numeric_scalar<cudf::size_type>(0),
cudf::numeric_scalar<cudf::size_type>(row_width));

auto source = cudf::make_lists_column(num_offsets - 1,
std::move(offsets),
std::move(strings_table->release().front()),
0,
rmm::device_buffer{});

data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256);
auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
auto seeds = seeds_table->get_column(0);

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(num_rows); // output are hashes

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
: nvtext::word_minhash(source->view(), seeds.view());
});
}

NVBENCH_BENCH(bench_word_minhash)
.set_name("word_minhash")
.add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
.add_int64_axis("row_width", {10, 100, 1000})
.add_int64_axis("seed_count", {2, 25})
.add_int64_axis("hash_type", {32, 64});
61 changes: 59 additions & 2 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
Expand Down Expand Up @@ -72,7 +73,7 @@ std::unique_ptr<cudf::column> minhash(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand Down Expand Up @@ -133,7 +134,7 @@ std::unique_ptr<cudf::column> minhash64(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand All @@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the minhash values for each row of strings per seed
*
* Hash values are computed from each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column are seed results for the corresponding
* input row. The order of the elements in each row match the order of
* the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*/
std::unique_ptr<cudf::column> word_minhash(
cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the minhash values for each row of strings per seed
*
* Hash values are computed from each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column are seed results for the corresponding
* input row. The order of the elements in each row match the order of
* the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x64_128 for the hash algorithm though
* only the first 64-bits of the hash are used in computing the output.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each string per seed
*/
std::unique_ptr<cudf::column> word_minhash64(
cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
147 changes: 141 additions & 6 deletions cpp/src/text/minhash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/hashing/detail/murmurhash3_x64_128.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/lists/list_device_view.cuh>
#include <cudf/lists/lists_column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -151,15 +153,111 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
mr);
auto d_hashes = hashes->mutable_view().data<hash_value_type>();

constexpr int block_size = 256;
cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
constexpr cudf::thread_index_type block_size = 256;
cudf::detail::grid_1d grid{
static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, seeds, width, d_hashes);

return hashes;
}

std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const& input,
/**
* @brief Compute the minhash of each list row of strings for each seed
*
* This is a warp-per-row algorithm where parallel threads within a warp
* work on strings in a single list row.
*
* @tparam HashFunction hash function to use on each string
*
* @param d_input List of strings to process
* @param seeds Seeds for hashing each string
* @param d_hashes Minhash output values (one per row)
*/
template <
typename HashFunction,
typename hash_value_type = std::
conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input,
cudf::device_span<hash_value_type const> seeds,
hash_value_type* d_hashes)
{
auto const idx = cudf::detail::grid_1d::global_thread_id();
auto const row_idx = idx / cudf::detail::warp_size;

if (row_idx >= d_input.size()) { return; }
if (d_input.is_null(row_idx)) { return; }

auto const d_row = cudf::list_device_view(d_input, row_idx);
auto const d_output = d_hashes + (row_idx * seeds.size());

// initialize hashes output for this row
auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
if (lane_idx == 0) {
auto const init = d_row.size() == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
}
__syncwarp();

// each lane hashes a string from the input row
for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) {
auto const hash_str =
d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element<cudf::string_view>(str_idx);
for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
auto const hasher = HashFunction(seeds[seed_idx]);
// hash string and store the min value
hash_value_type hv;
if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
hv = hasher(hash_str);
} else {
// This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
// but only uses the first uint64 value as requested by the LLM team.
hv = thrust::get<0>(hasher(hash_str));
}
cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
ref.fetch_min(hv, cuda::std::memory_order_relaxed);
}
}
}

template <
typename HashFunction,
typename hash_value_type = std::
conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
std::unique_ptr<cudf::column> word_minhash_fn(cudf::lists_column_view const& input,
cudf::device_span<hash_value_type const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
"The number of seeds times the number of input rows exceeds the column size limit",
std::overflow_error);

auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
if (input.is_empty()) { return cudf::make_empty_column(output_type); }

auto const d_input = cudf::column_device_view::create(input.parent(), stream);

auto hashes = cudf::make_numeric_column(output_type,
input.size() * static_cast<cudf::size_type>(seeds.size()),
cudf::mask_state::UNALLOCATED,
stream,
mr);
auto d_hashes = hashes->mutable_view().data<hash_value_type>();
auto lcdv = cudf::detail::lists_column_device_view(*d_input);

constexpr cudf::thread_index_type block_size = 256;
cudf::detail::grid_1d grid{
static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
minhash_word_kernel<HashFunction>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(lcdv, seeds, d_hashes);

return hashes;
}

std::unique_ptr<cudf::column> build_list_result(cudf::column_view const& input,
std::unique_ptr<cudf::column>&& hashes,
cudf::size_type seeds_size,
rmm::cuda_stream_view stream,
Expand All @@ -176,7 +274,7 @@ std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const&
std::move(offsets),
std::move(hashes),
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
cudf::detail::copy_bitmask(input, stream, mr),
stream,
mr);
// expect this condition to be very rare
Expand Down Expand Up @@ -208,7 +306,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
{
using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
auto hashes = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
}

std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
Expand All @@ -232,7 +330,27 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
{
using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
auto hashes = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
}

std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
auto hashes = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
}

std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
auto hashes = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
}
} // namespace detail

Expand Down Expand Up @@ -276,4 +394,21 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
return detail::minhash64(input, seeds, width, stream, mr);
}

std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::word_minhash(input, seeds, stream, mr);
}

std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::word_minhash64(input, seeds, stream, mr);
}
} // namespace nvtext
Loading

0 comments on commit 23351aa

Please sign in to comment.