Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into fea-wide-table-benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
mhaseeb123 authored Sep 17, 2024
2 parents cbd1715 + e98e109 commit 4851db3
Show file tree
Hide file tree
Showing 49 changed files with 1,629 additions and 533 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

### Conda

cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

```bash
conda install -c rapidsai -c conda-forge -c nvidia \
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
Expand Down
77 changes: 77 additions & 0 deletions cpp/benchmarks/text/word_minhash.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/minhash.hpp>

#include <rmm/device_buffer.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Benchmarks nvtext::word_minhash and nvtext::word_minhash64
 *
 * Builds a lists column of random strings in which every list row holds
 * `row_width` strings, generates `seed_count` random seeds of the matching
 * width and measures a single minhash call on that input.
 *
 * Axes read from `state`:
 * - `num_rows`   total number of strings placed in the child column
 * - `row_width`  number of strings in each list row
 * - `seed_count` number of seed values
 * - `hash_type`  64 selects word_minhash64; any other value selects word_minhash
 *
 * @param state nvbench state that supplies the axis values and receives the
 *              stream and memory-traffic counters
 */
static void bench_word_minhash(nvbench::state& state)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
  auto const base64     = state.get_int64("hash_type") == 64;

  // Random strings used as the child (the "words") of the lists column
  data_profile const strings_profile =
    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
  auto strings_table =
    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);

  // Offsets 0, row_width, 2*row_width, ... so that every list row has row_width strings.
  // Strings beyond the last full row are not referenced by any list row.
  auto const num_offsets = (num_rows / row_width) + 1;
  auto const num_lists   = num_offsets - 1;
  auto offsets           = cudf::sequence(num_offsets,
                                cudf::numeric_scalar<cudf::size_type>(0),
                                cudf::numeric_scalar<cudf::size_type>(row_width));

  // No null rows: null count is 0 and the null mask is empty
  auto source = cudf::make_lists_column(num_lists,
                                        std::move(offsets),
                                        std::move(strings_table->release().front()),
                                        0,
                                        rmm::device_buffer{});

  // Register the seed distribution for the type that is actually generated so the
  // 64-bit path uses the same NORMAL(0, 256) profile as the 32-bit path.
  auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
  data_profile const seeds_profile =
    data_profile_builder().no_validity().distribution(seed_type, distribution_id::NORMAL, 0, 256);
  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
  // Bind by reference: the table owns the column, copying it would duplicate device memory
  auto const& seeds = seeds_table->get_column(0);

  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
  auto chars_size = input.chars_size(cudf::get_default_stream());
  state.add_global_memory_reads<nvbench::int8_t>(chars_size);

  // The output holds one hash per list row per seed; the hash width follows hash_type
  auto const num_hashes = static_cast<nvbench::int64_t>(num_lists) * seed_count;
  if (base64) {
    state.add_global_memory_writes<nvbench::int64_t>(num_hashes);
  } else {
    state.add_global_memory_writes<nvbench::int32_t>(num_hashes);
  }

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
                         : nvtext::word_minhash(source->view(), seeds.view());
  });
}

// Registers bench_word_minhash with nvbench under the name "word_minhash".
// Each axis name below matches a state.get_int64() lookup in bench_word_minhash:
//  - num_rows:   number of random strings generated for the child column
//  - row_width:  step between consecutive list offsets (strings per list row)
//  - seed_count: number of seed values generated
//  - hash_type:  64 runs nvtext::word_minhash64, 32 runs nvtext::word_minhash
NVBENCH_BENCH(bench_word_minhash)
.set_name("word_minhash")
.add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
.add_int64_axis("row_width", {10, 100, 1000})
.add_int64_axis("seed_count", {2, 25})
.add_int64_axis("hash_type", {32, 64});
6 changes: 6 additions & 0 deletions cpp/doxygen/regex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions.

**Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.

By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) flag extends the set of line break characters to include:
- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`)
- Line separator (Unicode: `2028`, UTF-8: `E280A8`)
- Next line (Unicode: `0085`, UTF-8: `C285`)
- Carriage return (Unicode: `000D`, UTF-8: `0D`)

**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
- Unmatched paired special characters like `()`, `[]`, and `{}`.
Expand Down
17 changes: 8 additions & 9 deletions cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,29 +143,28 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
rmm::device_async_resource_ref mr);

/**
* @brief Create a tdigest column of empty clusters.
* @brief Create an empty tdigest column.
*
* The column created contains the specified number of rows of empty clusters.
* An empty tdigest column contains a single row of length 0
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns A tdigest column of empty clusters.
* @returns An empty tdigest column.
*/
CUDF_EXPORT
std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Create a scalar of an empty tdigest cluster.
* @brief Create an empty tdigest scalar.
*
* The returned scalar is a struct_scalar that contains a single row of an empty cluster.
* An empty tdigest scalar is a struct_scalar that contains a single row of length 0
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns A scalar of an empty tdigest cluster.
* @returns An empty tdigest scalar.
*/
std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
Expand Down
20 changes: 16 additions & 4 deletions cpp/include/cudf/strings/regex/flags.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ namespace strings {
* and to match the Python flag values.
*/
enum regex_flags : uint32_t {
DEFAULT = 0, ///< default
MULTILINE = 8, ///< the '^' and '$' honor new-line characters
DOTALL = 16, ///< the '.' matching includes new-line characters
ASCII = 256 ///< use only ASCII when matching built-in character classes
DEFAULT = 0, ///< default
MULTILINE = 8, ///< the '^' and '$' honor new-line characters
DOTALL = 16, ///< the '.' matching includes new-line characters
ASCII = 256, ///< use only ASCII when matching built-in character classes
EXT_NEWLINE = 512 ///< new-line matches extended characters
};

/**
Expand Down Expand Up @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f)
return (f & regex_flags::ASCII) == regex_flags::ASCII;
}

/**
 * @brief Checks whether the EXT_NEWLINE bit is set in the given flags
 *
 * @param f Regex flags to inspect
 * @return true when `f` has the EXT_NEWLINE bit set, false otherwise
 */
constexpr bool is_ext_newline(regex_flags const f)
{
  // EXT_NEWLINE is a single bit (512), so it is present exactly when the masked value is non-zero
  return (f & regex_flags::EXT_NEWLINE) != 0;
}

/**
* @brief Capture groups setting
*
Expand Down
11 changes: 8 additions & 3 deletions cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper

__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
if (byte_pos > 0)
while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
;
if (byte_pos > 0) {
if (byte_pos == char_pos) {
--byte_pos;
} else {
while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
;
}
}
--char_pos;
return *this;
}
Expand Down
20 changes: 10 additions & 10 deletions cpp/include/cudf_test/tdigest_utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);

// NOTE: an empty tdigest column still has 1 row.
auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
}
Expand Down Expand Up @@ -562,12 +562,12 @@ template <typename MergeFunc>
void tdigest_merge_empty(MergeFunc merge_op)
{
// 3 empty tdigests all in the same group
auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto a = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto b = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto c = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());
std::vector<column_view> cols;
cols.push_back(*a);
cols.push_back(*b);
Expand All @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
auto const delta = 1000;
auto result = merge_op(*values, delta);

auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
cudf::get_default_stream(), cudf::get_current_device_resource_ref());

CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
}
Expand Down
61 changes: 59 additions & 2 deletions cpp/include/nvtext/minhash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
Expand Down Expand Up @@ -72,7 +73,7 @@ std::unique_ptr<cudf::column> minhash(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand Down Expand Up @@ -133,7 +134,7 @@ std::unique_ptr<cudf::column> minhash64(
*
* @throw std::invalid_argument if the width < 2
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Strings column to compute minhash
* @param seeds Seed values used for the hash algorithm
Expand All @@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the minhash values for each row of strings per seed
*
* Hash values are computed from each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column contains the seed results for the
* corresponding input row. The order of the elements in each output row
* matches the order of the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x86_32 for the hash algorithm.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each input row per seed
*/
std::unique_ptr<cudf::column> word_minhash(
cudf::lists_column_view const& input,
cudf::device_span<uint32_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the 64-bit minhash values for each row of strings per seed
*
* Hash values are computed from each string in each row and the
* minimum hash value is returned for each row for each seed.
* Each row of the output list column contains the seed results for the
* corresponding input row. The order of the elements in each output row
* matches the order of the seeds provided in the `seeds` parameter.
*
* This function uses MurmurHash3_x64_128 for the hash algorithm though
* only the first 64-bits of the hash are used in computing the output.
*
* Any null row entries result in corresponding null output rows.
*
* @throw std::invalid_argument if seeds is empty
* @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
*
* @param input Lists column of strings to compute minhash
* @param seeds Seed values used for the hash algorithm
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return List column of minhash values for each input row per seed
*/
std::unique_ptr<cudf::column> word_minhash64(
cudf::lists_column_view const& input,
cudf::device_span<uint64_t const> seeds,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/page_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s,
{
bool has_repetition = s->col.max_level[level_type::REPETITION] > 0;

constexpr int batch_size = 32;
constexpr int batch_size = cudf::detail::warp_size;
int cur_leaf_count = target_leaf_count;
while (s->error == 0 && s->nz_count < target_leaf_count &&
s->input_value_count < s->num_input_values) {
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/io/parquet/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,9 @@ struct SchemaElement {
bool operator==(SchemaElement const& other) const
{
return type == other.type && converted_type == other.converted_type &&
type_length == other.type_length && repetition_type == other.repetition_type &&
name == other.name && num_children == other.num_children &&
decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision &&
field_id == other.field_id;
type_length == other.type_length && name == other.name &&
num_children == other.num_children && decimal_scale == other.decimal_scale &&
decimal_precision == other.decimal_precision && field_id == other.field_id;
}

// the parquet format is a little squishy when it comes to interpreting
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/io/parquet/parquet_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,8 @@ struct ColumnChunkDesc {
int32_t src_col_schema_,
column_chunk_info const* chunk_info_,
float list_bytes_per_row_est_,
bool strings_to_categorical_)
bool strings_to_categorical_,
int32_t src_file_idx_)
: compressed_data(compressed_data_),
compressed_size(compressed_size_),
num_values(num_values_),
Expand All @@ -419,7 +420,8 @@ struct ColumnChunkDesc {
src_col_schema(src_col_schema_),
h_chunk_info(chunk_info_),
list_bytes_per_row_est(list_bytes_per_row_est_),
is_strings_to_cat(strings_to_categorical_)
is_strings_to_cat(strings_to_categorical_),
src_file_idx(src_file_idx_)

{
}
Expand Down Expand Up @@ -456,6 +458,7 @@ struct ColumnChunkDesc {

bool is_strings_to_cat{}; // convert strings to hashes
bool is_large_string_col{}; // `true` if string data uses 64-bit offsets
int32_t src_file_idx{}; // source file index
};

/**
Expand Down
Loading

0 comments on commit 4851db3

Please sign in to comment.