Merge branch 'branch-24.10' into test
galipremsagar authored Sep 18, 2024
2 parents 1f1f77c + 44a9c10 commit d959ada
Showing 26 changed files with 1,052 additions and 52 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
- text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+ text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
87 changes: 85 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64;
void parquet_read_common(cudf::size_type num_rows_to_read,
cudf::size_type num_cols_to_read,
cuio_source_sink_pair& source_sink,
- nvbench::state& state)
+ nvbench::state& state,
+ size_t table_data_size = data_size)
{
cudf::io::parquet_reader_options read_opts =
cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
@@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
- state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+ state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
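(The new `table_data_size` parameter defaults to the existing `data_size` constant, so current callers are unchanged; the wide-table benchmarks below pass the generated table's size explicitly so the reported `bytes_per_second` reflects the data actually read.)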
@@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::e
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <data_type DataType>
void BM_parquet_read_wide_tables(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));

auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols"));
auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = io_type::DEVICE_BUFFER;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, n_col),
table_size_bytes{data_size_bytes},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(cudf::io::compression_type::NONE);
cudf::io::write_parquet(write_opts);
return view.num_rows();
}();

parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
}

void BM_parquet_read_wide_tables_mixed(nvbench::state& state)
{
auto const d_type = []() {
auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL));
auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT));
d_type1.reserve(d_type1.size() + d_type2.size());
std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1));
return d_type1;
}();

auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols"));
auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = io_type::DEVICE_BUFFER;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, n_col),
table_size_bytes{data_size_bytes},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(cudf::io::compression_type::NONE);
cudf::io::write_parquet(write_opts);
return view.num_rows();
}();

parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
}

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::DECIMAL,
@@ -272,6 +337,24 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("num_string_cols", {1, 2, 3});

using d_type_list_wide_table = nvbench::enum_type_list<data_type::DECIMAL, data_type::STRING>;
NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table))
.set_name("parquet_read_wide_tables")
.set_min_samples(4)
.set_type_axes_names({"data_type"})
.add_int64_axis("data_size_mb", {1024, 2048, 4096})
.add_int64_axis("num_cols", {256, 512, 1024})
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed)
.set_name("parquet_read_wide_tables_mixed")
.set_min_samples(4)
.add_int64_axis("data_size_mb", {1024, 2048, 4096})
.add_int64_axis("num_cols", {256, 512, 1024})
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

// a benchmark for structs that only contain fixed-width types
using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
36 changes: 24 additions & 12 deletions cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state)
}

std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
- nvbench::state& state, std::vector<cudf::type_id> const& d_types)
+ nvbench::state& state, std::vector<cudf::type_id> const& d_types, io_type io_source_type)
{
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
@@ -63,7 +63,7 @@ std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
size_t total_file_size = 0;

for (size_t i = 0; i < num_files; ++i) {
- cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};
+ cuio_source_sink_pair source_sink{io_source_type};

auto const tbl = create_random_table(
cycle_dtypes(d_types, num_cols),
@@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
{
size_t const data_size = state.get_int64("total_data_size");
auto const num_threads = state.get_int64("num_threads");
+ auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));

auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
BS::thread_pool threads(num_threads);

- auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+ auto [source_sink_vector, total_file_size, num_files] =
+   write_file_data(state, d_types, source_type);
std::vector<cudf::io::source_info> source_info_vector;
std::transform(source_sink_vector.begin(),
source_sink_vector.end(),
@@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
auto const num_threads = state.get_int64("num_threads");
size_t const input_limit = state.get_int64("input_limit");
size_t const output_limit = state.get_int64("output_limit");
+ auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));

auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
BS::thread_pool threads(num_threads);
- auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+ auto [source_sink_vector, total_file_size, num_files] =
+   write_file_data(state, d_types, source_type);
std::vector<cudf::io::source_info> source_info_vector;
std::transform(source_sink_vector.begin(),
source_sink_vector.end(),
@@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed)
.add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
.add_int64_axis("num_threads", {1, 2, 4, 8})
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});
.add_int64_axis("run_length", {8})
.add_string_axis("io_type", {"PINNED_BUFFER"});

NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
.set_name("parquet_multithreaded_read_decode_fixed_width")
@@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
.add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
.add_int64_axis("num_threads", {1, 2, 4, 8})
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});
.add_int64_axis("run_length", {8})
.add_string_axis("io_type", {"PINNED_BUFFER"});

NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
.set_name("parquet_multithreaded_read_decode_string")
@@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
.add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
.add_int64_axis("num_threads", {1, 2, 4, 8})
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});
.add_int64_axis("run_length", {8})
.add_string_axis("io_type", {"PINNED_BUFFER"});

NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
.set_name("parquet_multithreaded_read_decode_list")
@@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
.add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
.add_int64_axis("num_threads", {1, 2, 4, 8})
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});
.add_int64_axis("run_length", {8})
.add_string_axis("io_type", {"PINNED_BUFFER"});

// mixed data types: fixed width, strings
NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
@@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});
.add_int64_axis("output_limit", {640 * 1024 * 1024})
.add_string_axis("io_type", {"PINNED_BUFFER"});

NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
.set_name("parquet_multithreaded_read_decode_chunked_fixed_width")
@@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});
.add_int64_axis("output_limit", {640 * 1024 * 1024})
.add_string_axis("io_type", {"PINNED_BUFFER"});

NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
.set_name("parquet_multithreaded_read_decode_chunked_string")
@@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});
.add_int64_axis("output_limit", {640 * 1024 * 1024})
.add_string_axis("io_type", {"PINNED_BUFFER"});

NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
.set_name("parquet_multithreaded_read_decode_chunked_list")
@@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});
.add_int64_axis("output_limit", {640 * 1024 * 1024})
.add_string_axis("io_type", {"PINNED_BUFFER"});
77 changes: 77 additions & 0 deletions cpp/benchmarks/text/word_minhash.cpp
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/minhash.hpp>

#include <rmm/device_buffer.hpp>

#include <nvbench/nvbench.cuh>

static void bench_word_minhash(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
auto const base64 = state.get_int64("hash_type") == 64;

data_profile const strings_profile =
data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
auto strings_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);

auto const num_offsets = (num_rows / row_width) + 1;
auto offsets = cudf::sequence(num_offsets,
cudf::numeric_scalar<cudf::size_type>(0),
cudf::numeric_scalar<cudf::size_type>(row_width));

auto source = cudf::make_lists_column(num_offsets - 1,
std::move(offsets),
std::move(strings_table->release().front()),
0,
rmm::device_buffer{});

data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256);
auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
auto seeds = seeds_table->get_column(0);

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
state.add_global_memory_writes<nvbench::int32_t>(num_rows); // outputs are hashes

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
: nvtext::word_minhash(source->view(), seeds.view());
});
}

NVBENCH_BENCH(bench_word_minhash)
.set_name("word_minhash")
.add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
.add_int64_axis("row_width", {10, 100, 1000})
.add_int64_axis("seed_count", {2, 25})
.add_int64_axis("hash_type", {32, 64});
6 changes: 6 additions & 0 deletions cpp/doxygen/regex.md
@@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions.

**Note:** The alternation character is the pipe character `|`, not the character included in the tables on this page; there is an issue with including the pipe character inside the table markdown that is rendered by doxygen.

By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) flag increases the set of line break characters to include:
- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`)
- Line separator (Unicode: `2028`, UTF-8: `E280A8`)
- Next line (Unicode: `0085`, UTF-8: `C285`)
- Carriage return (Unicode: `000D`, UTF-8: `0D`)
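For illustration, a minimal sketch of enabling the new flag through the public strings API (not part of this commit; it assumes the `regex_program`/`contains_re` interfaces, and `ends_with_word` is a hypothetical helper name):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Flags each row containing a word that sits immediately before a line break
// (or at the end of the string). MULTILINE makes '$' honor line breaks;
// EXT_NEWLINE widens the line-break set to NEL/LS/PS/CR in addition to '\n'.
std::unique_ptr<cudf::column> ends_with_word(cudf::strings_column_view const& input)
{
  using cudf::strings::regex_flags;
  auto const flags =
    static_cast<regex_flags>(regex_flags::MULTILINE | regex_flags::EXT_NEWLINE);
  auto const prog = cudf::strings::regex_program::create("\\w+$", flags);
  return cudf::strings::contains_re(input, *prog);
}
```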

**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
- Unmatched paired special characters like `()`, `[]`, and `{}`.
20 changes: 16 additions & 4 deletions cpp/include/cudf/strings/regex/flags.hpp
@@ -35,10 +35,11 @@ namespace strings {
* and to match the Python flag values.
*/
enum regex_flags : uint32_t {
- DEFAULT = 0, ///< default
- MULTILINE = 8, ///< the '^' and '$' honor new-line characters
- DOTALL = 16, ///< the '.' matching includes new-line characters
- ASCII = 256 ///< use only ASCII when matching built-in character classes
+ DEFAULT = 0, ///< default
+ MULTILINE = 8, ///< the '^' and '$' honor new-line characters
+ DOTALL = 16, ///< the '.' matching includes new-line characters
+ ASCII = 256, ///< use only ASCII when matching built-in character classes
+ EXT_NEWLINE = 512 ///< new-line matches extended characters
};

/**
@@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f)
return (f & regex_flags::ASCII) == regex_flags::ASCII;
}

/**
* @brief Returns true if the given flags contain EXT_NEWLINE
*
* @param f Regex flags to check
* @return true if `f` includes EXT_NEWLINE
*/
constexpr bool is_ext_newline(regex_flags const f)
{
return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE;
}

/**
* @brief Capture groups setting
*
11 changes: 8 additions & 3 deletions cpp/include/cudf/strings/string_view.cuh
@@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper

__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
{
- if (byte_pos > 0)
-   while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
-     ;
+ if (byte_pos > 0) {
+   if (byte_pos == char_pos) {
+     --byte_pos;
+   } else {
+     while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
+       ;
+   }
+ }
--char_pos;
return *this;
}
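The added branch is a fast path: when `byte_pos == char_pos`, every character before the iterator is a single-byte character, so the iterator can step back exactly one byte; otherwise it scans backward past UTF-8 continuation bytes (for which `bytes_in_utf8_byte` returns 0) until it reaches a lead byte, as before.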