Commit
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann authored Sep 18, 2024
2 parents ccfc6f6 + 2a9a8f5 commit 8fbb1d0
Showing 90 changed files with 3,913 additions and 1,252 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr.yaml
@@ -52,7 +52,7 @@ jobs:
steps:
- name: Get PR info
id: get-pr-info
uses: rapidsai/shared-actions/get-pr-info@main
uses: nv-gha-runners/get-pr-info@main
- name: Checkout code repo
uses: actions/checkout@v4
with:
1 change: 0 additions & 1 deletion .gitignore
@@ -80,7 +80,6 @@ build/
cpp/build/
cpp/examples/*/install/
cpp/examples/*/build/
cpp/examples/tpch/datagen/datafusion
cpp/include/cudf/ipc_generated/*.h
cpp/thirdparty/googletest/

2 changes: 1 addition & 1 deletion README.md
@@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

### Conda

cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

```bash
conda install -c rapidsai -c conda-forge -c nvidia \
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -385,6 +385,7 @@ add_library(
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
src/io/json/parser_features.cpp
src/io/json/process_tokens.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
28 changes: 18 additions & 10 deletions cpp/benchmarks/CMakeLists.txt
@@ -36,25 +36,25 @@ target_include_directories(
)

add_library(
tpch_data_generator STATIC
common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
common/tpch_data_generator/random_column_generator.cu
ndsh_data_generator STATIC
common/ndsh_data_generator/ndsh_data_generator.cpp common/ndsh_data_generator/table_helpers.cpp
common/ndsh_data_generator/random_column_generator.cu
)
target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)
target_compile_features(ndsh_data_generator PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(
tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
ndsh_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
)

target_link_libraries(
tpch_data_generator
ndsh_data_generator
PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)

target_include_directories(
tpch_data_generator
ndsh_data_generator
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)
@@ -127,8 +127,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::nvbench
$<TARGET_NAME_IF_EXISTS:conda_env>
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen
nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env>
)
install(
TARGETS ${CMAKE_BENCH_NAME}
@@ -175,6 +175,14 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp)
# * transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)

# ##################################################################################################
# * nds-h benchmark --------------------------------------------------------------------------------
ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp)

# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
@@ -329,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
@@ -14,7 +14,7 @@
* limitations under the License.
*/

#include "tpch_data_generator.hpp"
#include "ndsh_data_generator.hpp"

#include "random_column_generator.hpp"
#include "table_helpers.hpp"
@@ -435,46 +435,37 @@ std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& o
columns.push_back(std::move(l_quantity));
columns.push_back(std::move(l_discount));
columns.push_back(std::move(l_tax));
columns.push_back(std::move(l_returnflag));
columns.push_back(std::move(l_linestatus));
columns.push_back(std::move(l_shipdate_ts));
columns.push_back(std::move(l_commitdate_ts));
columns.push_back(std::move(l_receiptdate_ts));
columns.push_back(std::move(l_returnflag));
columns.push_back(std::move(l_linestatus));
columns.push_back(std::move(l_shipinstruct));
columns.push_back(std::move(l_shipmode));
columns.push_back(std::move(l_comment));
return std::make_unique<cudf::table>(std::move(columns));
}

std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem,
/**
* @brief Generate the part of the `orders` table dependent on the `lineitem` table
*
* @param lineitem_partial The partially generated `lineitem` table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem_partial,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto const l_linestatus_mask = lineitem.column(0);
auto const l_orderkey = lineitem.column(1);
auto const l_discount = lineitem.column(6);
auto const l_tax = lineitem.column(7);
auto const l_extendedprice = lineitem.column(16);
auto const l_linestatus_mask = lineitem_partial.column(0);
auto const l_orderkey = lineitem_partial.column(1);
auto const l_extendedprice = lineitem_partial.column(6);
auto const l_discount = lineitem_partial.column(7);
auto const l_tax = lineitem_partial.column(8);

std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns;

// Generate the `o_totalprice` column
// We calculate the `charge` column, which is a function of `l_extendedprice`,
// `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
auto o_totalprice = [&]() {
auto const keys = cudf::table_view({l_orderkey});
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.push_back(cudf::groupby::aggregation_request());
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
requests[0].values = l_charge->view();
auto agg_result = gb.aggregate(requests);
return cudf::round(agg_result.second[0].results[0]->view(), 2);
}();
orders_dependent_columns.push_back(std::move(o_totalprice));

// Generate the `o_orderstatus` column
auto o_orderstatus = [&]() {
auto const keys = cudf::table_view({l_orderkey});
@@ -529,6 +520,22 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l
cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view());
}();
orders_dependent_columns.push_back(std::move(o_orderstatus));

// Generate the `o_totalprice` column
// We calculate the `charge` column, which is a function of `l_extendedprice`,
// `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
auto o_totalprice = [&]() {
auto const keys = cudf::table_view({l_orderkey});
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.push_back(cudf::groupby::aggregation_request());
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
requests[0].values = l_charge->view();
auto agg_result = gb.aggregate(requests);
return cudf::round(agg_result.second[0].results[0]->view(), 2);
}();
orders_dependent_columns.push_back(std::move(o_totalprice));
return std::make_unique<cudf::table>(std::move(orders_dependent_columns));
}
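The comment above describes `charge` only as a function of `l_extendedprice`, `l_tax`, and `l_discount`. In the TPC-H/NDS-H Q1 definition that this generator mirrors, the per-row term is `extendedprice * (1 - discount) * (1 + tax)`; a minimal scalar sketch of what `calculate_charge` presumably computes column-wise (an inference from the benchmark's origin, not code shown in this diff):

```cpp
// Hypothetical scalar form of the Q1 "charge" term; the real
// calculate_charge() in this generator operates on whole cudf columns.
double charge(double extendedprice, double discount, double tax)
{
  return extendedprice * (1.0 - discount) * (1.0 + tax);
}
```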

@@ -730,9 +737,7 @@ generate_orders_lineitem_part(double scale_factor,
// Generate the `part` table
auto part = generate_part(scale_factor, stream, mr);

// Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column,
// add the column to the `lineitem` table, and write the `lineitem` table to a parquet file

// Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column
auto l_extendedprice = [&]() {
auto const left = cudf::table_view(
{lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()});
@@ -752,8 +757,9 @@ generate_orders_lineitem_part(double scale_factor,
return cudf::round(col->view(), 2);
}();

// Insert the `l_extendedprice` column into the partial columns of the `lineitem` table
auto lineitem_partial_columns = lineitem_partial->release();
lineitem_partial_columns.push_back(std::move(l_extendedprice));
lineitem_partial_columns.insert(lineitem_partial_columns.begin() + 6, std::move(l_extendedprice));
auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns));

// Generate the dependent columns of the `orders` table
Expand All @@ -762,7 +768,7 @@ generate_orders_lineitem_part(double scale_factor,

auto orders_independent_columns = orders_independent->release();
auto orders_dependent_columns = orders_dependent->release();
orders_independent_columns.insert(orders_independent_columns.end(),
orders_independent_columns.insert(orders_independent_columns.begin() + 2,
std::make_move_iterator(orders_dependent_columns.begin()),
std::make_move_iterator(orders_dependent_columns.end()));

14 changes: 7 additions & 7 deletions cpp/benchmarks/hashing/hash.cpp
@@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state)
state.add_global_memory_reads<nvbench::int64_t>(num_rows);
// add memory read from bitmasks
if (!no_nulls) {
state.add_global_memory_reads<nvbench::int8_t>(2 *
state.add_global_memory_reads<nvbench::int8_t>(2L *
cudf::bitmask_allocation_size_bytes(num_rows));
}
// memory written depends on used hash
@@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state)
});
} else if (hash_name == "md5") {
// md5 creates a 32-byte string
state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); });
} else if (hash_name == "sha1") {
// sha1 creates a 40-byte string
state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); });
} else if (hash_name == "sha224") {
// sha224 creates a 56-byte string
state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); });
} else if (hash_name == "sha256") {
// sha256 creates a 64-byte string
state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); });
} else if (hash_name == "sha384") {
// sha384 creates a 96-byte string
state.add_global_memory_writes<nvbench::int8_t>(96 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); });
} else if (hash_name == "sha512") {
// sha512 creates a 128-byte string
state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); });
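The widened literals in this hunk (for example `32 * num_rows` becoming `32L * num_rows`) read as a guard against 32-bit overflow when `num_rows` is large; a minimal sketch of the hazard, assuming `num_rows` has cudf's 32-bit `size_type` (an illustration, not code from this commit):

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  // Hypothetical row count; cudf::size_type is a 32-bit signed int.
  std::int32_t num_rows = 100'000'000;

  // 128 * num_rows would multiply in 32-bit int and overflow (undefined
  // behaviour), since 12'800'000'000 exceeds INT32_MAX (about 2.1e9).
  // Promoting one operand to long, as the benchmark now does with 128L,
  // performs the multiplication in 64 bits on the LP64 Linux targets
  // cuDF builds for:
  std::int64_t bytes_written = 128L * num_rows;  // 12'800'000'000

  std::cout << bytes_written << " bytes written\n";
  return 0;
}
```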
87 changes: 85 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64;
void parquet_read_common(cudf::size_type num_rows_to_read,
cudf::size_type num_cols_to_read,
cuio_source_sink_pair& source_sink,
nvbench::state& state)
nvbench::state& state,
size_t table_data_size = data_size)
{
cudf::io::parquet_reader_options read_opts =
cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
@@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
@@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::e
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <data_type DataType>
void BM_parquet_read_wide_tables(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));

auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols"));
auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = io_type::DEVICE_BUFFER;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, n_col),
table_size_bytes{data_size_bytes},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(cudf::io::compression_type::NONE);
cudf::io::write_parquet(write_opts);
return view.num_rows();
}();

parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
}

void BM_parquet_read_wide_tables_mixed(nvbench::state& state)
{
auto const d_type = []() {
auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL));
auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT));
d_type1.reserve(d_type1.size() + d_type2.size());
std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1));
return d_type1;
}();

auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols"));
auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = io_type::DEVICE_BUFFER;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, n_col),
table_size_bytes{data_size_bytes},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(cudf::io::compression_type::NONE);
cudf::io::write_parquet(write_opts);
return view.num_rows();
}();

parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
}

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::DECIMAL,
@@ -272,6 +337,24 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("num_string_cols", {1, 2, 3});

using d_type_list_wide_table = nvbench::enum_type_list<data_type::DECIMAL, data_type::STRING>;
NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table))
.set_name("parquet_read_wide_tables")
.set_min_samples(4)
.set_type_axes_names({"data_type"})
.add_int64_axis("data_size_mb", {1024, 2048, 4096})
.add_int64_axis("num_cols", {256, 512, 1024})
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed)
.set_name("parquet_read_wide_tables_mixed")
.set_min_samples(4)
.add_int64_axis("data_size_mb", {1024, 2048, 4096})
.add_int64_axis("num_cols", {256, 512, 1024})
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

// a benchmark for structs that only contain fixed-width types
using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))