diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ff431c7f260..89717b6b997 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -17,11 +17,20 @@ find_package(Threads REQUIRED)
 add_library(cudf_datagen STATIC common/generate_input.cu)
 target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17)
 
+# Standalone TPC-H table generator; an executable has no consumers, so its requirements are PRIVATE
+add_executable(cudf_tpch_datagen common/cudf_datagen/dbgen.cu)
+target_compile_features(cudf_tpch_datagen PRIVATE cxx_std_17 cuda_std_17)
+
 target_compile_options(
   cudf_datagen PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
                       "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
 )
 
+target_compile_options(
+  cudf_tpch_datagen PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
+)
+
 target_link_libraries(
   cudf_datagen
   PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf
@@ -35,6 +44,19 @@ target_include_directories(
          "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
 )
 
+target_link_libraries(
+  cudf_tpch_datagen
+  PRIVATE GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf
+          cudftestutil nvtx3::nvtx3-cpp
+  PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
+)
+
+target_include_directories(
+  cudf_tpch_datagen
+  PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
+          "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
+)
+
 # ##################################################################################################
 # * compiler function -----------------------------------------------------------------------------
 
diff --git a/cpp/benchmarks/common/cudf_datagen/dbgen.cu b/cpp/benchmarks/common/cudf_datagen/dbgen.cu
new file mode 100644
index 00000000000..1875c83dc29
--- /dev/null
+++ b/cpp/benchmarks/common/cudf_datagen/dbgen.cu
@@ -0,0 +1,746 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "schema.hpp"
+#include "utils.hpp"
+#include "vocab.hpp"
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/convert/convert_integers.hpp>
+#include <cudf/strings/padding.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/random.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <cerrno>
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace {
+
+/**
+ * @brief Return a different seed on every call
+ *
+ * The thrust engines used below are otherwise default-seeded, which makes two columns generated
+ * with the same bounds contain exactly the same values (e.g. all five words of `p_name`).
+ * The sequence of seeds is fixed, so the generated data is still reproducible between runs.
+ * Not thread-safe; the generator is driven from a single host thread.
+ */
+unsigned int next_seed()
+{
+  static unsigned int seed = thrust::default_random_engine::default_seed;
+  return seed++;
+}
+
+/**
+ * @brief Compute `rows_per_scale_unit * scale_factor` as a `cudf::size_type`
+ *
+ * @param rows_per_scale_unit Positive number of rows the table has at scale factor 1
+ * @param scale_factor The requested scale factor
+ * @throws std::invalid_argument if the scale factor is negative or the row count does not fit
+ * into `cudf::size_type`
+ */
+cudf::size_type scaled_row_count(int64_t rows_per_scale_unit, int64_t scale_factor)
+{
+  constexpr int64_t max_rows = std::numeric_limits<cudf::size_type>::max();
+  if (scale_factor < 0 || scale_factor > max_rows / rows_per_scale_unit) {
+    throw std::invalid_argument("scale factor " + std::to_string(scale_factor) +
+                                " is out of range for a table with " +
+                                std::to_string(rows_per_scale_unit) + " rows per scale unit");
+  }
+  return static_cast<cudf::size_type>(rows_per_scale_unit * scale_factor);
+}
+
+}  // namespace
+
+/**
+ * @brief Functor filling one string's byte range `[begin, end)` of `chars` with random characters
+ *
+ * Values 32..126 are written as single ASCII bytes; values 127..137 are written as the two-byte
+ * sequence `0xC4` followed by `value + 1`. The last byte of a string is always ASCII.
+ */
+struct generate_random_string {
+  char* chars;
+  thrust::default_random_engine engine;
+  thrust::uniform_int_distribution<unsigned char> char_dist;
+
+  __host__ __device__ generate_random_string(
+    char* c, unsigned int seed = thrust::default_random_engine::default_seed)
+    : chars(c), engine(seed), char_dist(32, 137)
+  {
+  }
+
+  __host__ __device__ void operator()(thrust::tuple<int64_t, int64_t> str_begin_end)
+  {
+    auto begin = thrust::get<0>(str_begin_end);
+    auto end   = thrust::get<1>(str_begin_end);
+    // Each invocation works on its own copy of the engine; skipping `begin` values
+    // de-correlates the strings from each other.
+    engine.discard(begin);
+    for (auto i = begin; i < end; ++i) {
+      auto ch = char_dist(engine);
+      if (i == end - 1 && ch >= '\x7F') ch = ' ';  // last element ASCII only.
+      if (ch >= '\x7F')                            // x7F is at the top edge of ASCII
+        chars[i++] = '\xC4';                       // these characters are assigned two bytes
+      chars[i] = static_cast<char>(ch + (ch >= '\x7F'));
+    }
+  }
+};
+
+/**
+ * @brief Functor returning a uniformly distributed value in `[lower, upper]` for a row index
+ *
+ * Integral `T` uses an integer distribution, any other `T` a real distribution. The result
+ * depends only on `seed` and the row index.
+ */
+template <typename T>
+struct generate_random_value {
+  T lower;
+  T upper;
+  unsigned int seed;
+
+  __host__ __device__ generate_random_value(
+    T lower, T upper, unsigned int seed = thrust::default_random_engine::default_seed)
+    : lower(lower), upper(upper), seed(seed)
+  {
+  }
+
+  __host__ __device__ T operator()(const int64_t idx) const
+  {
+    thrust::default_random_engine engine(seed);
+    engine.discard(idx);
+    // `if constexpr` so that only the distribution matching `T` is instantiated
+    if constexpr (std::is_integral_v<T>) {
+      thrust::uniform_int_distribution<T> dist(lower, upper);
+      return dist(engine);
+    } else {
+      thrust::uniform_real_distribution<T> dist(lower, upper);
+      return dist(engine);
+    }
+  }
+};
+
+/**
+ * @brief Generate a column of random strings
+ *
+ * @param lower The minimum length of a string in bytes
+ * @param upper The maximum length of a string in bytes
+ * @param num_rows The number of rows in the column
+ */
+std::unique_ptr<cudf::column> gen_rand_str_col(int64_t lower,
+                                               int64_t upper,
+                                               cudf::size_type num_rows)
+{
+  auto const stream = cudf::get_default_stream();
+  rmm::device_uvector<int64_t> offsets(num_rows + 1, stream);
+
+  // The first element will always be 0 since it is the offset of the first string.
+  int64_t const initial_offset{0};
+  offsets.set_element(0, initial_offset, stream);
+
+  // We generate the lengths of the strings randomly for each row and
+  // store them from the second element of the offsets vector.
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator(0),
+                    thrust::make_counting_iterator(num_rows),
+                    offsets.begin() + 1,
+                    generate_random_value<int64_t>(lower, upper, next_seed()));
+
+  // We then calculate the offsets by performing an inclusive scan on this
+  // vector.
+  thrust::inclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());
+
+  // The last element is the total length of all the strings combined using
+  // which we allocate the memory for the `chars` vector, that holds the
+  // randomly generated characters for the strings.
+  auto const total_length = offsets.back_element(stream);
+  rmm::device_uvector<char> chars(total_length, stream);
+
+  // We generate the strings in parallel into the `chars` vector using the
+  // offsets vector generated above.
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1),
+                     num_rows,
+                     generate_random_string(chars.data(), next_seed()));
+
+  return cudf::make_strings_column(
+    num_rows,
+    std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0),
+    chars.release(),
+    0,
+    rmm::device_buffer{});
+}
+
+/**
+ * @brief Generate a column of random numbers
+ *
+ * The column type is the cudf type matching `T` (INT64 for `int64_t`, FLOAT64 for `double`).
+ *
+ * @param lower The lower bound of the random numbers (inclusive)
+ * @param upper The upper bound of the random numbers
+ * @param count The number of rows in the column
+ */
+template <typename T>
+std::unique_ptr<cudf::column> gen_rand_num_col(T lower, T upper, cudf::size_type count)
+{
+  auto const stream = cudf::get_default_stream();
+  auto col          = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_to_id<T>()}, count, cudf::mask_state::UNALLOCATED, stream);
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator(0),
+                    thrust::make_counting_iterator(count),
+                    col->mutable_view().begin<T>(),
+                    generate_random_value<T>(lower, upper, next_seed()));
+  return col;
+}
+
+/**
+ * @brief Generate a primary key column
+ *
+ * @param start The starting value of the primary key
+ * @param num_rows The number of rows in the column
+ */
+std::unique_ptr<cudf::column> gen_primary_key_col(int64_t start, int64_t num_rows)
+{
+  auto const init = cudf::numeric_scalar<int64_t>(start);
+  auto const step = cudf::numeric_scalar<int64_t>(1);
+  return cudf::sequence(num_rows, init, step);
+}
+
+/**
+ * @brief Generate a column where all the rows have the same string value
+ *
+ * @param value The string value
+ * @param num_rows The length of the column
+ */
+std::unique_ptr<cudf::column> gen_repeat_str_col(std::string value, int64_t num_rows)
+{
+  // Built directly from a scalar: no intermediate column made from uninitialized memory
+  return cudf::make_column_from_scalar(cudf::string_scalar(value), num_rows);
+}
+
+/**
+ * @brief Generate a column where every row is a random pick from a set of strings
+ *
+ * @param string_set The strings to choose from; must not be empty
+ * @param num_rows The length of the column
+ * @throws std::invalid_argument if `string_set` is empty
+ */
+std::unique_ptr<cudf::column> gen_rand_str_col_from_set(std::vector<std::string> string_set,
+                                                        int64_t num_rows)
+{
+  if (string_set.empty()) {
+    throw std::invalid_argument("gen_rand_str_col_from_set: string_set must not be empty");
+  }
+
+  // Build a vocab table of random strings to choose from
+  auto const keys = gen_primary_key_col(0, string_set.size());
+  auto const values =
+    cudf::test::strings_column_wrapper(string_set.begin(), string_set.end()).release();
+  auto const vocab_table = cudf::table_view({keys->view(), values->view()});
+
+  // Build a single column table containing `num_rows` random numbers
+  auto const rand_keys       = gen_rand_num_col<int64_t>(0, string_set.size() - 1, num_rows);
+  auto const rand_keys_table = cudf::table_view({rand_keys->view()});
+
+  auto joined_table =
+    perform_left_join(rand_keys_table, vocab_table, {0}, {0}, cudf::null_equality::EQUAL);
+  // Joined columns are [random key, vocab key, vocab string]; move the string column out
+  // instead of deep-copying it.
+  return std::move(joined_table->release()[2]);
+}
+
+/**
+ * @brief Generate the subset of `lineitem` columns implemented so far
+ *
+ * NOTE: not called from `main` yet.
+ *
+ * @param scale_factor The scale factor to use
+ */
+void generate_lineitem(int64_t scale_factor)
+{
+  auto const num_rows = scaled_row_count(1500000, scale_factor);
+
+  // Generate the `l_partkey` column
+  auto const l_partkey = gen_rand_num_col<int64_t>(1, 200000 * scale_factor, num_rows);
+
+  // Generate the `l_quantity` column
+  auto const l_quantity = gen_rand_num_col<int64_t>(1, 50, num_rows);
+
+  // Generate the `l_discount` column
+  auto const l_discount = gen_rand_num_col<double>(0.0, 0.10, num_rows);
+
+  // Generate the `l_tax` column
+  auto const l_tax = gen_rand_num_col<double>(0.0, 0.08, num_rows);
+
+  // Generate the `l_comment` column
+  auto const l_comment = gen_rand_str_col(10, 43, num_rows);
+
+  // Generate the `l_shipinstruct` column
+  auto const l_shipinstruct = gen_rand_str_col_from_set(vocab_instructions, num_rows);
+
+  // Generate the `l_shipmode` column
+  auto const l_shipmode = gen_rand_str_col_from_set(vocab_modes, num_rows);
+
+  auto const lineitem = cudf::table_view({l_partkey->view(),
+                                          l_quantity->view(),
+                                          l_discount->view(),
+                                          l_tax->view(),
+                                          l_comment->view(),
+                                          l_shipinstruct->view(),
+                                          l_shipmode->view()});
+
+  // One name per column, in the same order as the table above
+  write_parquet(
+    lineitem,
+    "lineitem.parquet",
+    {"l_partkey", "l_quantity", "l_discount", "l_tax", "l_comment", "l_shipinstruct", "l_shipmode"});
+}
+
+/**
+ * @brief Generate the subset of `orders` columns implemented so far
+ *
+ * NOTE: not called from `main` yet.
+ *
+ * @param scale_factor The scale factor to use
+ */
+void generate_orders(int64_t scale_factor)
+{
+  auto const num_rows = scaled_row_count(1500000, scale_factor);
+
+  // Generate the `o_orderpriority` column
+  auto const o_orderpriority = gen_rand_str_col_from_set(vocab_priorities, num_rows);
+
+  // Generate the `o_shippriority` column, which is 0 in every row
+  auto const o_shippriority =
+    cudf::make_column_from_scalar(cudf::numeric_scalar<int64_t>(0), num_rows);
+
+  // Generate the `o_comment` column
+  auto const o_comment = gen_rand_str_col(19, 78, num_rows);
+
+  auto const orders =
+    cudf::table_view({o_orderpriority->view(), o_shippriority->view(), o_comment->view()});
+
+  write_parquet(orders, "orders.parquet", {"o_orderpriority", "o_shippriority", "o_comment"});
+}
+
+/**
+ * @brief Compute the `ps_suppkey` column
+ *
+ * Evaluates `(ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1` for every row, where
+ * `s = 10000 * scale_factor` and `i` is the row index modulo 4.
+ *
+ * @param ps_partkey The `ps_partkey` column
+ * @param scale_factor The scale factor to use
+ * @param num_rows The number of rows in `ps_partkey`
+ */
+std::unique_ptr<cudf::column> calc_ps_suppkey(cudf::column_view const& ps_partkey,
+                                              int64_t const& scale_factor,
+                                              int64_t const& num_rows)
+{
+  // Generating the `s` col
+  auto const s =
+    cudf::make_column_from_scalar(cudf::numeric_scalar<int64_t>(10000 * scale_factor), num_rows);
+
+  // Generating the `i` col
+  auto const seq = gen_primary_key_col(0, num_rows);
+  auto const i   = cudf::binary_operation(seq->view(),
+                                        cudf::numeric_scalar<int64_t>(4),
+                                        cudf::binary_operator::MOD,
+                                        cudf::data_type{cudf::type_id::INT64});
+
+  // Create a table view out of `ps_partkey`, `s`, and `i`
+  auto const table = cudf::table_view({ps_partkey, s->view(), i->view()});
+
+  // Create the AST expression. The scalars are deliberately non-const lvalues: the literals
+  // refer to them, and every expression refers to its operands, so all of them must stay
+  // alive until `compute_column` has run.
+  auto scalar_1        = cudf::numeric_scalar<int64_t>(1);
+  auto scalar_4        = cudf::numeric_scalar<int64_t>(4);
+  auto const literal_1 = cudf::ast::literal(scalar_1);
+  auto const literal_4 = cudf::ast::literal(scalar_4);
+
+  auto const ps_partkey_col_ref = cudf::ast::column_reference(0);
+  auto const s_col_ref          = cudf::ast::column_reference(1);
+  auto const i_col_ref          = cudf::ast::column_reference(2);
+
+  // (int)(ps_partkey - 1)/s
+  auto const expr_a =
+    cudf::ast::operation(cudf::ast::ast_operator::SUB, ps_partkey_col_ref, literal_1);
+  auto const expr_b = cudf::ast::operation(cudf::ast::ast_operator::DIV, expr_a, s_col_ref);
+  auto const expr_b_casted = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, expr_b);
+
+  // s/4
+  auto const expr_c = cudf::ast::operation(cudf::ast::ast_operator::DIV, s_col_ref, literal_4);
+
+  // (s/4 + (int)(ps_partkey - 1)/s)
+  auto const expr_d = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_c, expr_b_casted);
+
+  // (i * (s/4 + (int)(ps_partkey - 1)/s))
+  auto const expr_e = cudf::ast::operation(cudf::ast::ast_operator::MUL, i_col_ref, expr_d);
+
+  // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s)))
+  auto const expr_f =
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, ps_partkey_col_ref, expr_e);
+
+  // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s
+  auto const expr_g = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_f, s_col_ref);
+
+  // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1
+  auto const final_expr = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_g, literal_1);
+
+  // Execute the AST expression
+  return cudf::compute_column(table, final_expr);
+}
+
+/**
+ * @brief Generate the `partsupp` table and write it to `partsupp.parquet`
+ *
+ * @param scale_factor The scale factor to use
+ * @param stream The CUDA stream (currently unused; all work runs on the default stream)
+ * @param mr The memory resource (currently unused; the current device resource is used)
+ */
+void generate_partsupp(int64_t const& scale_factor,
+                       rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const num_rows_part = scaled_row_count(200000, scale_factor);
+  auto const num_rows      = scaled_row_count(800000, scale_factor);
+
+  // Generate the `ps_partkey` column: every part key repeated 4 times
+  auto const p_partkey = gen_primary_key_col(1, num_rows_part);
+  auto const rep_table = cudf::repeat(cudf::table_view({p_partkey->view()}), 4);
+  // A view is enough here; copying the column out of the table would duplicate its data
+  auto const ps_partkey = rep_table->view().column(0);
+
+  // Generate the `ps_suppkey` column
+  auto const ps_suppkey = calc_ps_suppkey(ps_partkey, scale_factor, num_rows);
+
+  // Generate the `ps_availqty` column
+  auto const ps_availqty = gen_rand_num_col<int64_t>(1, 9999, num_rows);
+
+  // Generate the `ps_supplycost` column
+  auto const ps_supplycost = gen_rand_num_col<double>(1.0, 1000.0, num_rows);
+
+  // Generate the `ps_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto const ps_comment = gen_rand_str_col(49, 198, num_rows);
+
+  auto const partsupp = cudf::table_view({ps_partkey,
+                                          ps_suppkey->view(),
+                                          ps_availqty->view(),
+                                          ps_supplycost->view(),
+                                          ps_comment->view()});
+  write_parquet(partsupp, "partsupp.parquet", schema_partsupp);
+}
+
+/**
+ * @brief Compute the `p_retailprice` column
+ *
+ * Evaluates `(90000 + ((p_partkey / 10) % 20001) + 100 * (p_partkey % 1000)) / 100`, where only
+ * the final division is a floating point division.
+ *
+ * @param p_partkey The `p_partkey` column
+ */
+std::unique_ptr<cudf::column> calc_p_retailprice(cudf::column_view const& p_partkey)
+{
+  auto const int64_type   = cudf::data_type{cudf::type_id::INT64};
+  auto const float64_type = cudf::data_type{cudf::type_id::FLOAT64};
+
+  // p_partkey / 10 (integer division)
+  auto const val_a = cudf::binary_operation(
+    p_partkey, cudf::numeric_scalar<int64_t>(10), cudf::binary_operator::DIV, int64_type);
+
+  // (p_partkey / 10) % 20001
+  auto const val_b = cudf::binary_operation(
+    val_a->view(), cudf::numeric_scalar<int64_t>(20001), cudf::binary_operator::MOD, int64_type);
+
+  // p_partkey % 1000
+  auto const val_c = cudf::binary_operation(
+    p_partkey, cudf::numeric_scalar<int64_t>(1000), cudf::binary_operator::MOD, int64_type);
+
+  // 100 * (p_partkey % 1000)
+  auto const val_d = cudf::binary_operation(
+    val_c->view(), cudf::numeric_scalar<int64_t>(100), cudf::binary_operator::MUL, int64_type);
+
+  // 90000 + val_b + val_d
+  auto const val_e = cudf::binary_operation(
+    val_b->view(), cudf::numeric_scalar<int64_t>(90000), cudf::binary_operator::ADD, int64_type);
+  auto const val_f =
+    cudf::binary_operation(val_e->view(), val_d->view(), cudf::binary_operator::ADD, int64_type);
+
+  // (90000 + val_b + val_d) / 100
+  return cudf::binary_operation(
+    val_f->view(), cudf::numeric_scalar<int64_t>(100), cudf::binary_operator::DIV, float64_type);
+}
+
+/**
+ * @brief Generate the `part` table and write it to `part.parquet`
+ *
+ * @param scale_factor The scale factor to use
+ * @param stream The CUDA stream (currently unused; all work runs on the default stream)
+ * @param mr The memory resource (currently unused; the current device resource is used)
+ */
+void generate_part(int64_t const& scale_factor,
+                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const num_rows = scaled_row_count(200000, scale_factor);
+
+  // Generate the `p_partkey` column
+  auto const p_partkey = gen_primary_key_col(1, num_rows);
+
+  // Generate the `p_name` column: five independently drawn words separated by spaces
+  auto const p_name_a     = gen_rand_str_col_from_set(vocab_p_name, num_rows);
+  auto const p_name_b     = gen_rand_str_col_from_set(vocab_p_name, num_rows);
+  auto const p_name_c     = gen_rand_str_col_from_set(vocab_p_name, num_rows);
+  auto const p_name_d     = gen_rand_str_col_from_set(vocab_p_name, num_rows);
+  auto const p_name_e     = gen_rand_str_col_from_set(vocab_p_name, num_rows);
+  auto const p_name_parts = cudf::table_view(
+    {p_name_a->view(), p_name_b->view(), p_name_c->view(), p_name_d->view(), p_name_e->view()});
+  auto const p_name = cudf::strings::concatenate(p_name_parts, cudf::string_scalar(" "));
+
+  // Generate the `p_mfgr` column: "Manufacturer#" followed by M
+  auto const mfgr_repeat         = gen_repeat_str_col("Manufacturer#", num_rows);
+  auto const random_values_m     = gen_rand_num_col<int64_t>(1, 5, num_rows);
+  auto const random_values_m_str = cudf::strings::from_integers(random_values_m->view());
+  auto const p_mfgr              = cudf::strings::concatenate(
+    cudf::table_view({mfgr_repeat->view(), random_values_m_str->view()}));
+
+  // Generate the `p_brand` column: "Brand#" followed by the row's M (as in `p_mfgr`) and N
+  auto const brand_repeat        = gen_repeat_str_col("Brand#", num_rows);
+  auto const random_values_n     = gen_rand_num_col<int64_t>(1, 5, num_rows);
+  auto const random_values_n_str = cudf::strings::from_integers(random_values_n->view());
+  auto const p_brand             = cudf::strings::concatenate(cudf::table_view(
+    {brand_repeat->view(), random_values_m_str->view(), random_values_n_str->view()}));
+
+  // Generate the `p_type` column
+  auto const p_type = gen_rand_str_col_from_set(gen_vocab_types(), num_rows);
+
+  // Generate the `p_size` column
+  auto const p_size = gen_rand_num_col<int64_t>(1, 50, num_rows);
+
+  // Generate the `p_container` column
+  auto const p_container = gen_rand_str_col_from_set(gen_vocab_containers(), num_rows);
+
+  // Generate the `p_retailprice` column
+  auto const p_retailprice = calc_p_retailprice(p_partkey->view());
+
+  // Generate the `p_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto const p_comment = gen_rand_str_col(5, 22, num_rows);
+
+  // Create the `part` table
+  auto const part = cudf::table_view({p_partkey->view(),
+                                      p_name->view(),
+                                      p_mfgr->view(),
+                                      p_brand->view(),
+                                      p_type->view(),
+                                      p_size->view(),
+                                      p_container->view(),
+                                      p_retailprice->view(),
+                                      p_comment->view()});
+
+  write_parquet(part, "part.parquet", schema_part);
+}
+
+/**
+ * @brief Generate the `nation` table and write it to `nation.parquet`
+ *
+ * @param scale_factor The scale factor (unused; the table always has 25 rows)
+ * @param stream The CUDA stream (currently unused; all work runs on the default stream)
+ * @param mr The memory resource (currently unused; the current device resource is used)
+ */
+void generate_nation(int64_t const& scale_factor,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  cudf::size_type const num_rows = 25;
+
+  // Generate the `n_nationkey` column
+  auto const n_nationkey = gen_primary_key_col(0, num_rows);
+
+  // Generate the `n_name` column
+  auto const n_name =
+    cudf::test::strings_column_wrapper(
+      {"ALGERIA", "ARGENTINA",  "BRAZIL", "CANADA",         "EGYPT",
+       "ETHIOPIA", "FRANCE",    "GERMANY", "INDIA",         "INDONESIA",
+       "IRAN",     "IRAQ",      "JAPAN",   "JORDAN",        "KENYA",
+       "MOROCCO",  "MOZAMBIQUE", "PERU",   "CHINA",         "ROMANIA",
+       "SAUDI ARABIA", "VIETNAM", "RUSSIA", "UNITED KINGDOM", "UNITED STATES"})
+      .release();
+
+  // Generate the `n_regionkey` column; entry k is the region of the k-th nation above
+  auto const n_regionkey =
+    cudf::test::fixed_width_column_wrapper<int64_t>(
+      {0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2, 4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1})
+      .release();
+
+  // Generate the `n_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto const n_comment = gen_rand_str_col(31, 114, num_rows);
+
+  // Create the `nation` table
+  auto const nation =
+    cudf::table_view({n_nationkey->view(), n_name->view(), n_regionkey->view(), n_comment->view()});
+  write_parquet(nation, "nation.parquet", schema_nation);
+}
+
+/**
+ * @brief Generate the `region` table and write it to `region.parquet`
+ *
+ * @param scale_factor The scale factor (unused; the table always has 5 rows)
+ * @param stream The CUDA stream (currently unused; all work runs on the default stream)
+ * @param mr The memory resource (currently unused; the current device resource is used)
+ */
+void generate_region(int64_t const& scale_factor,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  cudf::size_type const num_rows = 5;
+
+  // Generate the `r_regionkey` column
+  auto const r_regionkey = gen_primary_key_col(0, num_rows);
+
+  // Generate the `r_name` column
+  auto const r_name =
+    cudf::test::strings_column_wrapper({"AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST"})
+      .release();
+
+  // Generate the `r_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto const r_comment = gen_rand_str_col(31, 115, num_rows);
+
+  // Create the `region` table
+  auto const region = cudf::table_view({r_regionkey->view(), r_name->view(), r_comment->view()});
+  write_parquet(region, "region.parquet", schema_region);
+}
+
+/**
+ * @brief Generate the `customer` table and write it to `customer.parquet`
+ *
+ * @param scale_factor The scale factor to use
+ * @param stream The CUDA stream (currently unused; all work runs on the default stream)
+ * @param mr The memory resource (currently unused; the current device resource is used)
+ */
+void generate_customer(int64_t const& scale_factor,
+                       rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const num_rows = scaled_row_count(150000, scale_factor);
+
+  // Generate the `c_custkey` column
+  auto const c_custkey = gen_primary_key_col(1, num_rows);
+
+  // Generate the `c_name` column: "Customer#" followed by the key, zero-padded to 9 digits
+  auto const customer_repeat = gen_repeat_str_col("Customer#", num_rows);
+  auto const c_custkey_str   = cudf::strings::from_integers(c_custkey->view());
+  auto const c_custkey_str_padded =
+    cudf::strings::pad(c_custkey_str->view(), 9, cudf::strings::side_type::LEFT, "0");
+  auto const c_name = cudf::strings::concatenate(
+    cudf::table_view({customer_repeat->view(), c_custkey_str_padded->view()}));
+
+  // Generate the `c_address` column
+  // NOTE: This column is not compliant with clause 4.2.2.7 of the TPC-H specification
+  auto const c_address = gen_rand_str_col(10, 40, num_rows);
+
+  // Generate the `c_nationkey` column
+  auto const c_nationkey = gen_rand_num_col<int64_t>(0, 24, num_rows);
+
+  // Generate the `c_phone` column: four random numbers joined by "-"
+  auto const c_phone_a =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(10, 34, num_rows)->view());
+  auto const c_phone_b =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(100, 999, num_rows)->view());
+  auto const c_phone_c =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(100, 999, num_rows)->view());
+  auto const c_phone_d =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(1000, 9999, num_rows)->view());
+  auto const c_phone_parts =
+    cudf::table_view({c_phone_a->view(), c_phone_b->view(), c_phone_c->view(), c_phone_d->view()});
+  auto const c_phone = cudf::strings::concatenate(c_phone_parts, cudf::string_scalar("-"));
+
+  // Generate the `c_acctbal` column
+  auto const c_acctbal = gen_rand_num_col<double>(-999.99, 9999.99, num_rows);
+
+  // Generate the `c_mktsegment` column
+  auto const c_mktsegment = gen_rand_str_col_from_set(vocab_segments, num_rows);
+
+  // Generate the `c_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto const c_comment = gen_rand_str_col(29, 116, num_rows);
+
+  // Create the `customer` table
+  auto const customer = cudf::table_view({c_custkey->view(),
+                                          c_name->view(),
+                                          c_address->view(),
+                                          c_nationkey->view(),
+                                          c_phone->view(),
+                                          c_acctbal->view(),
+                                          c_mktsegment->view(),
+                                          c_comment->view()});
+  write_parquet(customer, "customer.parquet", schema_customer);
+}
+
+/**
+ * @brief Generate the `supplier` table and write it to `supplier.parquet`
+ *
+ * @param scale_factor The scale factor to use
+ * @param stream The CUDA stream (currently unused; all work runs on the default stream)
+ * @param mr The memory resource (currently unused; the current device resource is used)
+ */
+void generate_supplier(int64_t const& scale_factor,
+                       rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const num_rows = scaled_row_count(10000, scale_factor);
+
+  // Generate the `s_suppkey` column
+  auto const s_suppkey = gen_primary_key_col(1, num_rows);
+
+  // Generate the `s_name` column: "Supplier#" followed by the key, zero-padded to 9 digits
+  auto const supplier_repeat = gen_repeat_str_col("Supplier#", num_rows);
+  auto const s_suppkey_str   = cudf::strings::from_integers(s_suppkey->view());
+  auto const s_suppkey_str_padded =
+    cudf::strings::pad(s_suppkey_str->view(), 9, cudf::strings::side_type::LEFT, "0");
+  auto const s_name = cudf::strings::concatenate(
+    cudf::table_view({supplier_repeat->view(), s_suppkey_str_padded->view()}));
+
+  // Generate the `s_address` column
+  // NOTE: This column is not compliant with clause 4.2.2.7 of the TPC-H specification
+  auto const s_address = gen_rand_str_col(10, 40, num_rows);
+
+  // Generate the `s_nationkey` column
+  auto const s_nationkey = gen_rand_num_col<int64_t>(0, 24, num_rows);
+
+  // Generate the `s_phone` column: four random numbers joined by "-"
+  auto const s_phone_part_1 =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(10, 34, num_rows)->view());
+  auto const s_phone_part_2 =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(100, 999, num_rows)->view());
+  auto const s_phone_part_3 =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(100, 999, num_rows)->view());
+  auto const s_phone_part_4 =
+    cudf::strings::from_integers(gen_rand_num_col<int64_t>(1000, 9999, num_rows)->view());
+  auto const s_phone_parts = cudf::table_view({s_phone_part_1->view(),
+                                               s_phone_part_2->view(),
+                                               s_phone_part_3->view(),
+                                               s_phone_part_4->view()});
+  auto const s_phone = cudf::strings::concatenate(s_phone_parts, cudf::string_scalar("-"));
+
+  // Generate the `s_acctbal` column
+  auto const s_acctbal = gen_rand_num_col<double>(-999.99, 9999.99, num_rows);
+
+  // Generate the `s_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto const s_comment = gen_rand_str_col(25, 100, num_rows);
+
+  // Create the `supplier` table
+  auto const supplier = cudf::table_view({s_suppkey->view(),
+                                          s_name->view(),
+                                          s_address->view(),
+                                          s_nationkey->view(),
+                                          s_phone->view(),
+                                          s_acctbal->view(),
+                                          s_comment->view()});
+  write_parquet(supplier, "supplier.parquet", schema_supplier);
+}
+
+/**
+ * @brief Entry point: `<program> <scale_factor>`
+ *
+ * Writes one parquet file per generated table into the current working directory.
+ *
+ * @return 0 on success, 1 on a usage error or if generation fails
+ */
+int main(int argc, char** argv)
+{
+  if (argc < 2) {
+    std::cerr << "Usage: " << argv[0] << " <scale_factor>" << std::endl;
+    return 1;
+  }
+
+  // `strtoll` instead of `atoi` so that garbage, trailing characters and overflow are detected
+  char* parse_end      = nullptr;
+  errno                = 0;
+  auto const requested = std::strtoll(argv[1], &parse_end, 10);
+  if (errno != 0 || parse_end == argv[1] || *parse_end != '\0' || requested < 1 ||
+      requested > std::numeric_limits<int32_t>::max()) {
+    std::cerr << "Invalid scale factor: " << argv[1] << std::endl;
+    return 1;
+  }
+  auto const scale_factor = static_cast<int32_t>(requested);
+  std::cout << "Requested scale factor: " << scale_factor << std::endl;
+
+  rmm::mr::cuda_memory_resource cuda_mr{};
+  rmm::mr::set_current_device_resource(&cuda_mr);
+
+  try {
+    // generate_lineitem(scale_factor);
+    // generate_orders(scale_factor);
+    generate_partsupp(scale_factor);
+    generate_part(scale_factor);
+    generate_supplier(scale_factor);
+    generate_customer(scale_factor);
+    generate_nation(scale_factor);
+    generate_region(scale_factor);
+  } catch (std::exception const& e) {
+    std::cerr << "Data generation failed: " << e.what() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/cpp/benchmarks/common/cudf_datagen/schema.hpp b/cpp/benchmarks/common/cudf_datagen/schema.hpp
new file mode 100644
index 00000000000..f5de39520b7
--- /dev/null
+++ b/cpp/benchmarks/common/cudf_datagen/schema.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include +#include + +auto const schema_customer = std::vector{"c_custkey", + "c_name", + "c_address", + "c_nationkey", + "c_phone", + "c_acctbal", + "c_mktsegment", + "c_comment"}; + +auto const schema_region = std::vector{"r_regionkey", "r_name", "r_comment"}; + +auto const schema_supplier = std::vector{ + "s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"}; + +auto const schema_nation = + std::vector{"n_nationkey", "n_name", "n_regionkey", "n_comment"}; + +auto const schema_part = std::vector{"p_partkey", + "p_name", + "p_mfgr", + "p_brand", + "p_type", + "p_size", + "p_container", + "p_retailprice", + "p_comment"}; + +auto const schema_partsupp = std::vector{ + "ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"}; diff --git a/cpp/benchmarks/common/cudf_datagen/utils.hpp b/cpp/benchmarks/common/cudf_datagen/utils.hpp new file mode 100644 index 00000000000..2f201fdc509 --- /dev/null +++ b/cpp/benchmarks/common/cudf_datagen/utils.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include + +#include + +void write_parquet(cudf::table_view tbl, + std::string const& path, + std::vector const& col_names) +{ + std::cout << "Writing to " << path << "\n"; + auto const sink_info = cudf::io::sink_info(path); + cudf::io::table_metadata metadata; + std::vector col_name_infos; + for (auto& col_name : col_names) { + col_name_infos.push_back(cudf::io::column_name_info(col_name)); + } + metadata.schema_info = col_name_infos; + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); +} + +std::unique_ptr perform_left_join(cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls) +{ + constexpr auto oob_policy = cudf::out_of_bounds_policy::NULLIFY; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = cudf::left_join( + left_selected, right_selected, compare_nulls, rmm::mr::get_current_device_resource()); + + auto const left_indices_span = cudf::device_span{*left_join_indices}; + auto const right_indices_span = cudf::device_span{*right_join_indices}; + + auto const left_indices_col = cudf::column_view{left_indices_span}; + auto const right_indices_col = cudf::column_view{right_indices_span}; + + auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); + auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); + + auto joined_cols = left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + 
std::make_move_iterator(right_cols.end())); + return std::make_unique(std::move(joined_cols)); +} diff --git a/cpp/benchmarks/common/cudf_datagen/vocab.hpp b/cpp/benchmarks/common/cudf_datagen/vocab.hpp new file mode 100644 index 00000000000..ea9c931b680 --- /dev/null +++ b/cpp/benchmarks/common/cudf_datagen/vocab.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +std::vector const vocab_p_name = { + "almond", "antique", "aquamarine", "azure", "beige", "bisque", "black", + "blanched", "blue", "blush", "brown", "burlywood", "burnished", "chartreuse", + "chiffon", "chocolate", "coral", "cornflower", "cornsilk", "cream", "cyan", + "dark", "deep", "dim", "dodger", "drab", "firebrick", "floral", + "forest", "frosted", "gainsboro", "ghost", "goldenrod", "green", "grey", + "honeydew", "hot", "indian", "ivory", "khaki", "lace", "lavender", + "lawn", "lemon", "light", "lime", "linen", "magenta", "maroon", + "medium", "metallic", "midnight", "mint", "misty", "moccasin", "navajo", + "navy", "olive", "orange", "orchid", "pale", "papaya", "peach", + "peru", "pink", "plum", "powder", "puff", "purple", "red", + "rose", "rosy", "royal", "saddle", "salmon", "sandy", "seashell", + "sienna", "sky", "slate", "smoke", "snow", "spring", "steel", + "tan", "thistle", "tomato", "turquoise", "violet", "wheat", "white", + "yellow"}; + +std::vector const vocab_modes = { + "REG AIR", "AIR", 
"RAIL", "SHIP", "TRUCK", "MAIL", "FOB"}; + +std::vector const vocab_instructions = { + "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"}; + +std::vector const vocab_priorities = { + "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"}; + +std::vector const vocab_segments = { + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"}; + +std::vector gen_vocab_types() +{ + std::vector syllable_a = { + "STANDARD", "SMALL", "MEDIUM", "LARGE", "ECONOMY", "PROMO"}; + std::vector syllable_b = {"ANODIZED", "BURNISHED", "PLATED", "POLISHED", "BRUSHED"}; + std::vector syllable_c = {"TIN", "NICKEL", "BRASS", "STEEL", "COPPER"}; + std::vector syllable_combinations; + for (auto const& s_a : syllable_a) { + for (auto const& s_b : syllable_b) { + for (auto const& s_c : syllable_c) { + syllable_combinations.push_back(s_a + " " + s_b + " " + s_c); + } + } + } + return syllable_combinations; +} + +std::vector gen_vocab_containers() +{ + std::vector syllable_a = {"SM", "LG", "MED", "JUMBO", "WRAP"}; + std::vector syllable_b = {"CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM"}; + std::vector syllable_combinations; + for (auto const& s_a : syllable_a) { + for (auto const& s_b : syllable_b) { + syllable_combinations.push_back(s_a + " " + s_b); + } + } + return syllable_combinations; +}