diff --git a/cpp/benchmarks/common/cudf_datagen/dbgen.cu b/cpp/benchmarks/common/cudf_datagen/dbgen.cu index 94bd171235c..55e57e27238 100644 --- a/cpp/benchmarks/common/cudf_datagen/dbgen.cu +++ b/cpp/benchmarks/common/cudf_datagen/dbgen.cu @@ -201,56 +201,6 @@ std::unique_ptr gen_repeat_str_col(std::string value, int64_t num_ return scalar_repeat; } -std::vector get_modes_strs() -{ - return std::vector{"REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB"}; -} - -std::vector get_instructions_strs() -{ - return std::vector{"DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"}; -} - -std::vector get_priorities_strs() -{ - return std::vector{"1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"}; -} - -std::vector get_segments_strs() -{ - return std::vector{"AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"}; -} - -std::vector get_types_strs() -{ - std::vector syllable_a = { - "STANDARD", "SMALL", "MEDIUM", "LARGE", "ECONOMY", "PROMO"}; - std::vector syllable_b = {"ANODIZED", "BURNISHED", "PLATED", "POLISHED", "BRUSHED"}; - std::vector syllable_c = {"TIN", "NICKEL", "BRASS", "STEEL", "COPPER"}; - std::vector syllable_combinations; - for (auto const& s_a : syllable_a) { - for (auto const& s_b : syllable_b) { - for (auto const& s_c : syllable_c) { - syllable_combinations.push_back(s_a + " " + s_b + " " + s_c); - } - } - } - return syllable_combinations; -} - -std::vector get_containers_strs() -{ - std::vector syllable_a = {"SM", "LG", "MED", "JUMBO", "WRAP"}; - std::vector syllable_b = {"CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM"}; - std::vector syllable_combinations; - for (auto const& s_a : syllable_a) { - for (auto const& s_b : syllable_b) { - syllable_combinations.push_back(s_a + " " + s_b); - } - } - return syllable_combinations; -} - /* NOTE(review): string_set is received by value, so passing a namespace-scope vocabulary such as vocab_modes copies the whole vector on every call; a const reference would avoid that, but the function body is not visible here - confirm before changing. */ std::unique_ptr gen_rand_str_col_from_set(std::vector string_set, int64_t num_rows) { @@ -272,25 +222,29 @@ void generate_lineitem(int64_t scale_factor) { /* NOTE(review): scale_factor is int64_t, so the product below is computed in 64 bits and then converted to cudf::size_type; presumably that type is 32-bit - confirm that supported scale factors cannot overflow it. */ cudf::size_type num_rows = 
1500000 * scale_factor; /* New column: l_partkey is produced by gen_rand_num_col with bounds 1 and 200000 * scale_factor, and is placed first in the lineitem table_view below. */ + // Generate the `l_partkey` column + auto l_partkey = gen_rand_num_col(1, 200000 * scale_factor, num_rows); + // Generate the `l_quantity` column auto l_quantity = gen_rand_num_col(1, 50, num_rows); /* NOTE(review): the removed and added l_discount / l_tax lines read identically in this text; presumably they differ only in template arguments that are not visible here - confirm against the real patch. */ // Generate the `l_discount` column - auto l_discount = gen_rand_num_col(0.0, 0.10, num_rows); + auto l_discount = gen_rand_num_col(0.0, 0.10, num_rows); // Generate the `l_tax` column - auto l_tax = gen_rand_num_col(0.0, 0.08, num_rows); + auto l_tax = gen_rand_num_col(0.0, 0.08, num_rows); // Generate the `l_comment` column auto l_comment = gen_rand_str_col(10, 43, num_rows); // Generate the `l_shipinstruct` column - auto l_shipinstruct = gen_rand_str_col_from_set(get_instructions_strs(), num_rows); + auto l_shipinstruct = gen_rand_str_col_from_set(vocab_instructions, num_rows); // Generate the `l_shipmode` column - auto l_shipmode = gen_rand_str_col_from_set(get_modes_strs(), num_rows); + auto l_shipmode = gen_rand_str_col_from_set(vocab_modes, num_rows); - auto lineitem = cudf::table_view({l_quantity->view(), + auto lineitem = cudf::table_view({l_partkey->view(), + l_quantity->view(), l_discount->view(), l_tax->view(), l_comment->view(), @@ -307,7 +261,7 @@ void generate_orders(int64_t scale_factor) cudf::size_type num_rows = 1500000 * scale_factor; // Generate the `o_orderpriority` column - auto o_orderpriority = gen_rand_str_col_from_set(get_priorities_strs(), num_rows); + auto o_orderpriority = gen_rand_str_col_from_set(vocab_priorities, num_rows); // Generate the `o_shippriority` column auto empty = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, @@ -418,7 +372,7 @@ void generate_partsupp(int64_t const& scale_factor, auto const ps_availqty = gen_rand_num_col(1, 9999, num_rows); /* NOTE(review): the comment below names p_supplycost while the variable it describes is ps_supplycost - the ps_ prefix looks intended; confirm. */ // Generate the `p_supplycost` column - auto const ps_supplycost = gen_rand_num_col(1.00, 1000.00, num_rows); + auto const ps_supplycost = gen_rand_num_col(1.0, 1000.0, num_rows); // Generate the `p_comment` column // NOTE: This column is not 
compliant with clause 4.2.2.10 of the TPC-H specification @@ -527,13 +481,13 @@ void generate_part(int64_t const& scale_factor, {brand_repeat->view(), random_values_m_str->view(), random_values_n_str->view()})); /* gen_vocab_types() and gen_vocab_containers() are invoked as functions below, so their vocabularies are rebuilt on each generate_part call, whereas the other vocabularies are namespace-scope constants. */ // Generate the `p_type` column - auto const p_type = gen_rand_str_col_from_set(get_types_strs(), num_rows); + auto const p_type = gen_rand_str_col_from_set(gen_vocab_types(), num_rows); // Generate the `p_size` column auto const p_size = gen_rand_num_col(1, 50, num_rows); // Generate the `p_container` column - auto const p_container = gen_rand_str_col_from_set(get_containers_strs(), num_rows); + auto const p_container = gen_rand_str_col_from_set(gen_vocab_containers(), num_rows); // Generate the `p_retailprice` column auto const p_retailprice = calc_p_retailprice(p_partkey->view()); @@ -684,7 +638,7 @@ void generate_customer(int64_t const& scale_factor, auto const c_acctbal = gen_rand_num_col(-999.99, 9999.99, num_rows); // Generate the `c_mktsegment` column - auto const c_mktsegment = gen_rand_str_col_from_set(get_segments_strs(), num_rows); + auto const c_mktsegment = gen_rand_str_col_from_set(vocab_segments, num_rows); // Generate the `c_comment` column // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification diff --git a/cpp/benchmarks/common/cudf_datagen/vocab.hpp b/cpp/benchmarks/common/cudf_datagen/vocab.hpp index 12efcb490f9..ea9c931b680 100644 --- a/cpp/benchmarks/common/cudf_datagen/vocab.hpp +++ b/cpp/benchmarks/common/cudf_datagen/vocab.hpp @@ -31,3 +31,45 @@ std::vector const vocab_p_name = { "sienna", "sky", "slate", "smoke", "snow", "spring", "steel", "tan", "thistle", "tomato", "turquoise", "violet", "wheat", "white", "yellow"}; + /* Fixed string vocabularies passed to gen_rand_str_col_from_set in dbgen.cu: vocab_modes for l_shipmode, vocab_instructions for l_shipinstruct, vocab_priorities for o_orderpriority. */ +std::vector const vocab_modes = { + "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB"}; + +std::vector const vocab_instructions = { + "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"}; + +std::vector const vocab_priorities = { + "1-URGENT", "2-HIGH", "3-MEDIUM", 
"4-NOT SPECIFIED", "5-LOW"}; + /* vocab_segments is the string set used for c_mktsegment in generate_customer. */ +std::vector const vocab_segments = { + "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"}; + /* Returns the p_type vocabulary: every "<syllable_a> <syllable_b> <syllable_c>" combination, 6 * 5 * 5 = 150 strings, with syllable_c varying fastest. NOTE(review): this is a non-inline function definition in vocab.hpp; if the header is included from more than one translation unit the definitions would collide at link time - confirm single inclusion or mark it inline. */ +std::vector gen_vocab_types() +{ + std::vector syllable_a = { + "STANDARD", "SMALL", "MEDIUM", "LARGE", "ECONOMY", "PROMO"}; + std::vector syllable_b = {"ANODIZED", "BURNISHED", "PLATED", "POLISHED", "BRUSHED"}; + std::vector syllable_c = {"TIN", "NICKEL", "BRASS", "STEEL", "COPPER"}; + std::vector syllable_combinations; + for (auto const& s_a : syllable_a) { + for (auto const& s_b : syllable_b) { + for (auto const& s_c : syllable_c) { + syllable_combinations.push_back(s_a + " " + s_b + " " + s_c); + } + } + } + return syllable_combinations; +} + /* Returns the p_container vocabulary: every "<syllable_a> <syllable_b>" combination, 5 * 8 = 40 strings, with syllable_b varying fastest. NOTE(review): this is a non-inline function definition in vocab.hpp; including the header from more than one translation unit would produce duplicate definitions - confirm single inclusion or mark it inline. */ +std::vector gen_vocab_containers() +{ + std::vector syllable_a = {"SM", "LG", "MED", "JUMBO", "WRAP"}; + std::vector syllable_b = {"CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM"}; + std::vector syllable_combinations; + for (auto const& s_a : syllable_a) { + for (auto const& s_b : syllable_b) { + syllable_combinations.push_back(s_a + " " + s_b); + } + } + return syllable_combinations; +}