Skip to content

Commit

Permalink
Move vocabs into vocab.hpp
Browse files Browse the repository at this point in the history
  • Loading branch information
JayjeetAtGithub committed Jul 18, 2024
1 parent 39cf97c commit 0f47bef
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 60 deletions.
74 changes: 14 additions & 60 deletions cpp/benchmarks/common/cudf_datagen/dbgen.cu
Original file line number Diff line number Diff line change
Expand Up @@ -201,56 +201,6 @@ std::unique_ptr<cudf::column> gen_repeat_str_col(std::string value, int64_t num_
return scalar_repeat;
}

// Returns the fixed set of strings from which `l_shipmode` values are drawn.
std::vector<std::string> get_modes_strs()
{
  return {"REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB"};
}

// Returns the fixed set of strings from which `l_shipinstruct` values are drawn.
std::vector<std::string> get_instructions_strs()
{
  return {"DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"};
}

// Returns the fixed set of strings from which `o_orderpriority` values are drawn.
std::vector<std::string> get_priorities_strs()
{
  return {"1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"};
}

// Returns the fixed set of strings from which `c_mktsegment` values are drawn.
std::vector<std::string> get_segments_strs()
{
  return {"AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"};
}

// Builds every "<first> <second> <third>" string obtainable by picking one word from each
// of the three word lists below. The first list varies slowest and the third fastest.
// The result is the string set used for the `p_type` column.
std::vector<std::string> get_types_strs()
{
  std::vector<std::string> const first_words = {
    "STANDARD", "SMALL", "MEDIUM", "LARGE", "ECONOMY", "PROMO"};
  std::vector<std::string> const second_words = {
    "ANODIZED", "BURNISHED", "PLATED", "POLISHED", "BRUSHED"};
  std::vector<std::string> const third_words = {"TIN", "NICKEL", "BRASS", "STEEL", "COPPER"};

  std::vector<std::string> types;
  for (auto const& first : first_words) {
    for (auto const& second : second_words) {
      // The leading two words are shared by every entry produced in the inner loop.
      std::string const prefix = first + " " + second + " ";
      for (auto const& third : third_words) {
        types.emplace_back(prefix + third);
      }
    }
  }
  return types;
}

// Builds every "<first> <second>" string obtainable by pairing a word from the first list
// with a word from the second list. The first list varies slowest.
// The result is the string set used for the `p_container` column.
std::vector<std::string> get_containers_strs()
{
  std::vector<std::string> const first_words  = {"SM", "LG", "MED", "JUMBO", "WRAP"};
  std::vector<std::string> const second_words = {
    "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM"};

  std::vector<std::string> containers;
  for (auto const& first : first_words) {
    std::string const prefix = first + " ";
    for (auto const& second : second_words) {
      containers.emplace_back(prefix + second);
    }
  }
  return containers;
}

std::unique_ptr<cudf::column> gen_rand_str_col_from_set(std::vector<std::string> string_set,
int64_t num_rows)
{
Expand All @@ -272,25 +222,29 @@ void generate_lineitem(int64_t scale_factor)
{
cudf::size_type num_rows = 1500000 * scale_factor;

// Generate the `l_partkey` column
auto l_partkey = gen_rand_num_col<int64_t>(1, 200000 * scale_factor, num_rows);

// Generate the `l_quantity` column
auto l_quantity = gen_rand_num_col<int64_t>(1, 50, num_rows);

// Generate the `l_discount` column
auto l_discount = gen_rand_num_col<float>(0.0, 0.10, num_rows);
auto l_discount = gen_rand_num_col<double>(0.0, 0.10, num_rows);

// Generate the `l_tax` column
auto l_tax = gen_rand_num_col<float>(0.0, 0.08, num_rows);
auto l_tax = gen_rand_num_col<double>(0.0, 0.08, num_rows);

// Generate the `l_comment` column
auto l_comment = gen_rand_str_col(10, 43, num_rows);

// Generate the `l_shipinstruct` column
auto l_shipinstruct = gen_rand_str_col_from_set(get_instructions_strs(), num_rows);
auto l_shipinstruct = gen_rand_str_col_from_set(vocab_instructions, num_rows);

// Generate the `l_shipmode` column
auto l_shipmode = gen_rand_str_col_from_set(get_modes_strs(), num_rows);
auto l_shipmode = gen_rand_str_col_from_set(vocab_modes, num_rows);

auto lineitem = cudf::table_view({l_quantity->view(),
auto lineitem = cudf::table_view({l_partkey->view(),
l_quantity->view(),
l_discount->view(),
l_tax->view(),
l_comment->view(),
Expand All @@ -307,7 +261,7 @@ void generate_orders(int64_t scale_factor)
cudf::size_type num_rows = 1500000 * scale_factor;

// Generate the `o_orderpriority` column
auto o_orderpriority = gen_rand_str_col_from_set(get_priorities_strs(), num_rows);
auto o_orderpriority = gen_rand_str_col_from_set(vocab_priorities, num_rows);

// Generate the `o_shippriority` column
auto empty = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64},
Expand Down Expand Up @@ -418,7 +372,7 @@ void generate_partsupp(int64_t const& scale_factor,
auto const ps_availqty = gen_rand_num_col<int64_t>(1, 9999, num_rows);

// Generate the `ps_supplycost` column
auto const ps_supplycost = gen_rand_num_col<double>(1.00, 1000.00, num_rows);
auto const ps_supplycost = gen_rand_num_col<double>(1.0, 1000.0, num_rows);

// Generate the `p_comment` column
// NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
Expand Down Expand Up @@ -527,13 +481,13 @@ void generate_part(int64_t const& scale_factor,
{brand_repeat->view(), random_values_m_str->view(), random_values_n_str->view()}));

// Generate the `p_type` column
auto const p_type = gen_rand_str_col_from_set(get_types_strs(), num_rows);
auto const p_type = gen_rand_str_col_from_set(gen_vocab_types(), num_rows);

// Generate the `p_size` column
auto const p_size = gen_rand_num_col<int64_t>(1, 50, num_rows);

// Generate the `p_container` column
auto const p_container = gen_rand_str_col_from_set(get_containers_strs(), num_rows);
auto const p_container = gen_rand_str_col_from_set(gen_vocab_containers(), num_rows);

// Generate the `p_retailprice` column
auto const p_retailprice = calc_p_retailprice(p_partkey->view());
Expand Down Expand Up @@ -684,7 +638,7 @@ void generate_customer(int64_t const& scale_factor,
auto const c_acctbal = gen_rand_num_col<double>(-999.99, 9999.99, num_rows);

// Generate the `c_mktsegment` column
auto const c_mktsegment = gen_rand_str_col_from_set(get_segments_strs(), num_rows);
auto const c_mktsegment = gen_rand_str_col_from_set(vocab_segments, num_rows);

// Generate the `c_comment` column
// NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
Expand Down
42 changes: 42 additions & 0 deletions cpp/benchmarks/common/cudf_datagen/vocab.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,45 @@ std::vector<std::string> const vocab_p_name = {
"sienna", "sky", "slate", "smoke", "snow", "spring", "steel",
"tan", "thistle", "tomato", "turquoise", "violet", "wheat", "white",
"yellow"};

// Fixed string vocabularies used by the data generator.
//
// These are declared `inline` (C++17) because they live in a header: a plain namespace-scope
// `const` object has internal linkage, so every translation unit including this header would
// otherwise construct and keep its own private copy of each vector.

// Strings from which `l_shipmode` values are drawn.
inline std::vector<std::string> const vocab_modes = {
  "REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB"};

// Strings from which `l_shipinstruct` values are drawn.
inline std::vector<std::string> const vocab_instructions = {
  "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"};

// Strings from which `o_orderpriority` values are drawn.
inline std::vector<std::string> const vocab_priorities = {
  "1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"};

// Strings from which `c_mktsegment` values are drawn.
inline std::vector<std::string> const vocab_segments = {
  "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"};

/**
 * @brief Builds the vocabulary of three-word type strings used for the `p_type` column.
 *
 * Every combination of one word from each of the three syllable lists is emitted as
 * "<a> <b> <c>". The first list varies slowest and the third list fastest, giving
 * 6 * 5 * 5 = 150 entries.
 *
 * Declared `inline` because the function is defined in a header; a non-inline definition
 * would be a multiple-definition (ODR) error as soon as the header is included from more
 * than one translation unit.
 *
 * @return Vector holding all type strings in generation order
 */
inline std::vector<std::string> gen_vocab_types()
{
  std::vector<std::string> const syllable_a = {
    "STANDARD", "SMALL", "MEDIUM", "LARGE", "ECONOMY", "PROMO"};
  std::vector<std::string> const syllable_b = {
    "ANODIZED", "BURNISHED", "PLATED", "POLISHED", "BRUSHED"};
  std::vector<std::string> const syllable_c = {"TIN", "NICKEL", "BRASS", "STEEL", "COPPER"};

  std::vector<std::string> syllable_combinations;
  // The final size is known up front, so allocate once instead of growing repeatedly.
  syllable_combinations.reserve(syllable_a.size() * syllable_b.size() * syllable_c.size());
  for (auto const& s_a : syllable_a) {
    for (auto const& s_b : syllable_b) {
      for (auto const& s_c : syllable_c) {
        syllable_combinations.push_back(s_a + " " + s_b + " " + s_c);
      }
    }
  }
  return syllable_combinations;
}

/**
 * @brief Builds the vocabulary of two-word container strings used for the `p_container` column.
 *
 * Every pairing of a word from the first syllable list with a word from the second is
 * emitted as "<a> <b>". The first list varies slowest, giving 5 * 8 = 40 entries.
 *
 * Declared `inline` because the function is defined in a header; a non-inline definition
 * would be a multiple-definition (ODR) error as soon as the header is included from more
 * than one translation unit.
 *
 * @return Vector holding all container strings in generation order
 */
inline std::vector<std::string> gen_vocab_containers()
{
  std::vector<std::string> const syllable_a = {"SM", "LG", "MED", "JUMBO", "WRAP"};
  std::vector<std::string> const syllable_b = {
    "CASE", "BOX", "BAG", "JAR", "PKG", "PACK", "CAN", "DRUM"};

  std::vector<std::string> syllable_combinations;
  // The final size is known up front, so allocate once instead of growing repeatedly.
  syllable_combinations.reserve(syllable_a.size() * syllable_b.size());
  for (auto const& s_a : syllable_a) {
    for (auto const& s_b : syllable_b) {
      syllable_combinations.push_back(s_a + " " + s_b);
    }
  }
  return syllable_combinations;
}

0 comments on commit 0f47bef

Please sign in to comment.