Support scale factor < 1
JayjeetAtGithub committed Aug 22, 2024
1 parent 64e57fe commit 4a3e91e
Showing 4 changed files with 143 additions and 59 deletions.
6 changes: 4 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -98,8 +98,8 @@ function(ConfigureBench CMAKE_BENCH_NAME)
CUDA_STANDARD_REQUIRED ON
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main
$<TARGET_NAME_IF_EXISTS:conda_env>
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_tpch_datagen cudf_datagen
benchmark::benchmark_main $<TARGET_NAME_IF_EXISTS:conda_env>
)
add_custom_command(
OUTPUT CUDF_BENCHMARKS
@@ -138,6 +138,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
)
endfunction()

ConfigureBench(DATAGEN common/cudf_tpch_datagen/datagen.cpp)

# ##################################################################################################
# * column benchmarks -----------------------------------------------------------------------------
ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp)
82 changes: 82 additions & 0 deletions cpp/benchmarks/common/cudf_tpch_datagen/datagen.cpp
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "tpch_datagen.hpp"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/parquet.hpp>

#include <cstdlib>
#include <iostream>

/**
* @brief Write a `cudf::table` to a parquet file
*
* @param table The cudf::table to write
* @param path The path to write the parquet file to
* @param col_names The names of the columns in the table
*/
void write_parquet(std::unique_ptr<cudf::table> table,
std::string const& path,
std::vector<std::string> const& col_names)
{
CUDF_FUNC_RANGE();
cudf::io::table_metadata metadata;
std::vector<cudf::io::column_name_info> col_name_infos;
for (auto& col_name : col_names) {
col_name_infos.push_back(cudf::io::column_name_info(col_name));
}
metadata.schema_info = col_name_infos;
auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
auto builder = cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(path));
builder.metadata(table_input_metadata);
auto const options = builder.build();
cudf::io::parquet_chunked_writer(options).write(table->view());
}

int main(int argc, char** argv)
{
if (argc < 2) {
std::cerr << "Usage: " << argv[0] << " [scale_factor]" << std::endl;
return 1;
}

double scale_factor = std::atof(argv[1]);
std::cout << "Generating scale factor: " << scale_factor << std::endl;

auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(orders), "orders.parquet", cudf::datagen::schema::ORDERS);
write_parquet(std::move(lineitem), "lineitem.parquet", cudf::datagen::schema::LINEITEM);
write_parquet(std::move(part), "part.parquet", cudf::datagen::schema::PART);

auto partsupp = cudf::datagen::generate_partsupp(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(partsupp), "partsupp.parquet", cudf::datagen::schema::PARTSUPP);

auto supplier = cudf::datagen::generate_supplier(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(supplier), "supplier.parquet", cudf::datagen::schema::SUPPLIER);

auto customer = cudf::datagen::generate_customer(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(customer), "customer.parquet", cudf::datagen::schema::CUSTOMER);

auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
write_parquet(std::move(nation), "nation.parquet", cudf::datagen::schema::NATION);

auto region = cudf::datagen::generate_region(cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
write_parquet(std::move(region), "region.parquet", cudf::datagen::schema::REGION);
}
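
The tool writes each TPC-H table to a Parquet file in the current working directory. As a quick sanity check, the output can be read back with cudf's Parquet reader; the snippet below is a minimal sketch, not part of this commit, and assumes orders.parquet was produced by a prior run of the generator.

#include <cudf/io/parquet.hpp>
#include <cudf/table/table.hpp>

#include <iostream>

int main()
{
  // Load the generated orders table and report its shape. At scale factor 1
  // the TPC-H orders table has 1,500,000 rows; at 0.1 it should be ~150,000.
  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"orders.parquet"}).build();
  auto const result = cudf::io::read_parquet(options);
  std::cout << "orders: " << result.tbl->num_rows() << " rows x " << result.tbl->num_columns()
            << " columns" << std::endl;
  return 0;
}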
14 changes: 7 additions & 7 deletions cpp/benchmarks/common/cudf_tpch_datagen/tpch_datagen.cpp
@@ -123,7 +123,7 @@ std::vector<std::string> generate_vocab_containers()
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_orders_independent(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_orders_independent(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -245,7 +245,7 @@ std::unique_ptr<cudf::table> generate_orders_independent(cudf::size_type const&
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& orders_independent,
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -500,7 +500,7 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_partsupp(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_partsupp(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -552,7 +552,7 @@ std::unique_ptr<cudf::table> generate_partsupp(cudf::size_type const& scale_fact
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_part(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_part(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -659,7 +659,7 @@ std::unique_ptr<cudf::table> generate_part(cudf::size_type const& scale_factor,
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
generate_orders_lineitem_part(cudf::size_type const& scale_factor,
generate_orders_lineitem_part(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -729,7 +729,7 @@ generate_orders_lineitem_part(cudf::size_type const& scale_factor,
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_supplier(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_supplier(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -795,7 +795,7 @@ std::unique_ptr<cudf::table> generate_supplier(cudf::size_type const& scale_fact
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_customer(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_customer(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
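
Every change in this file is the same one-line signature fix: the scale_factor parameter goes from cudf::size_type const& (a 32-bit integer) to double, which is what makes scale factors below 1 representable at all. The sketch below only illustrates the idea; scaled_rows and its rounding policy are hypothetical, not code from the generator, though the base cardinalities used in the asserts (1,500,000 orders and 150,000 customers per unit of scale factor) are the standard TPC-H ones.

#include <cudf/types.hpp>

#include <cassert>
#include <cmath>

// Hypothetical helper: with an integer scale factor, any value below 1 would
// collapse to 0 and every derived row count would be empty. Accepting double
// lets fractional scale factors produce proportionally smaller tables.
cudf::size_type scaled_rows(double scale_factor, cudf::size_type base_rows)
{
  return static_cast<cudf::size_type>(std::round(scale_factor * base_rows));
}

int main()
{
  assert(scaled_rows(0.1, 1'500'000) == 150'000);  // orders at scale factor 0.1
  assert(scaled_rows(0.01, 150'000) == 1'500);     // customer at scale factor 0.01
  return 0;
}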
100 changes: 50 additions & 50 deletions cpp/benchmarks/common/cudf_tpch_datagen/tpch_datagen.hpp
@@ -24,54 +24,54 @@ namespace CUDF_EXPORT cudf {
namespace datagen {
namespace schema {

constexpr std::array<const char*, 9> ORDERS{"o_orderkey",
"o_custkey",
"o_orderdate",
"o_orderpriority",
"o_clerk",
"o_shippriority",
"o_comment",
"o_totalprice",
"o_orderstatus"};
constexpr std::array<const char*, 16> LINEITEM{"l_orderkey",
"l_partkey",
"l_suppkey",
"l_linenumber",
"l_quantity",
"l_discount",
"l_tax",
"l_shipdate",
"l_commitdate",
"l_receiptdate",
"l_returnflag",
"l_linestatus",
"l_shipinstruct",
"l_shipmode",
"l_comment",
"l_extendedprice"};
constexpr std::array<const char*, 9> PART{"p_partkey",
"p_name",
"p_mfgr",
"p_brand",
"p_type",
"p_size",
"p_container",
"p_retailprice",
"p_comment"};
constexpr std::array<const char*, 5> PARTSUPP{
const std::vector<std::string> ORDERS = {"o_orderkey",
"o_custkey",
"o_orderdate",
"o_orderpriority",
"o_clerk",
"o_shippriority",
"o_comment",
"o_totalprice",
"o_orderstatus"};
const std::vector<std::string> LINEITEM = {"l_orderkey",
"l_partkey",
"l_suppkey",
"l_linenumber",
"l_quantity",
"l_discount",
"l_tax",
"l_shipdate",
"l_commitdate",
"l_receiptdate",
"l_returnflag",
"l_linestatus",
"l_shipinstruct",
"l_shipmode",
"l_comment",
"l_extendedprice"};
const std::vector<std::string> PART = {"p_partkey",
"p_name",
"p_mfgr",
"p_brand",
"p_type",
"p_size",
"p_container",
"p_retailprice",
"p_comment"};
const std::vector<std::string> PARTSUPP = {
"ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"};
constexpr std::array<const char*, 7> SUPPLIER{
const std::vector<std::string> SUPPLIER = {
"s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"};
constexpr std::array<const char*, 8> CUSTOMER{"c_custkey",
"c_name",
"c_address",
"c_nationkey",
"c_phone",
"c_acctbal",
"c_mktsegment",
"c_comment"};
constexpr std::array<const char*, 4> NATION{"n_nationkey", "n_name", "n_regionkey", "n_comment"};
constexpr std::array<const char*, 3> REGION{"r_regionkey", "r_name", "r_comment"};
const std::vector<std::string> CUSTOMER = {"c_custkey",
"c_name",
"c_address",
"c_nationkey",
"c_phone",
"c_acctbal",
"c_mktsegment",
"c_comment"};
const std::vector<std::string> NATION = {"n_nationkey", "n_name", "n_regionkey", "n_comment"};
const std::vector<std::string> REGION = {"r_regionkey", "r_name", "r_comment"};

} // namespace schema

@@ -84,7 +84,7 @@ constexpr std::array<const char*, 3> REGION{"r_regionkey", "r_name", "r_comment"
*/
std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
generate_orders_lineitem_part(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

@@ -96,7 +96,7 @@ generate_orders_lineitem_part(
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_partsupp(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

@@ -108,7 +108,7 @@ std::unique_ptr<cudf::table> generate_partsupp(
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_supplier(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

@@ -120,7 +120,7 @@ std::unique_ptr<cudf::table> generate_supplier(
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_customer(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

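
The schema constants change from constexpr std::array<const char*, N> to const std::vector<std::string>, matching the std::vector<std::string> const& parameter that write_parquet in the new datagen.cpp expects, so the constants can be passed straight through without a conversion at every call site. The cost is that each translation unit including this header builds its own copies of the vectors at static-initialization time, which is usually acceptable for benchmark-only code. A small sketch of the difference in use; use_names is a hypothetical stand-in for such a caller.

#include <array>
#include <string>
#include <vector>

// Hypothetical caller that, like write_parquet in datagen.cpp, takes the
// column names as a std::vector<std::string>.
void use_names(std::vector<std::string> const& col_names) { (void)col_names; }

int main()
{
  // Old form: a fixed array of C strings needs an explicit conversion first.
  constexpr std::array<const char*, 3> region_old{"r_regionkey", "r_name", "r_comment"};
  use_names(std::vector<std::string>(region_old.begin(), region_old.end()));

  // New form: the header's const std::vector<std::string> constants pass directly.
  const std::vector<std::string> region_new = {"r_regionkey", "r_name", "r_comment"};
  use_names(region_new);
  return 0;
}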