Support scale factor < 1
JayjeetAtGithub committed Aug 22, 2024
1 parent 64e57fe commit 4a3e91e
Showing 4 changed files with 143 additions and 59 deletions.
6 changes: 4 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -98,8 +98,8 @@ function(ConfigureBench CMAKE_BENCH_NAME)
CUDA_STANDARD_REQUIRED ON
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main
$<TARGET_NAME_IF_EXISTS:conda_env>
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_tpch_datagen cudf_datagen
benchmark::benchmark_main $<TARGET_NAME_IF_EXISTS:conda_env>
)
add_custom_command(
OUTPUT CUDF_BENCHMARKS
@@ -138,6 +138,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
)
endfunction()

ConfigureBench(DATAGEN common/cudf_tpch_datagen/datagen.cpp)

# ##################################################################################################
# * column benchmarks -----------------------------------------------------------------------------
ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp)
82 changes: 82 additions & 0 deletions cpp/benchmarks/common/cudf_tpch_datagen/datagen.cpp
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "tpch_datagen.hpp"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/parquet.hpp>

#include <cstdlib>
#include <iostream>

/**
* @brief Write a `cudf::table` to a parquet file
*
* @param table The cudf::table to write
* @param path The path to write the parquet file to
* @param col_names The names of the columns in the table
*/
void write_parquet(std::unique_ptr<cudf::table> table,
std::string const& path,
std::vector<std::string> const& col_names)
{
CUDF_FUNC_RANGE();
cudf::io::table_metadata metadata;
std::vector<cudf::io::column_name_info> col_name_infos;
for (auto& col_name : col_names) {
col_name_infos.push_back(cudf::io::column_name_info(col_name));
}
metadata.schema_info = col_name_infos;
auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
auto builder = cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(path));
builder.metadata(table_input_metadata);
auto const options = builder.build();
cudf::io::parquet_chunked_writer(options).write(table->view());
}

int main(int argc, char** argv)
{
if (argc < 2) {
std::cerr << "Usage: " << argv[0] << " [scale_factor]" << std::endl;
return 1;
}

double scale_factor = std::atof(argv[1]);
std::cout << "Generating scale factor: " << scale_factor << std::endl;

auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(orders), "orders.parquet", cudf::datagen::schema::ORDERS);
write_parquet(std::move(lineitem), "lineitem.parquet", cudf::datagen::schema::LINEITEM);
write_parquet(std::move(part), "part.parquet", cudf::datagen::schema::PART);

auto partsupp = cudf::datagen::generate_partsupp(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(partsupp), "partsupp.parquet", cudf::datagen::schema::PARTSUPP);

auto supplier = cudf::datagen::generate_supplier(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(supplier), "supplier.parquet", cudf::datagen::schema::SUPPLIER);

auto customer = cudf::datagen::generate_customer(
scale_factor, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
write_parquet(std::move(customer), "customer.parquet", cudf::datagen::schema::CUSTOMER);

auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
write_parquet(std::move(nation), "nation.parquet", cudf::datagen::schema::NATION);

auto region = cudf::datagen::generate_region(cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
write_parquet(std::move(region), "region.parquet", cudf::datagen::schema::REGION);
}
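
The tool writes each TPC-H table to a Parquet file in the current working directory. As a quick sanity check, the output can be read back with cudf's Parquet reader; the snippet below is a minimal sketch, not part of this commit, and assumes orders.parquet was produced by a prior run of the generator.

#include <cudf/io/parquet.hpp>
#include <cudf/table/table.hpp>

#include <iostream>

int main()
{
  // Load the generated orders table and report its shape. At scale factor 1
  // the TPC-H orders table has 1,500,000 rows; at 0.1 it should be ~150,000.
  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"orders.parquet"}).build();
  auto const result = cudf::io::read_parquet(options);
  std::cout << "orders: " << result.tbl->num_rows() << " rows x " << result.tbl->num_columns()
            << " columns" << std::endl;
  return 0;
}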
14 changes: 7 additions & 7 deletions cpp/benchmarks/common/cudf_tpch_datagen/tpch_datagen.cpp
@@ -123,7 +123,7 @@ std::vector<std::string> generate_vocab_containers()
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_orders_independent(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_orders_independent(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -245,7 +245,7 @@ std::unique_ptr<cudf::table> generate_orders_independent(cudf::size_type const&
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& orders_independent,
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -500,7 +500,7 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_partsupp(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_partsupp(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -552,7 +552,7 @@ std::unique_ptr<cudf::table> generate_partsupp(cudf::size_type const& scale_fact
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_part(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_part(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -659,7 +659,7 @@ std::unique_ptr<cudf::table> generate_part(cudf::size_type const& scale_factor,
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
generate_orders_lineitem_part(cudf::size_type const& scale_factor,
generate_orders_lineitem_part(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -729,7 +729,7 @@ generate_orders_lineitem_part(cudf::size_type const& scale_factor,
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_supplier(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_supplier(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -795,7 +795,7 @@ std::unique_ptr<cudf::table> generate_supplier(cudf::size_type const& scale_fact
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_customer(cudf::size_type const& scale_factor,
std::unique_ptr<cudf::table> generate_customer(double scale_factor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
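
Every change in this file is the same one-line signature fix: the scale_factor parameter goes from cudf::size_type const& (a 32-bit integer) to double, which is what makes scale factors below 1 representable at all. The sketch below only illustrates the idea; scaled_rows and its rounding policy are hypothetical, not code from the generator, though the base cardinalities used in the asserts (1,500,000 orders and 150,000 customers per unit of scale factor) are the standard TPC-H ones.

#include <cudf/types.hpp>

#include <cassert>
#include <cmath>

// Hypothetical helper: with an integer scale factor, any value below 1 would
// collapse to 0 and every derived row count would be empty. Accepting double
// lets fractional scale factors produce proportionally smaller tables.
cudf::size_type scaled_rows(double scale_factor, cudf::size_type base_rows)
{
  return static_cast<cudf::size_type>(std::round(scale_factor * base_rows));
}

int main()
{
  assert(scaled_rows(0.1, 1'500'000) == 150'000);  // orders at scale factor 0.1
  assert(scaled_rows(0.01, 150'000) == 1'500);     // customer at scale factor 0.01
  return 0;
}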
100 changes: 50 additions & 50 deletions cpp/benchmarks/common/cudf_tpch_datagen/tpch_datagen.hpp
@@ -24,54 +24,54 @@ namespace CUDF_EXPORT cudf {
namespace datagen {
namespace schema {

constexpr std::array<const char*, 9> ORDERS{"o_orderkey",
"o_custkey",
"o_orderdate",
"o_orderpriority",
"o_clerk",
"o_shippriority",
"o_comment",
"o_totalprice",
"o_orderstatus"};
constexpr std::array<const char*, 16> LINEITEM{"l_orderkey",
"l_partkey",
"l_suppkey",
"l_linenumber",
"l_quantity",
"l_discount",
"l_tax",
"l_shipdate",
"l_commitdate",
"l_receiptdate",
"l_returnflag",
"l_linestatus",
"l_shipinstruct",
"l_shipmode",
"l_comment",
"l_extendedprice"};
constexpr std::array<const char*, 9> PART{"p_partkey",
"p_name",
"p_mfgr",
"p_brand",
"p_type",
"p_size",
"p_container",
"p_retailprice",
"p_comment"};
constexpr std::array<const char*, 5> PARTSUPP{
const std::vector<std::string> ORDERS = {"o_orderkey",
"o_custkey",
"o_orderdate",
"o_orderpriority",
"o_clerk",
"o_shippriority",
"o_comment",
"o_totalprice",
"o_orderstatus"};
const std::vector<std::string> LINEITEM = {"l_orderkey",
"l_partkey",
"l_suppkey",
"l_linenumber",
"l_quantity",
"l_discount",
"l_tax",
"l_shipdate",
"l_commitdate",
"l_receiptdate",
"l_returnflag",
"l_linestatus",
"l_shipinstruct",
"l_shipmode",
"l_comment",
"l_extendedprice"};
const std::vector<std::string> PART = {"p_partkey",
"p_name",
"p_mfgr",
"p_brand",
"p_type",
"p_size",
"p_container",
"p_retailprice",
"p_comment"};
const std::vector<std::string> PARTSUPP = {
"ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"};
constexpr std::array<const char*, 7> SUPPLIER{
const std::vector<std::string> SUPPLIER = {
"s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"};
constexpr std::array<const char*, 8> CUSTOMER{"c_custkey",
"c_name",
"c_address",
"c_nationkey",
"c_phone",
"c_acctbal",
"c_mktsegment",
"c_comment"};
constexpr std::array<const char*, 4> NATION{"n_nationkey", "n_name", "n_regionkey", "n_comment"};
constexpr std::array<const char*, 3> REGION{"r_regionkey", "r_name", "r_comment"};
const std::vector<std::string> CUSTOMER = {"c_custkey",
"c_name",
"c_address",
"c_nationkey",
"c_phone",
"c_acctbal",
"c_mktsegment",
"c_comment"};
const std::vector<std::string> NATION = {"n_nationkey", "n_name", "n_regionkey", "n_comment"};
const std::vector<std::string> REGION = {"r_regionkey", "r_name", "r_comment"};

} // namespace schema

@@ -84,7 +84,7 @@ constexpr std::array<const char*, 3> REGION{"r_regionkey", "r_name", "r_comment"
*/
std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
generate_orders_lineitem_part(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

@@ -96,7 +96,7 @@ generate_orders_lineitem_part(
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_partsupp(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

@@ -108,7 +108,7 @@ std::unique_ptr<cudf::table> generate_partsupp(
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_supplier(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

@@ -120,7 +120,7 @@ std::unique_ptr<cudf::table> generate_supplier(
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_customer(
cudf::size_type const& scale_factor,
double scale_factor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

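
The schema constants change from constexpr std::array<const char*, N> to const std::vector<std::string>, matching the std::vector<std::string> const& parameter that write_parquet in the new datagen.cpp expects, so the constants can be passed straight through without a conversion at every call site. The cost is that each translation unit including this header builds its own copies of the vectors at static-initialization time, which is usually acceptable for benchmark-only code. A small sketch of the difference in use; use_names is a hypothetical stand-in for such a caller.

#include <array>
#include <string>
#include <vector>

// Hypothetical caller that, like write_parquet in datagen.cpp, takes the
// column names as a std::vector<std::string>.
void use_names(std::vector<std::string> const& col_names) { (void)col_names; }

int main()
{
  // Old form: a fixed array of C strings needs an explicit conversion first.
  constexpr std::array<const char*, 3> region_old{"r_regionkey", "r_name", "r_comment"};
  use_names(std::vector<std::string>(region_old.begin(), region_old.end()));

  // New form: the header's const std::vector<std::string> constants pass directly.
  const std::vector<std::string> region_new = {"r_regionkey", "r_name", "r_comment"};
  use_names(region_new);
  return 0;
}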