Commit
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann authored Sep 18, 2024
2 parents ccfc6f6 + 2a9a8f5 commit 8fbb1d0
Showing 90 changed files with 3,913 additions and 1,252 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr.yaml
@@ -52,7 +52,7 @@ jobs:
steps:
- name: Get PR info
id: get-pr-info
uses: rapidsai/shared-actions/get-pr-info@main
uses: nv-gha-runners/get-pr-info@main
- name: Checkout code repo
uses: actions/checkout@v4
with:
1 change: 0 additions & 1 deletion .gitignore
@@ -80,7 +80,6 @@ build/
cpp/build/
cpp/examples/*/install/
cpp/examples/*/build/
cpp/examples/tpch/datagen/datafusion
cpp/include/cudf/ipc_generated/*.h
cpp/thirdparty/googletest/

2 changes: 1 addition & 1 deletion README.md
@@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

### Conda

cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

```bash
conda install -c rapidsai -c conda-forge -c nvidia \
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -385,6 +385,7 @@ add_library(
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
src/io/json/parser_features.cpp
src/io/json/process_tokens.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
28 changes: 18 additions & 10 deletions cpp/benchmarks/CMakeLists.txt
@@ -36,25 +36,25 @@ target_include_directories(
)

add_library(
tpch_data_generator STATIC
common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
common/tpch_data_generator/random_column_generator.cu
ndsh_data_generator STATIC
common/ndsh_data_generator/ndsh_data_generator.cpp common/ndsh_data_generator/table_helpers.cpp
common/ndsh_data_generator/random_column_generator.cu
)
target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)
target_compile_features(ndsh_data_generator PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(
tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
ndsh_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
)

target_link_libraries(
tpch_data_generator
ndsh_data_generator
PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)

target_include_directories(
tpch_data_generator
ndsh_data_generator
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
)
@@ -127,8 +127,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::nvbench
$<TARGET_NAME_IF_EXISTS:conda_env>
${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen
nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env>
)
install(
TARGETS ${CMAKE_BENCH_NAME}
@@ -175,6 +175,14 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp)
# * transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)

# ##################################################################################################
# * nds-h benchmark --------------------------------------------------------------------------------
ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp)
ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp)

# ##################################################################################################
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
@@ -329,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

ConfigureNVBench(
TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
)

# ##################################################################################################
@@ -14,7 +14,7 @@
* limitations under the License.
*/

#include "tpch_data_generator.hpp"
#include "ndsh_data_generator.hpp"

#include "random_column_generator.hpp"
#include "table_helpers.hpp"
@@ -435,46 +435,37 @@ std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& o
columns.push_back(std::move(l_quantity));
columns.push_back(std::move(l_discount));
columns.push_back(std::move(l_tax));
columns.push_back(std::move(l_returnflag));
columns.push_back(std::move(l_linestatus));
columns.push_back(std::move(l_shipdate_ts));
columns.push_back(std::move(l_commitdate_ts));
columns.push_back(std::move(l_receiptdate_ts));
columns.push_back(std::move(l_returnflag));
columns.push_back(std::move(l_linestatus));
columns.push_back(std::move(l_shipinstruct));
columns.push_back(std::move(l_shipmode));
columns.push_back(std::move(l_comment));
return std::make_unique<cudf::table>(std::move(columns));
}

std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem,
/**
* @brief Generate the part of the `orders` table dependent on the `lineitem` table
*
* @param lineitem_partial The partially generated `lineitem` table
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*/
std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem_partial,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
auto const l_linestatus_mask = lineitem.column(0);
auto const l_orderkey = lineitem.column(1);
auto const l_discount = lineitem.column(6);
auto const l_tax = lineitem.column(7);
auto const l_extendedprice = lineitem.column(16);
auto const l_linestatus_mask = lineitem_partial.column(0);
auto const l_orderkey = lineitem_partial.column(1);
auto const l_extendedprice = lineitem_partial.column(6);
auto const l_discount = lineitem_partial.column(7);
auto const l_tax = lineitem_partial.column(8);

std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns;

// Generate the `o_totalprice` column
// We calculate the `charge` column, which is a function of `l_extendedprice`,
// `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
auto o_totalprice = [&]() {
auto const keys = cudf::table_view({l_orderkey});
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.push_back(cudf::groupby::aggregation_request());
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
requests[0].values = l_charge->view();
auto agg_result = gb.aggregate(requests);
return cudf::round(agg_result.second[0].results[0]->view(), 2);
}();
orders_dependent_columns.push_back(std::move(o_totalprice));

// Generate the `o_orderstatus` column
auto o_orderstatus = [&]() {
auto const keys = cudf::table_view({l_orderkey});
@@ -529,6 +520,22 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l
cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view());
}();
orders_dependent_columns.push_back(std::move(o_orderstatus));

// Generate the `o_totalprice` column
// We calculate the `charge` column, which is a function of `l_extendedprice`,
// `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
auto o_totalprice = [&]() {
auto const keys = cudf::table_view({l_orderkey});
cudf::groupby::groupby gb(keys);
std::vector<cudf::groupby::aggregation_request> requests;
requests.push_back(cudf::groupby::aggregation_request());
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
requests[0].values = l_charge->view();
auto agg_result = gb.aggregate(requests);
return cudf::round(agg_result.second[0].results[0]->view(), 2);
}();
orders_dependent_columns.push_back(std::move(o_totalprice));
return std::make_unique<cudf::table>(std::move(orders_dependent_columns));
}
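The comment above describes `charge` only as a function of `l_extendedprice`, `l_tax`, and `l_discount`. In the TPC-H/NDS-H Q1 definition that this generator mirrors, the per-row term is `extendedprice * (1 - discount) * (1 + tax)`; a minimal scalar sketch of what `calculate_charge` presumably computes column-wise (an inference from the benchmark's origin, not code shown in this diff):

```cpp
// Hypothetical scalar form of the Q1 "charge" term; the real
// calculate_charge() in this generator operates on whole cudf columns.
double charge(double extendedprice, double discount, double tax)
{
  return extendedprice * (1.0 - discount) * (1.0 + tax);
}
```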

@@ -730,9 +737,7 @@ generate_orders_lineitem_part(double scale_factor,
// Generate the `part` table
auto part = generate_part(scale_factor, stream, mr);

// Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column,
// add the column to the `lineitem` table, and write the `lineitem` table to a parquet file

// Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column
auto l_extendedprice = [&]() {
auto const left = cudf::table_view(
{lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()});
@@ -752,8 +757,9 @@ generate_orders_lineitem_part(double scale_factor,
return cudf::round(col->view(), 2);
}();

// Insert the `l_extendedprice` column into the partial columns of the `lineitem` table
auto lineitem_partial_columns = lineitem_partial->release();
lineitem_partial_columns.push_back(std::move(l_extendedprice));
lineitem_partial_columns.insert(lineitem_partial_columns.begin() + 6, std::move(l_extendedprice));
auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns));

// Generate the dependent columns of the `orders` table
Expand All @@ -762,7 +768,7 @@ generate_orders_lineitem_part(double scale_factor,

auto orders_independent_columns = orders_independent->release();
auto orders_dependent_columns = orders_dependent->release();
orders_independent_columns.insert(orders_independent_columns.end(),
orders_independent_columns.insert(orders_independent_columns.begin() + 2,
std::make_move_iterator(orders_dependent_columns.begin()),
std::make_move_iterator(orders_dependent_columns.end()));

14 changes: 7 additions & 7 deletions cpp/benchmarks/hashing/hash.cpp
@@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state)
state.add_global_memory_reads<nvbench::int64_t>(num_rows);
// add memory read from bitmasks
if (!no_nulls) {
state.add_global_memory_reads<nvbench::int8_t>(2 *
state.add_global_memory_reads<nvbench::int8_t>(2L *
cudf::bitmask_allocation_size_bytes(num_rows));
}
// memory written depends on used hash
@@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state)
});
} else if (hash_name == "md5") {
// md5 creates a 32-byte string
state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); });
} else if (hash_name == "sha1") {
// sha1 creates a 40-byte string
state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); });
} else if (hash_name == "sha224") {
// sha224 creates a 56-byte string
state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); });
} else if (hash_name == "sha256") {
// sha256 creates a 64-byte string
state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); });
} else if (hash_name == "sha384") {
// sha384 creates a 96-byte string
state.add_global_memory_writes<nvbench::int8_t>(96 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); });
} else if (hash_name == "sha512") {
// sha512 creates a 128-byte string
state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows);
state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); });
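The widened literals in this hunk (for example `32 * num_rows` becoming `32L * num_rows`) read as a guard against 32-bit overflow when `num_rows` is large; a minimal sketch of the hazard, assuming `num_rows` has cudf's 32-bit `size_type` (an illustration, not code from this commit):

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  // Hypothetical row count; cudf::size_type is a 32-bit signed int.
  std::int32_t num_rows = 100'000'000;

  // 128 * num_rows would multiply in 32-bit int and overflow (undefined
  // behaviour), since 12'800'000'000 exceeds INT32_MAX (about 2.1e9).
  // Promoting one operand to long, as the benchmark now does with 128L,
  // performs the multiplication in 64 bits on the LP64 Linux targets
  // cuDF builds for:
  std::int64_t bytes_written = 128L * num_rows;  // 12'800'000'000

  std::cout << bytes_written << " bytes written\n";
  return 0;
}
```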
87 changes: 85 additions & 2 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64;
void parquet_read_common(cudf::size_type num_rows_to_read,
cudf::size_type num_cols_to_read,
cuio_source_sink_pair& source_sink,
nvbench::state& state)
nvbench::state& state,
size_t table_data_size = data_size)
{
cudf::io::parquet_reader_options read_opts =
cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
@@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
@@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::e
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <data_type DataType>
void BM_parquet_read_wide_tables(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>> type_list)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));

auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols"));
auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = io_type::DEVICE_BUFFER;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, n_col),
table_size_bytes{data_size_bytes},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(cudf::io::compression_type::NONE);
cudf::io::write_parquet(write_opts);
return view.num_rows();
}();

parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
}

void BM_parquet_read_wide_tables_mixed(nvbench::state& state)
{
auto const d_type = []() {
auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL));
auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT));
d_type1.reserve(d_type1.size() + d_type2.size());
std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1));
return d_type1;
}();

auto const n_col = static_cast<cudf::size_type>(state.get_int64("num_cols"));
auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = io_type::DEVICE_BUFFER;
cuio_source_sink_pair source_sink(source_type);

auto const num_rows_written = [&]() {
auto const tbl = create_random_table(
cycle_dtypes(d_type, n_col),
table_size_bytes{data_size_bytes},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(cudf::io::compression_type::NONE);
cudf::io::write_parquet(write_opts);
return view.num_rows();
}();

parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
}

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::DECIMAL,
@@ -272,6 +337,24 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("num_string_cols", {1, 2, 3});

using d_type_list_wide_table = nvbench::enum_type_list<data_type::DECIMAL, data_type::STRING>;
NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table))
.set_name("parquet_read_wide_tables")
.set_min_samples(4)
.set_type_axes_names({"data_type"})
.add_int64_axis("data_size_mb", {1024, 2048, 4096})
.add_int64_axis("num_cols", {256, 512, 1024})
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed)
.set_name("parquet_read_wide_tables_mixed")
.set_min_samples(4)
.add_int64_axis("data_size_mb", {1024, 2048, 4096})
.add_int64_axis("num_cols", {256, 512, 1024})
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

// a benchmark for structs that only contain fixed-width types
using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))