Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.10' into simplify-remote-io
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora committed Aug 13, 2024
2 parents 00c47fa + 3a791cb commit 491c140
Show file tree
Hide file tree
Showing 194 changed files with 4,326 additions and 1,431 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ Debug
build/
cpp/build/
cpp/examples/*/install/
cpp/examples/*/build/
cpp/examples/tpch/datagen/datafusion
cpp/include/cudf/ipc_generated/*.h
cpp/thirdparty/googletest/

Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ repos:
- id: ruff-format
files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.2.0
rev: v0.3.1
hooks:
- id: verify-copyright
exclude: |
Expand Down
376 changes: 376 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}

#Get <major>.<minor> for next version
# Get <major>.<minor> for next version
NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"

# Need to distutils-normalize the versions for some use cases
CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))")
NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))")
echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}"

echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

Expand Down Expand Up @@ -61,7 +59,7 @@ for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
done
for FILE in python/*/pyproject.toml; do
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
done
done

Expand All @@ -77,7 +75,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_
# CI files
for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
done
sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh

Expand Down
3 changes: 0 additions & 3 deletions ci/run_cudf_memcheck_ctests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ export LIBCUDF_MEMCHECK_ENABLED=1
# Run every gtest binary in the current directory under compute-sanitizer's
# memcheck tool, forwarding any extra script arguments to each test.
for gt in ./*_TEST ; do
  # Quote ${gt}: unquoted expansion is subject to word splitting/globbing (SC2086).
  test_name=$(basename "${gt}")
  # Run gtests with compute-sanitizer
  # NOTE(review): these two appear to be deliberately excluded from memcheck —
  # presumably incompatible with compute-sanitizer; confirm before changing.
  if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
    continue
  fi
  echo "Running compute-sanitizer on $test_name"
  compute-sanitizer --tool memcheck "${gt}" "$@"
done
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- dlpack>=0.8,<1.0
- doxygen=1.9.1
- fastavro>=0.22.9
- flatbuffers==24.3.25
- fmt>=10.1.1,<11
- fsspec>=0.6.0
- gcc_linux-64=11.*
Expand Down Expand Up @@ -81,7 +82,7 @@ dependencies:
- rich
- rmm==24.10.*,>=0.0.0a0
- s3fs>=2022.3.0
- scikit-build-core>=0.7.0
- scikit-build-core>=0.10.0
- scipy
- spdlog>=1.12.0,<1.13
- sphinx
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies:
- dlpack>=0.8,<1.0
- doxygen=1.9.1
- fastavro>=0.22.9
- flatbuffers==24.3.25
- fmt>=10.1.1,<11
- fsspec>=0.6.0
- gcc_linux-64=11.*
Expand Down Expand Up @@ -79,7 +80,7 @@ dependencies:
- rich
- rmm==24.10.*,>=0.0.0a0
- s3fs>=2022.3.0
- scikit-build-core>=0.7.0
- scikit-build-core>=0.10.0
- scipy
- spdlog>=1.12.0,<1.13
- sphinx
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ requirements:
- python
- cython >=3.0.3
- rapids-build-backend >=0.3.0,<0.4.0.dev0
- scikit-build-core >=0.7.0
- scikit-build-core >=0.10.0
- dlpack >=0.8,<1.0
# TODO: Change to `2.0` for NumPy 2
- numpy 1.23
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf_kafka/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ requirements:
- cudf ={{ version }}
- libcudf_kafka ={{ version }}
- rapids-build-backend >=0.3.0,<0.4.0.dev0
- scikit-build-core >=0.7.0
- scikit-build-core >=0.10.0
{% if cuda_major != "11" %}
- cuda-cudart-dev
{% endif %}
Expand Down
3 changes: 3 additions & 0 deletions conda/recipes/libcudf/conda_build_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ librdkafka_version:
fmt_version:
- ">=10.1.1,<11"

flatbuffers_version:
- "=24.3.25"

spdlog_version:
- ">=1.12.0,<1.13"

Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ requirements:
- dlpack {{ dlpack_version }}
- librdkafka {{ librdkafka_version }}
- fmt {{ fmt_version }}
- flatbuffers {{ flatbuffers_version }}
- spdlog {{ spdlog_version }}
- zlib {{ zlib_version }}

Expand Down
1 change: 0 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,6 @@ add_library(
src/io/csv/reader_impl.cu
src/io/csv/writer_impl.cu
src/io/functions.cpp
src/io/json/byte_range_info.cu
src/io/json/json_column.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
Expand Down
6 changes: 6 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ ConfigureNVBench(
stream_compaction/distinct.cpp
stream_compaction/distinct_count.cpp
stream_compaction/stable_distinct.cpp
stream_compaction/stream_compaction_common.cpp
stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)
Expand Down Expand Up @@ -353,6 +354,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader
ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)

# ##################################################################################################
# * multi buffer memset benchmark ----------------------------------------------------------------
ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp)

# ##################################################################################################
# * io benchmark ---------------------------------------------------------------------
ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
Expand Down
101 changes: 101 additions & 0 deletions cpp/benchmarks/io/utilities/batched_memset_bench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/io/parquet.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr size_t data_size = 512 << 20;  // 512 MiB

/**
 * @brief Time reading back the parquet data held in `source_sink` and report results.
 *
 * Runs cudf::io::read_parquet() inside the nvbench timing loop, then attaches
 * throughput, peak memory usage, and encoded file size summaries to `state`.
 *
 * @param num_rows_to_read Expected row count of the decoded table
 * @param num_cols_to_read Expected column count of the decoded table
 * @param source_sink Source/sink pair already populated with parquet data by the caller
 * @param state nvbench state used for timing and reporting
 */
void parquet_read_common(cudf::size_type num_rows_to_read,
                         cudf::size_type num_cols_to_read,
                         cuio_source_sink_pair& source_sink,
                         nvbench::state& state)
{
  cudf::io::parquet_reader_options const read_opts =
    cudf::io::parquet_reader_options::builder(source_sink.make_source_info());

  // Construct before exec() so the logger observes allocations made during the reads.
  auto memory_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
             [&](nvbench::launch& launch, auto& timer) {
               try_drop_l3_cache();  // start every sample from a cold cache

               timer.start();
               auto const result = cudf::io::read_parquet(read_opts);
               timer.stop();

               CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read,
                            "Unexpected number of columns");
               CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read,
                            "Unexpected number of rows");
             });

  auto const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(data_size) / mean_gpu_time, "bytes_per_second");
  state.add_buffer_size(
    memory_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

/**
 * @brief Benchmark entry point: write a random table to parquet, then time reading it back.
 *
 * Axis values (num_cols, cardinality, run_length, io_type) are pulled from the nvbench
 * state; the data type is supplied as a compile-time enum. Compression is disabled so the
 * read path (which exercises the batched memset) dominates.
 */
template <data_type DataType>
void bench_batched_memset(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
  auto const num_cols    = static_cast<cudf::size_type>(state.get_int64("num_cols"));
  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
  auto const dtypes      = get_type_or_group(static_cast<int32_t>(DataType));

  cuio_source_sink_pair source_sink(retrieve_io_type_enum(state.get_string("io_type")));

  auto const table =
    create_random_table(cycle_dtypes(dtypes, num_cols),
                        table_size_bytes{data_size},
                        data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
  auto const input_view = table->view();

  cudf::io::parquet_writer_options const writer_opts =
    cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), input_view)
      .compression(cudf::io::compression_type::NONE);
  cudf::io::write_parquet(writer_opts);

  parquet_read_common(input_view.num_rows(), num_cols, source_sink, state);
}

// Data-type groups the benchmark is instantiated over (one benchmark per enum value).
using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::DECIMAL,
data_type::TIMESTAMP,
data_type::DURATION,
data_type::STRING,
data_type::LIST,
data_type::STRUCT>;

// Register the benchmark: axes are the data type (compile-time), column count,
// I/O sink type, cardinality and average run length of the generated data.
NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list))
.set_name("batched_memset")
.set_type_axes_names({"data_type"})
.add_int64_axis("num_cols", {1000})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});
77 changes: 0 additions & 77 deletions cpp/benchmarks/iterator/iterator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/pair.h>
#include <thrust/reduce.h>

#include <random>
Expand Down Expand Up @@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state)
sizeof(TypeParam));
}

// operator+ defined for pair iterator reduction
// Accumulates (value, validity) pairs: the value term adds each value only when its
// validity flag is set (value * bool), and the validity term is the integer sum of the
// two flags narrowed back to bool — i.e. true if either operand was valid.
template <typename T>
__device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pair<T, bool> rhs)
{
  return thrust::pair<T, bool>{lhs.first * lhs.second + rhs.first * rhs.second,
                               lhs.second + rhs.second};
}
// -----------------------------------------------------------------------------
// Reduce `col` through its (value, validity) pair iterator with cub, writing the
// single reduced pair into `result` (via the reduce_by_cub helper defined elsewhere
// in this file). `has_null` selects whether the iterator consults the validity mask.
template <typename T, bool has_null>
void pair_iterator_bench_cub(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
// Identity element for the masked-sum reduction: value 0, flagged invalid.
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
int num_items = col.size();
auto begin = d_col->pair_begin<T, has_null>();
reduce_by_cub(result.begin(), begin, num_items, init);
}

// Reduce `col` through its (value, validity) pair iterator with thrust::reduce.
// NOTE(review): unlike the cub variant, `result` is never written and the value
// returned by thrust::reduce is discarded — the call exists only to time the
// device-side reduction; confirm this asymmetry is intentional.
template <typename T, bool has_null>
void pair_iterator_bench_thrust(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
// Identity element for the masked-sum reduction: value 0, flagged invalid.
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
auto d_in = d_col->pair_begin<T, has_null>();
auto d_end = d_in + col.size();
thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{});
}

/**
 * @brief Benchmark a masked reduction driven by the column pair iterator
 * (value, validity) over a column containing nulls.
 *
 * @tparam TypeParam Element type of the benchmarked column
 * @tparam cub_or_thrust true selects the cub reduction path, false the thrust path
 *
 * Fixes relative to the previous version: the no-null column (`wrap_hasnull_F`/
 * `hasnull_F`) was constructed but never used, and both calls passed the null-masked
 * column with `has_null=false`, contradicting the "with nulls" intent — the iterator
 * now actually consults the validity mask.
 */
template <class TypeParam, bool cub_or_thrust>
void BM_pair_iterator(benchmark::State& state)
{
  cudf::size_type const column_size{static_cast<cudf::size_type>(state.range(0))};
  using T      = TypeParam;
  auto num_gen = thrust::counting_iterator<cudf::size_type>(0);
  // Every even row is flagged valid, odd rows are null.
  auto null_gen =
    thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; });

  // Column with a validity mask attached.
  cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_T(
    num_gen, num_gen + column_size, null_gen);
  cudf::column_view hasnull_T = wrap_hasnull_T;

  // Initialize dev_result to false
  auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(
    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
  for (auto _ : state) {
    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
    if (cub_or_thrust) {
      pair_iterator_bench_cub<T, true>(hasnull_T,
                                       dev_result);  // driven by pair iterator with nulls
    } else {
      pair_iterator_bench_thrust<T, true>(hasnull_T,
                                          dev_result);  // driven by pair iterator with nulls
    }
  }
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
                          sizeof(TypeParam));
}

#define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
Expand All @@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true);
ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false);
ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true);
ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false);

// Registers a Google Benchmark fixture test for BM_pair_iterator: column sizes range
// from 1e3 to 1e7 in x10 steps, manually timed (cuda_event_timer), reported in ms.
#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
BM_pair_iterator<type, cub_or_thrust>(state); \
} \
BENCHMARK_REGISTER_F(Iterator, name) \
->RangeMultiplier(10) \
->Range(1000, 10000000) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Instantiate the cub-driven and thrust-driven variants for double columns.
PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true);
PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false);
Loading

0 comments on commit 491c140

Please sign in to comment.