Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into cudf/_lib/timezone
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Nov 21, 2024
2 parents 120711b + 78db66b commit 71b1592
Show file tree
Hide file tree
Showing 22 changed files with 571 additions and 208 deletions.
3 changes: 3 additions & 0 deletions ci/run_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,11 @@ if [[ $(arch) == "aarch64" ]]; then
DESELECTED_TESTS+=("tests/unit/operations/test_join.py::test_join_4_columns_with_validity")
else
# Ensure that we don't run dbgen when it uses newer symbols than supported by the glibc version in the CI image.
# Allow errors since any of these commands could produce empty results that would cause the script to fail.
set +e
glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
latest_glibc_symbol_found=$(nm py-polars/tests/benchmark/data/pdsh/dbgen/dbgen | grep GLIBC | grep -o "[0-9]\.[0-9]\+" | sort --version-sort | tail -1 | cut -d "." -f 2)
set -e
if [[ ${glibc_minor_version} -lt ${latest_glibc_symbol_found} ]]; then
DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh")
fi
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ dependencies:
- ptxcompiler
- pyarrow>=14.0.0,<19.0.0a0
- pydata-sphinx-theme!=0.14.2
- pynvml>=11.4.1,<12.0.0a0
- pytest-benchmark
- pytest-cases>=3.8.2
- pytest-cov
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ dependencies:
- pyarrow>=14.0.0,<19.0.0a0
- pydata-sphinx-theme!=0.14.2
- pynvjitlink>=0.0.0a0
- pynvml>=11.4.1,<12.0.0a0
- pytest-benchmark
- pytest-cases>=3.8.2
- pytest-cov
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/dask-cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ requirements:
run:
- python
- cudf ={{ version }}
- pynvml >=11.4.1,<12.0.0a0
- rapids-dask-dependency ={{ minor_version }}
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}

Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ ConfigureNVBench(

# ##################################################################################################
# * strings benchmark -------------------------------------------------------------------
ConfigureBench(STRINGS_BENCH string/factory.cu string/repeat_strings.cpp)
ConfigureBench(STRINGS_BENCH string/factory.cu)

ConfigureNVBench(
STRINGS_NVBENCH
Expand All @@ -384,6 +384,7 @@ ConfigureNVBench(
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/repeat_strings.cpp
string/replace.cpp
string/replace_re.cpp
string/reverse.cpp
Expand Down
123 changes: 41 additions & 82 deletions cpp/benchmarks/string/repeat_strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,99 +14,58 @@
* limitations under the License.
*/

#include "string_bench_args.hpp"

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

static constexpr cudf::size_type default_repeat_times = 16;
static constexpr cudf::size_type min_repeat_times = -16;
static constexpr cudf::size_type max_repeat_times = 16;
#include <nvbench/nvbench.cuh>

static std::unique_ptr<cudf::table> create_data_table(cudf::size_type n_cols,
cudf::size_type n_rows,
cudf::size_type max_str_length)
static void bench_repeat(nvbench::state& state)
{
CUDF_EXPECTS(n_cols == 1 || n_cols == 2, "Invalid number of columns.");
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const min_repeat = static_cast<cudf::size_type>(state.get_int64("min_repeat"));
auto const max_repeat = static_cast<cudf::size_type>(state.get_int64("max_repeat"));
auto const api = state.get_string("api");

std::vector<cudf::type_id> dtype_ids{cudf::type_id::STRING};
auto builder = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);

if (n_cols == 2) {
dtype_ids.push_back(cudf::type_id::INT32);
builder.distribution(
cudf::type_id::INT32, distribution_id::NORMAL, min_repeat_times, max_repeat_times);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
builder.distribution(cudf::type_id::INT32, distribution_id::NORMAL, min_repeat, max_repeat);

auto const table = create_random_table(
{cudf::type_id::STRING, cudf::type_id::INT32}, row_count{num_rows}, data_profile{builder});
auto const input = cudf::strings_column_view(table->view().column(0));

auto stream = cudf::get_default_stream();
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto chars_size = input.chars_size(stream);
state.add_global_memory_reads<nvbench::int8_t>(chars_size);

if (api == "scalar") {
state.add_global_memory_writes<nvbench::int8_t>(chars_size * max_repeat);
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::repeat_strings(input, max_repeat); });
} else if (api == "column") {
auto repeats = table->view().column(1);
{
auto result = cudf::strings::repeat_strings(input, repeats);
auto output = cudf::strings_column_view(result->view());
state.add_global_memory_writes<nvbench::int8_t>(output.chars_size(stream));
}
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { cudf::strings::repeat_strings(input, repeats); });
}

return create_random_table(dtype_ids, row_count{n_rows}, data_profile{builder});
}

// Benchmark repeating every string in a generated column a fixed (scalar)
// number of times via cudf::strings::repeat_strings.
static void BM_repeat_strings_scalar_times(benchmark::State& state)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
  auto const max_length = static_cast<cudf::size_type>(state.range(1));

  auto const input_table = create_data_table(1, num_rows, max_length);
  auto const input       = cudf::strings_column_view(input_table->view().column(0));

  for ([[maybe_unused]] auto _ : state) {
    [[maybe_unused]] cuda_event_timer timer(state, true, cudf::get_default_stream());
    cudf::strings::repeat_strings(input, default_repeat_times);
  }

  // Throughput accounting: total input characters touched across all iterations.
  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
}

// Benchmark repeating each string a per-row number of times taken from a
// companion INT32 column (column overload of repeat_strings).
static void BM_repeat_strings_column_times(benchmark::State& state)
{
  auto const num_rows   = static_cast<cudf::size_type>(state.range(0));
  auto const max_length = static_cast<cudf::size_type>(state.range(1));

  auto const input_table = create_data_table(2, num_rows, max_length);
  auto const input       = cudf::strings_column_view(input_table->view().column(0));
  auto const repeats     = input_table->view().column(1);

  for ([[maybe_unused]] auto _ : state) {
    [[maybe_unused]] cuda_event_timer timer(state, true, cudf::get_default_stream());
    cudf::strings::repeat_strings(input, repeats);
  }

  // Bytes processed = input characters plus the per-row repeat counts read.
  state.SetBytesProcessed(state.iterations() *
                          (input.chars_size(cudf::get_default_stream()) +
                           repeats.size() * sizeof(int32_t)));
}

// Build the (row count, string length) argument grid shared by both
// repeat_strings benchmarks.
static void generate_bench_args(benchmark::internal::Benchmark* b)
{
  constexpr int rows_min   = 1 << 8;
  constexpr int rows_max   = 1 << 18;
  constexpr int rows_step  = 4;
  constexpr int len_min    = 1 << 4;
  constexpr int len_max    = 1 << 8;
  constexpr int len_step   = 4;
  generate_string_bench_args(b, rows_min, rows_max, rows_step, len_min, len_max, len_step);
}

// Google-benchmark fixture shared by the repeat_strings benchmark variants.
class RepeatStrings : public cudf::benchmark {};

// Registers a manual-time, millisecond-unit benchmark exercising the
// scalar repeat-count overload over the standard argument grid.
#define REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(RepeatStrings, name) \
(::benchmark::State & st) { BM_repeat_strings_scalar_times(st); } \
BENCHMARK_REGISTER_F(RepeatStrings, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Registers the same benchmark shape for the per-row (column) repeat-count
// overload.
#define REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(name) \
BENCHMARK_DEFINE_F(RepeatStrings, name) \
(::benchmark::State & st) { BM_repeat_strings_column_times(st); } \
BENCHMARK_REGISTER_F(RepeatStrings, name) \
->Apply(generate_bench_args) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

REPEAT_STRINGS_SCALAR_TIMES_BENCHMARK_DEFINE(scalar_times)
REPEAT_STRINGS_COLUMN_TIMES_BENCHMARK_DEFINE(column_times)
// Register the nvbench axes: generated string width range, repeat-count
// range, row counts, and which repeat_strings overload to exercise
// ("scalar" fixed count vs "column" per-row counts).
NVBENCH_BENCH(bench_repeat)
  .set_name("repeat")
  .add_int64_axis("min_width", {0})
  .add_int64_axis("max_width", {32, 64, 128, 256})
  .add_int64_axis("min_repeat", {0})
  .add_int64_axis("max_repeat", {16})
  .add_int64_axis("num_rows", {32768, 262144, 2097152})
  .add_string_axis("api", {"scalar", "column"});
4 changes: 4 additions & 0 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,14 @@ namespace CUDF_EXPORT cudf {
* @throw cudf::logic_error if the any of the DLTensor fields are unsupported
*
* @param managed_tensor a 1D or 2D column-major (Fortran order) tensor
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
*
* @return Table with a copy of the tensor data
*/
std::unique_ptr<table> from_dlpack(
DLManagedTensor const* managed_tensor,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
Expand All @@ -79,12 +81,14 @@ std::unique_ptr<table> from_dlpack(
* or if any of columns have non-zero null count
*
* @param input Table to convert to DLPack
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned DLPack tensor's device memory
*
* @return 1D or 2D DLPack tensor with a copy of the table data, or nullptr
*/
DLManagedTensor* to_dlpack(
table_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */ // end of group
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1362,7 +1362,7 @@ table_with_metadata read_csv(
*/

/**
*@brief Builder to build options for `writer_csv()`.
*@brief Builder to build options for `write_csv()`.
*/
class csv_writer_options_builder;

Expand Down
9 changes: 6 additions & 3 deletions cpp/src/interop/dlpack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,16 +297,19 @@ DLManagedTensor* to_dlpack(table_view const& input,
} // namespace detail

std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::from_dlpack(managed_tensor, cudf::get_default_stream(), mr);
return detail::from_dlpack(managed_tensor, stream, mr);
}

DLManagedTensor* to_dlpack(table_view const& input, rmm::device_async_resource_ref mr)
DLManagedTensor* to_dlpack(table_view const& input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::to_dlpack(input, cudf::get_default_stream(), mr);
return detail::to_dlpack(input, stream, mr);
}

} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,7 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes
ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_INTEROP streams/interop_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
Expand Down
46 changes: 46 additions & 0 deletions cpp/tests/streams/interop_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/interop.hpp>
#include <cudf/table/table_view.hpp>

#include <dlpack/dlpack.h>

// unique_ptr deleter that releases a DLPack tensor through the tensor's own
// deleter callback, as required by the DLPack ownership contract.
struct dlpack_deleter {
  void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); }
};

// Test fixture for the stream-aware DLPack interop APIs.
struct DLPackTest : public cudf::test::BaseFixture {};

// Verify cudf::to_dlpack accepts an explicit stream argument; an empty table
// keeps the call trivial (the stream-testing mode checks stream usage).
TEST_F(DLPackTest, ToDLPack)
{
  cudf::table_view empty(std::vector<cudf::column_view>{});
  cudf::to_dlpack(empty, cudf::test::get_default_stream());
}

// Round-trip an empty two-column table through DLPack, passing an explicit
// stream to both to_dlpack and from_dlpack. The tensor is owned via
// unique_ptr + dlpack_deleter so it is released even if from_dlpack throws.
TEST_F(DLPackTest, FromDLPack)
{
  using unique_managed_tensor = std::unique_ptr<DLManagedTensor, dlpack_deleter>;
  // Empty INT32 columns: exercises the API surface without data transfer.
  cudf::test::fixed_width_column_wrapper<int32_t> col1({});
  cudf::test::fixed_width_column_wrapper<int32_t> col2({});
  cudf::table_view input({col1, col2});
  unique_managed_tensor tensor(cudf::to_dlpack(input, cudf::test::get_default_stream()));
  auto result = cudf::from_dlpack(tensor.get(), cudf::test::get_default_stream());
}
1 change: 1 addition & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- pynvml>=11.4.1,<12.0.0a0
- rapids-dask-dependency==25.2.*,>=0.0.0a0
run_custreamz:
common:
Expand Down
Loading

0 comments on commit 71b1592

Please sign in to comment.