From 7cc02e5b1da8f4f0c8697e988572eb44f1354626 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Tue, 19 Mar 2024 14:08:36 -0700
Subject: [PATCH 1/2] Address poor performance of Parquet string decoding
 (#15304)

See #15297. The Parquet string decoder can become a bottleneck in the
presence of strings of widely varying sizes. This PR is an attempt to
address this, at least as a stopgap solution. A more complete solution may
be to rework the string decoder to work in a block-wide fashion, similar to
the new micro-kernels added in #15159.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15304
---
 cpp/src/io/parquet/page_string_decode.cu | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 8bb56c66d0f..d8b1c1cc046 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1045,12 +1045,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     //
     if (!has_repetition) { dst_pos -= s->first_row; }
 
-    // need to do this before we branch on src_pos/dst_pos so we don't deadlock
-    // choose a character parallel string copy when the average string is longer than a warp
-    using cudf::detail::warp_size;
-    auto const use_char_ll =
-      s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size;
-
     if (me < warp_size) {
       for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) {
         dst_pos = sb->nz_idx[rolling_index(src_pos + i)];
@@ -1061,10 +1055,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
                             : cuda::std::pair{nullptr, 0};
 
         __shared__ cub::WarpScan<size_type>::TempStorage temp_storage;
-        size_type offset;
-        cub::WarpScan<size_type>(temp_storage).ExclusiveSum(len, offset);
+        size_type offset, warp_total;
+        cub::WarpScan<size_type>(temp_storage).ExclusiveSum(len, offset, warp_total);
         offset += last_offset;
 
+        // choose a character parallel string copy when the average string is longer than a warp
+        auto const use_char_ll = warp_total / warp_size >= warp_size;
+
         if (use_char_ll) {
           __shared__ __align__(8) uint8_t const* pointers[warp_size];
           __shared__ __align__(4) size_type offsets[warp_size];
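The patch above replaces the page-level heuristic (`str_bytes / num_valids`)
with a per-iteration decision driven by the warp aggregate that
`cub::WarpScan::ExclusiveSum` can return alongside each thread's exclusive
prefix. The following standalone CUDA sketch (not cudf code; the kernel name,
buffers, and test data are illustrative) shows how a single warp can scan
per-thread string lengths and use the returned warp total to choose a
character-parallel copy path:

```cuda
// Standalone sketch (not cudf code): a single warp scans per-thread string
// lengths with cub::WarpScan, and the warp aggregate returned by ExclusiveSum
// drives the "character-parallel copy" decision, mirroring the hunk above.
// Kernel name, buffers, and the test data are illustrative.
#include <cub/cub.cuh>

#include <cstdio>

constexpr int warp_size = 32;

__global__ void scan_string_lengths(int const* lengths, int* offsets, bool* use_char_ll)
{
  using WarpScan = cub::WarpScan<int>;
  __shared__ WarpScan::TempStorage temp_storage;

  int const lane = threadIdx.x;  // launched with exactly one warp
  int const len  = lengths[lane];

  // Exclusive prefix sum of the lengths; warp_total receives the sum over all 32 lanes.
  int offset, warp_total;
  WarpScan(temp_storage).ExclusiveSum(len, offset, warp_total);
  offsets[lane] = offset;

  // Character-parallel copy pays off when the average string spans at least a warp.
  if (lane == 0) { *use_char_ll = (warp_total / warp_size) >= warp_size; }
}

int main()
{
  int h_lengths[warp_size];
  for (int i = 0; i < warp_size; ++i) { h_lengths[i] = 40 + i; }  // average length > 32

  int *d_lengths, *d_offsets;
  bool* d_flag;
  cudaMalloc(&d_lengths, sizeof(h_lengths));
  cudaMalloc(&d_offsets, sizeof(h_lengths));
  cudaMalloc(&d_flag, sizeof(bool));
  cudaMemcpy(d_lengths, h_lengths, sizeof(h_lengths), cudaMemcpyHostToDevice);

  scan_string_lengths<<<1, warp_size>>>(d_lengths, d_offsets, d_flag);

  bool h_flag = false;
  cudaMemcpy(&h_flag, d_flag, sizeof(bool), cudaMemcpyDeviceToHost);
  std::printf("use_char_ll = %d\n", h_flag ? 1 : 0);

  cudaFree(d_lengths);
  cudaFree(d_offsets);
  cudaFree(d_flag);
  return 0;
}
```

Deriving the flag from the scanned lengths means the copy strategy reflects
the strings the warp is actually handling in that pass rather than a single
page-wide average, which is what lets the decoder adapt when string sizes
vary widely within a page.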
From f9ac4277f50163a7da2006460034aa3e45c8744e Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Tue, 19 Mar 2024 18:05:48 -0500
Subject: [PATCH 2/2] Avoid importing dask-expr if "query-planning" config is
 `False` (#15340)

During some offline debugging with @bdice and @divyegala, we discovered that
some cuml tests are somehow failing after `dask_expr` is imported, even if
`dask_expr` is not actually being used. I'd like to figure out exactly what
is causing that problem, but the first thing we can and should do is avoid
the import altogether when the "query-planning" config is set to `False`.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15340
---
 python/dask_cudf/dask_cudf/expr/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py
index c36dd0abcb9..826f514a674 100644
--- a/python/dask_cudf/dask_cudf/expr/__init__.py
+++ b/python/dask_cudf/dask_cudf/expr/__init__.py
@@ -7,12 +7,12 @@
 QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False
 
 # Register custom expressions and collections
-try:
-    import dask_cudf.expr._collection
-    import dask_cudf.expr._expr
+if QUERY_PLANNING_ON:
+    try:
+        import dask_cudf.expr._collection
+        import dask_cudf.expr._expr
 
-except ImportError as err:
-    if QUERY_PLANNING_ON:
+    except ImportError as err:
         # Dask *should* raise an error before this.
         # However, we can still raise here to be certain.
         raise RuntimeError(
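The second patch applies a simple pattern: evaluate the configuration flag
once at import time and only attempt the optional import when the feature is
enabled. Below is a minimal, self-contained sketch of that gating pattern; it
is not the dask_cudf module itself, and the error message wording is
invented.

```python
# Minimal sketch of a config-gated optional import (not the dask_cudf source).
from dask import config

# Query planning is considered "on" unless the config is explicitly False.
QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False

if QUERY_PLANNING_ON:
    try:
        import dask_expr  # noqa: F401
    except ImportError as err:
        # Only complain when the feature that needs dask-expr is actually enabled.
        raise RuntimeError(
            "The 'dataframe.query-planning' config is enabled, "
            "but dask-expr is not installed."  # illustrative message
        ) from err
```

Because the flag is read when the module is first imported, toggling
`dataframe.query-planning` later has no effect until the module is
re-imported; conversely, setting it to `False` before import guarantees that
this module never pulls in `dask_expr`, which is the behavior the patch is
after.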