From 7cc02e5b1da8f4f0c8697e988572eb44f1354626 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Tue, 19 Mar 2024 14:08:36 -0700
Subject: [PATCH 1/2] Address poor performance of Parquet string decoding
 (#15304)

See #15297. The Parquet string decoder can become a bottleneck in the
presence of strings of widely varying sizes. This PR is an attempt to
address this, at least as a stopgap solution. A more complete solution may
be to rework the string decoder to work in a block-wide fashion, similar to
the new micro-kernels added in #15159.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15304
---
 cpp/src/io/parquet/page_string_decode.cu | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 8bb56c66d0f..d8b1c1cc046 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1045,12 +1045,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     //
     if (!has_repetition) { dst_pos -= s->first_row; }
 
-    // need to do this before we branch on src_pos/dst_pos so we don't deadlock
-    // choose a character parallel string copy when the average string is longer than a warp
-    using cudf::detail::warp_size;
-    auto const use_char_ll =
-      s->page.num_valids > 0 && (s->page.str_bytes / s->page.num_valids) >= warp_size;
-
     if (me < warp_size) {
       for (int i = 0; i < decode_block_size - out_thread0; i += warp_size) {
         dst_pos = sb->nz_idx[rolling_index(src_pos + i)];
@@ -1061,10 +1055,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
                             : cuda::std::pair{nullptr, 0};
 
         __shared__ cub::WarpScan<size_type>::TempStorage temp_storage;
-        size_type offset;
-        cub::WarpScan<size_type>(temp_storage).ExclusiveSum(len, offset);
+        size_type offset, warp_total;
+        cub::WarpScan<size_type>(temp_storage).ExclusiveSum(len, offset, warp_total);
         offset += last_offset;
 
+        // choose a character parallel string copy when the average string is longer than a warp
+        auto const use_char_ll = warp_total / warp_size >= warp_size;
+
         if (use_char_ll) {
           __shared__ __align__(8) uint8_t const* pointers[warp_size];
           __shared__ __align__(4) size_type offsets[warp_size];
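The patch above replaces the page-level heuristic (`str_bytes / num_valids`)
with a per-iteration decision driven by the warp aggregate that
`cub::WarpScan::ExclusiveSum` can return alongside each thread's exclusive
prefix. The following standalone CUDA sketch (not cudf code; the kernel name,
buffers, and test data are illustrative) shows how a single warp can scan
per-thread string lengths and use the returned warp total to choose a
character-parallel copy path:

```cuda
// Standalone sketch (not cudf code): a single warp scans per-thread string
// lengths with cub::WarpScan, and the warp aggregate returned by ExclusiveSum
// drives the "character-parallel copy" decision, mirroring the hunk above.
// Kernel name, buffers, and the test data are illustrative.
#include <cub/cub.cuh>

#include <cstdio>

constexpr int warp_size = 32;

__global__ void scan_string_lengths(int const* lengths, int* offsets, bool* use_char_ll)
{
  using WarpScan = cub::WarpScan<int>;
  __shared__ WarpScan::TempStorage temp_storage;

  int const lane = threadIdx.x;  // launched with exactly one warp
  int const len  = lengths[lane];

  // Exclusive prefix sum of the lengths; warp_total receives the sum over all 32 lanes.
  int offset, warp_total;
  WarpScan(temp_storage).ExclusiveSum(len, offset, warp_total);
  offsets[lane] = offset;

  // Character-parallel copy pays off when the average string spans at least a warp.
  if (lane == 0) { *use_char_ll = (warp_total / warp_size) >= warp_size; }
}

int main()
{
  int h_lengths[warp_size];
  for (int i = 0; i < warp_size; ++i) { h_lengths[i] = 40 + i; }  // average length > 32

  int *d_lengths, *d_offsets;
  bool* d_flag;
  cudaMalloc(&d_lengths, sizeof(h_lengths));
  cudaMalloc(&d_offsets, sizeof(h_lengths));
  cudaMalloc(&d_flag, sizeof(bool));
  cudaMemcpy(d_lengths, h_lengths, sizeof(h_lengths), cudaMemcpyHostToDevice);

  scan_string_lengths<<<1, warp_size>>>(d_lengths, d_offsets, d_flag);

  bool h_flag = false;
  cudaMemcpy(&h_flag, d_flag, sizeof(bool), cudaMemcpyDeviceToHost);
  std::printf("use_char_ll = %d\n", h_flag ? 1 : 0);

  cudaFree(d_lengths);
  cudaFree(d_offsets);
  cudaFree(d_flag);
  return 0;
}
```

Deriving the flag from the scanned lengths means the copy strategy reflects
the strings the warp is actually handling in that pass rather than a single
page-wide average, which is what lets the decoder adapt when string sizes
vary widely within a page.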
From f9ac4277f50163a7da2006460034aa3e45c8744e Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Tue, 19 Mar 2024 18:05:48 -0500
Subject: [PATCH 2/2] Avoid importing dask-expr if "query-planning" config is
 `False` (#15340)

During some offline debugging with @bdice and @divyegala, we discovered that
some cuml tests are somehow failing after `dask_expr` is imported, even if
`dask_expr` is not actually being used. I'd like to figure out exactly what
is causing that problem, but the first thing we can and should do is avoid
the import altogether when the "query-planning" config is set to `False`.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15340
---
 python/dask_cudf/dask_cudf/expr/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py
index c36dd0abcb9..826f514a674 100644
--- a/python/dask_cudf/dask_cudf/expr/__init__.py
+++ b/python/dask_cudf/dask_cudf/expr/__init__.py
@@ -7,12 +7,12 @@
 QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False
 
 # Register custom expressions and collections
-try:
-    import dask_cudf.expr._collection
-    import dask_cudf.expr._expr
+if QUERY_PLANNING_ON:
+    try:
+        import dask_cudf.expr._collection
+        import dask_cudf.expr._expr
 
-except ImportError as err:
-    if QUERY_PLANNING_ON:
+    except ImportError as err:
         # Dask *should* raise an error before this.
         # However, we can still raise here to be certain.
         raise RuntimeError(
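The second patch applies a simple pattern: evaluate the configuration flag
once at import time and only attempt the optional import when the feature is
enabled. Below is a minimal, self-contained sketch of that gating pattern; it
is not the dask_cudf module itself, and the error message wording is
invented.

```python
# Minimal sketch of a config-gated optional import (not the dask_cudf source).
from dask import config

# Query planning is considered "on" unless the config is explicitly False.
QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False

if QUERY_PLANNING_ON:
    try:
        import dask_expr  # noqa: F401
    except ImportError as err:
        # Only complain when the feature that needs dask-expr is actually enabled.
        raise RuntimeError(
            "The 'dataframe.query-planning' config is enabled, "
            "but dask-expr is not installed."  # illustrative message
        ) from err
```

Because the flag is read when the module is first imported, toggling
`dataframe.query-planning` later has no effect until the module is
re-imported; conversely, setting it to `False` before import guarantees that
this module never pulls in `dask_expr`, which is the behavior the patch is
after.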