diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 8190b5d0297..315a389339a 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1"
 
 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
 ENV HISTFILE="/home/coder/.cache/._bash_history"
 ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"
diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml
new file mode 100644
index 00000000000..1bf4ac08b69
--- /dev/null
+++ b/.github/workflows/auto-assign.yml
@@ -0,0 +1,17 @@
+# Assigns each pull request to the user whose action triggered the workflow run.
+name: "Auto Assign PR"
+
+on:
+  pull_request_target:
+    # Runs when a PR is opened, reopened, or receives new commits.
+    types: [opened, reopened, synchronize]
+
+jobs:
+  add_assignees:
+    runs-on: ubuntu-latest
+    steps:
+      # Adds the triggering user (github.actor) to the pull request's assignees.
+      - uses: actions-ecosystem/action-add-assignees@v1
+        with:
+          github_token: "${{ secrets.GITHUB_TOKEN }}"
+          assignees: "${{ github.actor }}"
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 31e78f82a62..f5cb71bfc14 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,4 +1,5 @@
 name: "Pull Request Labeler"
+
 on:
 - pull_request_target
 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 174dc72bf02..f5234f58efe 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,17 +16,6 @@ repos:
             ^cpp/cmake/thirdparty/patches/.*|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-        # Use the config file specific to each subproject so that each
-        # project can specify its own first/third-party packages.
-        args: ["--config-root=python/", "--resolve-all-configs"]
-        files: python/.*
-        exclude: |
-          (?x)^(^python/cudf_polars/.*)
-        types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.2
     hooks:
@@ -150,6 +139,7 @@ repos:
     rev: v0.4.8
     hooks:
       - id: ruff
+        args: ["--fix"]
         files: python/.*$
       - id: ruff-format
         files: python/.*$
@@ -165,7 +155,7 @@ repos:
           )
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9cdde7c2b7..3db1ed35294 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -38,6 +38,7 @@ conduct. More information can be found at:
 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks).
    Fix if needed.
 9. Wait for other developers to review your code and update code as needed.
+   Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging.
 10. Once reviewed and approved, a RAPIDS developer will merge your pull request.
 
 If you are unsure about anything, don't hesitate to comment on issues and ask for clarification!
@@ -293,8 +294,8 @@ In order to run doxygen as a linter on C++/CUDA code, run
 ./ci/checks/doxygen.sh
 ```
 
-Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/),
-[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/).
+Python code is linted and formatted with [Ruff](https://docs.astral.sh/ruff/),
+whose rules cover Black-like formatting and isort-style import sorting.
 
 cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling
 mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes,
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index e5fcef17a83..3d06eacf9ff 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -15,8 +15,12 @@ rapids-print-env
 
 rapids-logger "Begin cpp build"
 
+sccache --zero-stats
+
 # With boa installed conda build forward to boa
 RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
     conda/recipes/libcudf
 
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 823d7f62290..ed90041cc77 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -19,6 +19,8 @@ rapids-logger "Begin py build"
 
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
+sccache --zero-stats
+
 # TODO: Remove `--no-test` flag once importing on a CPU
 # node works correctly
 # With boa installed conda build forwards to the boa builder
@@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/pylibcudf
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
@@ -46,6 +54,8 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf_kafka
 
+sccache --show-adv-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index bf76f4ed29a..78b8a8a08cf 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -3,7 +3,8 @@
 
 set -euo pipefail
 
-package_dir=$1
+package_name=$1
+package_dir=$2
 
 source rapids-configure-sccache
 source rapids-date-string
@@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+sccache --zero-stats
+
+rapids-logger "Building '${package_name}' wheel"
+python -m pip wheel \
+    -w dist \
+    -v \
+    --no-deps \
+    --disable-pip-version-check \
+    .
+
+sccache --show-adv-stats
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index fb93b06dbe2..fef4416a366 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -18,7 +18,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf
 echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh cudf ${package_dir}
 
 python -m auditwheel repair \
     --exclude libcudf.so \
diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
index 9c945e11c00..79853cdbdb2 100755
--- a/ci/build_wheel_cudf_polars.sh
+++ b/ci/build_wheel_cudf_polars.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/cudf_polars"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh cudf-polars ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
index eb2a91289f7..00c64afa2ef 100755
--- a/ci/build_wheel_dask_cudf.sh
+++ b/ci/build_wheel_dask_cudf.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/dask_cudf"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh dask-cudf ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
+RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index 91bc071583e..b3d6778ea04 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -3,10 +3,30 @@
 
 set -euo pipefail
 
+package_name="libcudf"
 package_dir="python/libcudf"
 
+rapids-logger "Generating build requirements"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
@@ -16,4 +36,4 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist"
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index 5e9f7f8a0c4..839d98846fe 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -16,7 +16,7 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh pylibcudf ${package_dir}
 
 python -m auditwheel repair \
     --exclude libcudf.so \
@@ -24,4 +24,4 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index f6bdc6f9484..61361fffb07 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -54,15 +54,8 @@ else
     RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
     RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
-    echo "" > ./constraints.txt
-    if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-        # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]`
-        rapids-dependency-file-generator \
-            --output requirements \
-            --file-key test_python_cudf_pandas \
-            --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-        | tee ./constraints.txt
-    fi
+    # generate constraints (possibly pinning to oldest supported versions of dependencies)
+    rapids-generate-pip-constraints test_python_cudf_pandas ./constraints.txt
 
     python -m pip install \
         -v \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index a701bfe15e0..ce12744c9e3 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -12,15 +12,8 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 
 rapids-logger "Install cudf, pylibcudf, and test requirements"
 
-# Constrain to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-    rapids-dependency-file-generator \
-        --output requirements \
-        --file-key py_test_cudf \
-        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-      | tee ./constraints.txt
-fi
+# generate constraints (possibly pinning to oldest supported versions of dependencies)
+rapids-generate-pip-constraints py_test_cudf ./constraints.txt
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 05f882a475b..2884757e46b 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -29,15 +29,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 rapids-logger "Installing cudf_polars and its dependencies"
-# Constraint to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-    rapids-dependency-file-generator \
-        --output requirements \
-        --file-key py_test_cudf_polars \
-        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-      | tee ./constraints.txt
-fi
+
+# generate constraints (possibly pinning to oldest supported versions of dependencies)
+rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt
 
 # echo to expand wildcard before adding `[test]` requires for pip
 python -m pip install \
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 361a42ccda9..e15949f4bdb 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -12,15 +12,9 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements"
-# Constraint to minimum dependency versions if job is set up as "oldest"
-echo "" > ./constraints.txt
-if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
-    rapids-dependency-file-generator \
-        --output requirements \
-        --file-key py_test_dask_cudf \
-        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
-      | tee ./constraints.txt
-fi
+
+# generate constraints (possibly pinning to oldest supported versions of dependencies)
+rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index bd5e6c3d569..9d9fec97731 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -46,6 +46,7 @@ dependencies:
 - librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.12.*,>=0.0.0a0
 - make
+- mmh3
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
@@ -58,14 +59,14 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - nvtx>=0.2.1
 - openpyxl
 - packaging
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.8,<1.9
+- polars>=1.11,<1.13
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<18.0.0a0
@@ -76,6 +77,7 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=2.5.0,<2.6.0a0
+- python-xxhash
 - python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 565a3ebfa3c..19e3eafd641 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -45,6 +45,7 @@ dependencies:
 - librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.12.*,>=0.0.0a0
 - make
+- mmh3
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
@@ -56,14 +57,14 @@ dependencies:
 - numba-cuda>=0.0.13
 - numpy>=1.23,<3.0a0
 - numpydoc
-- nvcomp==4.0.1
+- nvcomp==4.1.0.6
 - nvtx>=0.2.1
 - openpyxl
 - packaging
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.8,<1.9
+- polars>=1.11,<1.13
 - pre-commit
 - pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
@@ -74,6 +75,7 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=2.5.0,<2.6.0a0
+- python-xxhash
 - python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml
index e8fef715c60..edf92b930d9 100644
--- a/conda/recipes/cudf-polars/meta.yaml
+++ b/conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.8,<1.9
+    - polars >=1.11,<1.13
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index dc75eb4b252..c78ca326005 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -35,7 +35,7 @@ spdlog_version:
   - ">=1.14.1,<1.15"
 
 nvcomp_version:
-  - "=4.0.1"
+  - "=4.1.0.6"
 
 zlib_version:
   - ">=1.2.13"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 32a753c9f40..bfa4bf80724 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -368,8 +368,16 @@ add_library(
   src/filling/repeat.cu
   src/filling/sequence.cu
   src/groupby/groupby.cu
+  src/groupby/hash/compute_groupby.cu
+  src/groupby/hash/compute_mapping_indices.cu
+  src/groupby/hash/compute_mapping_indices_null.cu
+  src/groupby/hash/compute_shared_memory_aggs.cu
+  src/groupby/hash/compute_single_pass_aggs.cu
+  src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
   src/groupby/hash/groupby.cu
+  src/groupby/hash/hash_compound_agg_finalizer.cu
+  src/groupby/hash/sparse_to_dense_results.cu
   src/groupby/sort/aggregate.cpp
   src/groupby/sort/group_argmax.cu
   src/groupby/sort/group_argmin.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index e61a8e6e1e6..68781889c53 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -49,7 +49,7 @@ target_compile_options(
 
 target_link_libraries(
   ndsh_data_generator
-  PUBLIC cudf GTest::gmock GTest::gtest cudf::cudftestutil nvtx3::nvtx3-cpp
+  PUBLIC cudf cudf::cudftestutil nvtx3::nvtx3-cpp
   PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
@@ -345,18 +345,17 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
-ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
+ConfigureBench(TEXT_BENCH text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
 # * strings benchmark -------------------------------------------------------------------
 ConfigureBench(
   STRINGS_BENCH
-  string/combine.cpp
   string/convert_datetime.cpp
   string/convert_durations.cpp
   string/convert_fixed_point.cpp
@@ -374,6 +373,7 @@ ConfigureNVBench(
   STRINGS_NVBENCH
   string/case.cpp
   string/char_types.cpp
+  string/combine.cpp
   string/contains.cpp
   string/copy_if_else.cpp
   string/copy_range.cpp
diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index f44f26e4d2c..2533ea9611c 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -16,16 +16,29 @@
 
 #include <benchmarks/common/generate_input.hpp>
 
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
 #include <nvbench/nvbench.cuh>
+#include <nvbench/types.cuh>
 
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <iterator>
 #include <list>
 #include <memory>
 #include <optional>
@@ -39,14 +52,14 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
 static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table =
     create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
-                          row_count{table_size},
+                          row_count{num_rows},
                           Nullable ? std::optional<double>{0.5} : std::nullopt);
   auto table = source_table->view();
 
@@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state)
   auto const& expression_tree_root = expressions.back();
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
+}
+
+// Benchmarks compute_column on `tree_levels` string comparisons (cmp_op) combined with reduce_op
+template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
+static void BM_string_compare_ast_transform(nvbench::state& state)
+{
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
+
+  // Create table data: every comparison level reads its own pair of string columns
+  auto const num_cols = tree_levels * 2;
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  std::for_each(
+    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
+      columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
+    });
+
+  cudf::table table{std::move(columns)};
+  cudf::table_view const table_view = table.view();
+
+  int64_t const chars_size = std::accumulate(
+    table_view.begin(),
+    table_view.end(),
+    static_cast<int64_t>(0),
+    [](int64_t size, auto& column) -> int64_t {
+      return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
+    });
+
+  // Create column references
+  auto column_refs = std::vector<cudf::ast::column_reference>();
+  std::transform(thrust::make_counting_iterator(0),
+                 thrust::make_counting_iterator(num_cols),
+                 std::back_inserter(column_refs),
+                 [](auto const& column_id) { return cudf::ast::column_reference(column_id); });
+
+  // Create expression trees; std::list keeps references to earlier nodes valid on insertion
+  std::list<cudf::ast::operation> expressions;
+
+  // Construct AST tree (a == b && c == d && e == f && ...)
+  expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1]));
+
+  std::for_each(thrust::make_counting_iterator(1),
+                thrust::make_counting_iterator(tree_levels),
+                [&](size_t idx) {
+                  auto const& lhs = expressions.back();
+                  auto const& rhs = expressions.emplace_back(
+                    cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1]));
+                  expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs));
+                });
+
+  auto const& expression_tree_root = expressions.back();
+
+  // Bytes read: all string characters; bytes written: one 1-byte boolean result per row
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(num_rows);
+
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 }
   NVBENCH_BENCH(name)                                                                      \
     .set_name(#name)                                                                       \
     .add_int64_axis("tree_levels", {1, 5, 10})                                             \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
@@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);
+
+#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
+  static void name(::nvbench::state& st)                                       \
+  {                                                                            \
+    ::BM_string_compare_ast_transform<cmp_op, reduce_op>(st);                  \
+  }                                                                            \
+  NVBENCH_BENCH(name)                                                          \
+    .set_name(#name)                                                           \
+    .add_int64_axis("string_width", {32, 64, 128, 256})                        \
+    .add_int64_axis("num_rows", {32768, 262144, 2097152})                      \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                               \
+    .add_int64_axis("hit_rate", {50, 100})
+
+AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and,
+                                              cudf::ast::ast_operator::EQUAL,
+                                              cudf::ast::ast_operator::LOGICAL_AND);
diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
index 7d267a88764..75c91d270a7 100644
--- a/cpp/benchmarks/binaryop/binaryop.cpp
+++ b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -17,12 +17,18 @@
 #include <benchmarks/common/generate_input.hpp>
 
 #include <cudf/binaryop.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+
 #include <nvbench/nvbench.cuh>
 
 #include <algorithm>
+#include <cstddef>
+#include <memory>
 
 // This set of benchmarks is designed to be a comparison for the AST benchmarks
 
@@ -34,17 +40,18 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns>
 static void BM_binaryop_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+  auto const num_rows{static_cast<cudf::size_type>(state.get_int64("num_rows"))};
   auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
   // Create table data
   auto const n_cols       = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table = create_sequence_table(
-    cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
+    cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{num_rows});
   cudf::table_view table{*source_table};
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
@@ -64,15 +71,69 @@ static void BM_binaryop_transform(nvbench::state& state)
   });
 }
 
+// Benchmarks chained binary_operation calls: `tree_levels` string comparisons joined by reduce_op
+template <cudf::binary_operator cmp_op, cudf::binary_operator reduce_op>
+static void BM_string_compare_binaryop_transform(nvbench::state& state)
+{
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
+
+  // Create table data: every comparison level reads its own pair of string columns
+  auto const num_cols = tree_levels * 2;
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  std::for_each(
+    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
+      columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
+    });
+
+  cudf::table table{std::move(columns)};
+  cudf::table_view const table_view = table.view();
+
+  int64_t const chars_size = std::accumulate(
+    table_view.begin(), table_view.end(), static_cast<int64_t>(0), [](int64_t size, auto& column) {
+      return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
+    });
+
+  // Bytes read: all string characters; bytes written: one BOOL8 (1-byte) result per row.
+  // NOTE(review): intermediate comparison/reduction columns are not counted -- confirm intent
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(num_rows);
+
+  // Construct binary operations (a == b && c == d && e == f && ...)
+  auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8};
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream{launch.get_stream().get_stream()};
+    std::unique_ptr<cudf::column> reduction =
+      cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream);
+    std::for_each(
+      thrust::make_counting_iterator(1),
+      thrust::make_counting_iterator(tree_levels),
+      [&](size_t idx) {
+        std::unique_ptr<cudf::column> comparison = cudf::binary_operation(
+          table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream);
+        std::unique_ptr<cudf::column> reduced =
+          cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream);
+        stream.synchronize();
+        reduction = std::move(reduced);
+      });
+  });
+}
+
 #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
                                                                                       \
   static void name(::nvbench::state& st)                                              \
   {                                                                                   \
-    BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                    \
+    ::BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                  \
   }                                                                                   \
   NVBENCH_BENCH(name)                                                                 \
     .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
                                     int32_t,
@@ -86,3 +147,20 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
                                     double,
                                     TreeType::IMBALANCED_LEFT,
                                     false);
+
+#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
+                                                                                    \
+  static void name(::nvbench::state& st)                                            \
+  {                                                                                 \
+    ::BM_string_compare_binaryop_transform<cmp_op, reduce_op>(st);                  \
+  }                                                                                 \
+  NVBENCH_BENCH(name)                                                               \
+    .set_name(#name)                                                                \
+    .add_int64_axis("string_width", {32, 64, 128, 256})                             \
+    .add_int64_axis("num_rows", {32768, 262144, 2097152})                           \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                                    \
+    .add_int64_axis("hit_rate", {50, 100})
+
+STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform,
+                                                   cudf::binary_operator::EQUAL,
+                                                   cudf::binary_operator::LOGICAL_AND);
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index bc0ff69bce9..426f44a4fa1 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -23,10 +23,10 @@
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
 void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
-  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
 
   auto const source_table = create_random_table(
-    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
+    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{num_rows});
 
   auto lhs = cudf::column_view(source_table->get_column(0));
   auto rhs = cudf::column_view(source_table->get_column(1));
@@ -37,9 +37,9 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
   // use number of bytes read and written to global memory
-  state.add_global_memory_reads<TypeLhs>(table_size);
-  state.add_global_memory_reads<TypeRhs>(table_size);
-  state.add_global_memory_reads<TypeOut>(table_size);
+  state.add_global_memory_reads<TypeLhs>(num_rows);
+  state.add_global_memory_reads<TypeRhs>(num_rows);
+  state.add_global_memory_writes<TypeOut>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
@@ -55,7 +55,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   }                                                                           \
   NVBENCH_BENCH(name)                                                         \
     .set_name("compiled_binary_op_" BM_STRINGIFY(name))                       \
-    .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d
 
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index dc258e32dc5..bdce8a31176 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -17,13 +17,17 @@
 #include "generate_input.hpp"
 #include "random_distribution_factory.cuh"
 
+#include <cudf_test/column_wrapper.hpp>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/filling.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/combine.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -918,6 +922,58 @@ std::unique_ptr<cudf::table> create_sequence_table(std::vector<cudf::type_id> co
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
+std::unique_ptr<cudf::column> create_string_column(cudf::size_type num_rows,
+                                                   cudf::size_type row_width,
+                                                   int32_t hit_rate)
+{
+  // Base strings for the output; row 0 is the string written to `hit_rate` percent of the rows
+  auto raw_data = cudf::test::strings_column_wrapper(
+                    {
+                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
+                      "012345 6789 01234 56789 0123 456",  // the rest do not match
+                      "abc 4567890 DEFGHI 0987 Wxyz 123",
+                      "abcdefghijklmnopqrstuvwxyz 01234",
+                      "",
+                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
+                      "9876543210,abcdefghijklmnopqrstU",
+                      "9876543210,abcdefghijklmnopqrstU",
+                      "123 édf 4567890 DéFG 0987 X5",
+                      "1",
+                    })
+                    .release();
+
+  // Widen each row by concatenating `row_width / 32` copies of the base strings column
+  if (auto const num_copies = row_width / 32; num_copies > 1) {
+    std::vector<cudf::column_view> const columns(num_copies, raw_data->view());
+    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
+  }
+  auto data_view = raw_data->view();
+
+  // compute number of rows in num_rows that should match
+  auto const num_matches = (static_cast<int64_t>(num_rows) * hit_rate) / 100;
+
+  // Create a randomized gather-map to build a column out of the strings in data.
+  data_profile gather_profile =
+    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
+      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
+  auto gather_table =
+    create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile);
+  // Remove any null mask so that the gather map has no null entries
+  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
+
+  // Create scatter map by placing 0-index values throughout the gather-map.
+  // With no matches (hit_rate == 0) the map is empty; use a stride of 1 to avoid dividing by 0.
+  auto const stride = num_matches > 0 ? num_rows / num_matches : 1;
+  auto scatter_data = cudf::sequence(
+    num_matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(stride));
+  auto zero_scalar  = cudf::numeric_scalar<int32_t>(0);
+  auto table        = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
+  auto gather_map   = table->view().column(0);
+  table             = cudf::gather(cudf::table_view({data_view}), gather_map);
+
+  return std::move(table->release().front());
+}
+
 std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
   cudf::size_type size, std::optional<double> null_probability, unsigned seed)
 {
diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
index 68d3dc492f5..57834fd11d2 100644
--- a/cpp/benchmarks/common/generate_input.hpp
+++ b/cpp/benchmarks/common/generate_input.hpp
@@ -670,6 +670,18 @@ std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
                                                    data_profile const& data_params = data_profile{},
                                                    unsigned seed                   = 1);
 
+/**
+ * @brief Deterministically generates a strings column by sampling a fixed set of base strings.
+ *
+ * @param num_rows Number of rows in the output column
+ * @param row_width Target string width; base strings are concatenated `row_width / 32` times
+ * @param hit_rate Percentage (0 to 100) of rows set to the first base string
+ * @return The generated strings column
+ */
+std::unique_ptr<cudf::column> create_string_column(cudf::size_type num_rows,
+                                                   cudf::size_type row_width,
+                                                   int32_t hit_rate);
+
 /**
  * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in
  * subsequent rows.
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index fe24fb58728..45b46005c47 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd)
   std::fflush(nullptr);
   // Switch stderr and stdout to only capture stderr
   auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
-  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
+  std::unique_ptr<FILE, int (*)(FILE*)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
   CUDF_EXPECTS(pipe != nullptr, "popen() failed");
 
   std::array<char, 128> buffer;
diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp
index ef709926ae9..485e8e5497c 100644
--- a/cpp/benchmarks/ndsh/q01.cpp
+++ b/cpp/benchmarks/ndsh/q01.cpp
@@ -104,7 +104,7 @@
 }
 
 void run_ndsh_q1(nvbench::state& state,
-                 std::unordered_map<std::string, parquet_device_buffer>& sources)
+                 std::unordered_map<std::string, cuio_source_sink_pair>& sources)
 {
   // Define the column projections and filter predicate for `lineitem` table
   std::vector<std::string> const lineitem_cols = {"l_returnflag",
@@ -124,8 +124,8 @@ void run_ndsh_q1(nvbench::state& state,
     cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal);
 
   // Read out the `lineitem` table from parquet file
-  auto lineitem =
-    read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred));
+  auto lineitem = read_parquet(
+    sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred));
 
   // Calculate the discount price and charge columns and append to lineitem table
   auto disc_price =
@@ -170,7 +170,7 @@ void ndsh_q1(nvbench::state& state)
 {
   // Generate the required parquet files in device buffers
   double const scale_factor = state.get_float64("scale_factor");
-  std::unordered_map<std::string, parquet_device_buffer> sources;
+  std::unordered_map<std::string, cuio_source_sink_pair> sources;
   generate_parquet_data_sources(scale_factor, {"lineitem"}, sources);
 
   auto stream = cudf::get_default_stream();
diff --git a/cpp/benchmarks/ndsh/q05.cpp b/cpp/benchmarks/ndsh/q05.cpp
index 522bc4789c2..1c2d657913e 100644
--- a/cpp/benchmarks/ndsh/q05.cpp
+++ b/cpp/benchmarks/ndsh/q05.cpp
@@ -89,7 +89,7 @@
 }
 
 void run_ndsh_q5(nvbench::state& state,
-                 std::unordered_map<std::string, parquet_device_buffer>& sources)
+                 std::unordered_map<std::string, cuio_source_sink_pair>& sources)
 {
   // Define the column projection and filter predicate for the `orders` table
   std::vector<std::string> const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"};
@@ -120,17 +120,17 @@ void run_ndsh_q5(nvbench::state& state,
   // Read out the tables from parquet files
   // while pushing down the column projections and filter predicates
   auto const customer =
-    read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"});
+    read_parquet(sources.at("customer").make_source_info(), {"c_custkey", "c_nationkey"});
   auto const orders =
-    read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred));
-  auto const lineitem = read_parquet(sources["lineitem"].make_source_info(),
+    read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred));
+  auto const lineitem = read_parquet(sources.at("lineitem").make_source_info(),
                                      {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"});
   auto const supplier =
-    read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"});
+    read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"});
   auto const nation =
-    read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"});
+    read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_regionkey", "n_name"});
   auto const region =
-    read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred));
+    read_parquet(sources.at("region").make_source_info(), region_cols, std::move(region_pred));
 
   // Perform the joins
   auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"});
@@ -165,7 +165,7 @@ void ndsh_q5(nvbench::state& state)
 {
   // Generate the required parquet files in device buffers
   double const scale_factor = state.get_float64("scale_factor");
-  std::unordered_map<std::string, parquet_device_buffer> sources;
+  std::unordered_map<std::string, cuio_source_sink_pair> sources;
   generate_parquet_data_sources(
     scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources);
 
diff --git a/cpp/benchmarks/ndsh/q06.cpp b/cpp/benchmarks/ndsh/q06.cpp
index 04078547973..e1e56c3622e 100644
--- a/cpp/benchmarks/ndsh/q06.cpp
+++ b/cpp/benchmarks/ndsh/q06.cpp
@@ -64,7 +64,7 @@
 }
 
 void run_ndsh_q6(nvbench::state& state,
-                 std::unordered_map<std::string, parquet_device_buffer>& sources)
+                 std::unordered_map<std::string, cuio_source_sink_pair>& sources)
 {
   // Read out the `lineitem` table from parquet file
   std::vector<std::string> const lineitem_cols = {
@@ -83,8 +83,8 @@ void run_ndsh_q6(nvbench::state& state,
     cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal);
   auto const lineitem_pred = std::make_unique<cudf::ast::operation>(
     cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b);
-  auto lineitem =
-    read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred));
+  auto lineitem = read_parquet(
+    sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred));
 
   // Cast the discount and quantity columns to float32 and append to lineitem table
   auto discout_float =
@@ -134,7 +134,7 @@ void ndsh_q6(nvbench::state& state)
 {
   // Generate the required parquet files in device buffers
   double const scale_factor = state.get_float64("scale_factor");
-  std::unordered_map<std::string, parquet_device_buffer> sources;
+  std::unordered_map<std::string, cuio_source_sink_pair> sources;
   generate_parquet_data_sources(scale_factor, {"lineitem"}, sources);
 
   auto stream = cudf::get_default_stream();
diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp
index 59218ab8912..2e9a69d9ee2 100644
--- a/cpp/benchmarks/ndsh/q09.cpp
+++ b/cpp/benchmarks/ndsh/q09.cpp
@@ -112,20 +112,21 @@
 }
 
 void run_ndsh_q9(nvbench::state& state,
-                 std::unordered_map<std::string, parquet_device_buffer>& sources)
+                 std::unordered_map<std::string, cuio_source_sink_pair>& sources)
 {
   // Read out the table from parquet files
   auto const lineitem = read_parquet(
-    sources["lineitem"].make_source_info(),
+    sources.at("lineitem").make_source_info(),
     {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"});
-  auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"});
+  auto const nation =
+    read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_name"});
   auto const orders =
-    read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"});
-  auto const part     = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"});
-  auto const partsupp = read_parquet(sources["partsupp"].make_source_info(),
+    read_parquet(sources.at("orders").make_source_info(), {"o_orderkey", "o_orderdate"});
+  auto const part = read_parquet(sources.at("part").make_source_info(), {"p_partkey", "p_name"});
+  auto const partsupp = read_parquet(sources.at("partsupp").make_source_info(),
                                      {"ps_suppkey", "ps_partkey", "ps_supplycost"});
   auto const supplier =
-    read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"});
+    read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"});
 
   // Generating the `profit` table
   // Filter the part table using `p_name like '%green%'`
@@ -178,7 +179,7 @@ void ndsh_q9(nvbench::state& state)
 {
   // Generate the required parquet files in device buffers
   double const scale_factor = state.get_float64("scale_factor");
-  std::unordered_map<std::string, parquet_device_buffer> sources;
+  std::unordered_map<std::string, cuio_source_sink_pair> sources;
   generate_parquet_data_sources(
     scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources);
 
diff --git a/cpp/benchmarks/ndsh/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp
index a520480020a..72edd15083d 100644
--- a/cpp/benchmarks/ndsh/q10.cpp
+++ b/cpp/benchmarks/ndsh/q10.cpp
@@ -94,7 +94,7 @@
 }
 
 void run_ndsh_q10(nvbench::state& state,
-                  std::unordered_map<std::string, parquet_device_buffer>& sources)
+                  std::unordered_map<std::string, cuio_source_sink_pair>& sources)
 {
   // Define the column projection and filter predicate for the `orders` table
   std::vector<std::string> const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"};
@@ -122,15 +122,16 @@ void run_ndsh_q10(nvbench::state& state,
   // Read out the tables from parquet files
   // while pushing down the column projections and filter predicates
   auto const customer = read_parquet(
-    sources["customer"].make_source_info(),
+    sources.at("customer").make_source_info(),
     {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"});
   auto const orders =
-    read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred));
+    read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred));
   auto const lineitem =
-    read_parquet(sources["lineitem"].make_source_info(),
+    read_parquet(sources.at("lineitem").make_source_info(),
                  {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"},
                  std::move(lineitem_pred));
-  auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"});
+  auto const nation =
+    read_parquet(sources.at("nation").make_source_info(), {"n_name", "n_nationkey"});
 
   // Perform the joins
   auto const join_a       = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"});
@@ -163,7 +164,7 @@ void ndsh_q10(nvbench::state& state)
 {
   // Generate the required parquet files in device buffers
   double const scale_factor = state.get_float64("scale_factor");
-  std::unordered_map<std::string, parquet_device_buffer> sources;
+  std::unordered_map<std::string, cuio_source_sink_pair> sources;
   generate_parquet_data_sources(
     scale_factor, {"customer", "orders", "lineitem", "nation"}, sources);
 
diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp
index 62116ddf661..9f9849860c9 100644
--- a/cpp/benchmarks/ndsh/utilities.cpp
+++ b/cpp/benchmarks/ndsh/utilities.cpp
@@ -17,6 +17,8 @@
 #include "utilities.hpp"
 
 #include "common/ndsh_data_generator/ndsh_data_generator.hpp"
+#include "common/table_utilities.hpp"
+#include "cudf/detail/utilities/integer_utils.hpp"
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
@@ -30,8 +32,15 @@
 #include <cudf/transform.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <algorithm>
 #include <cstdlib>
 #include <ctime>
+#include <iterator>
+#include <unordered_set>
 
 namespace {
 
@@ -85,6 +94,15 @@ std::vector<std::string> const NATION_SCHEMA   = {
   "n_nationkey", "n_name", "n_regionkey", "n_comment"};
 std::vector<std::string> const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"};
 
+std::unordered_map<std::string, std::vector<std::string> const> const SCHEMAS = {
+  {"orders", ORDERS_SCHEMA},
+  {"lineitem", LINEITEM_SCHEMA},
+  {"part", PART_SCHEMA},
+  {"partsupp", PARTSUPP_SCHEMA},
+  {"supplier", SUPPLIER_SCHEMA},
+  {"customer", CUSTOMER_SCHEMA},
+  {"nation", NATION_SCHEMA},
+  {"region", REGION_SCHEMA}};
 }  // namespace
 
 cudf::table_view table_with_names::table() const { return tbl->view(); }
@@ -337,7 +355,7 @@ int32_t days_since_epoch(int year, int month, int day)
 
 void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
                                     std::vector<std::string> const& col_names,
-                                    parquet_device_buffer& source)
+                                    cuio_source_sink_pair& source)
 {
   CUDF_FUNC_RANGE();
   auto const stream = cudf::get_default_stream();
@@ -351,55 +369,124 @@ void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
   metadata.schema_info            = col_name_infos;
   auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
 
-  // Declare a host and device buffer
-  std::vector<char> h_buffer;
-
+  auto est_size                     = static_cast<std::size_t>(estimate_size(table->view()));
+  constexpr auto PQ_MAX_TABLE_BYTES = 8ul << 30;  // 8GB
+  // TODO: best to get this limit from percent_of_free_device_memory(50) of device memory resource.
+  if (est_size > PQ_MAX_TABLE_BYTES) {
+    auto builder = cudf::io::chunked_parquet_writer_options::builder(source.make_sink_info());
+    builder.metadata(table_input_metadata);
+    auto const options = builder.build();
+    auto num_splits    = static_cast<cudf::size_type>(
+      std::ceil(static_cast<long double>(est_size) / (PQ_MAX_TABLE_BYTES)));
+    std::vector<cudf::size_type> splits(num_splits - 1);
+    auto num_rows          = table->num_rows();
+    auto num_row_per_chunk = cudf::util::div_rounding_up_safe(num_rows, num_splits);
+    std::generate_n(splits.begin(), splits.size(), [num_row_per_chunk, i = 0]() mutable {
+      return (i += num_row_per_chunk);
+    });
+    std::vector<cudf::table_view> split_tables = cudf::split(table->view(), splits, stream);
+    auto writer                                = cudf::io::parquet_chunked_writer(options, stream);
+    for (auto const& chunk_table : split_tables) {
+      writer.write(chunk_table);
+    }
+    writer.close();
+    return;
+  }
   // Write parquet data to host buffer
-  auto builder =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view());
+  auto builder = cudf::io::parquet_writer_options::builder(source.make_sink_info(), table->view());
   builder.metadata(table_input_metadata);
   auto const options = builder.build();
-  cudf::io::write_parquet(options);
+  cudf::io::write_parquet(options, stream);
+}
 
-  // Copy host buffer to device buffer
-  source.d_buffer.resize(h_buffer.size(), stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value()));
+inline auto make_managed_pool()  // pool over managed memory, sized from 50% of free device memory
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    std::make_shared<rmm::mr::managed_memory_resource>(), rmm::percent_of_free_device_memory(50));
 }
 
 void generate_parquet_data_sources(double scale_factor,
                                    std::vector<std::string> const& table_names,
-                                   std::unordered_map<std::string, parquet_device_buffer>& sources)
+                                   std::unordered_map<std::string, cuio_source_sink_pair>& sources)
 {
   CUDF_FUNC_RANGE();
-  std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) {
-    sources[table_name] = parquet_device_buffer();
-  });
 
-  auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part(
-    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  // Set the memory resource to the managed pool
+  auto old_mr = cudf::get_current_device_resource();
+  // if already managed pool or managed, don't create new one.
+  using managed_pool_mr_t = decltype(make_managed_pool());
+  managed_pool_mr_t managed_pool_mr;
+  bool const is_managed =
+    dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(old_mr) or
+    dynamic_cast<rmm::mr::managed_memory_resource*>(old_mr);
+  if (!is_managed) {
+    std::cout << "Creating managed pool just for data generation\n";
+    managed_pool_mr = make_managed_pool();
+    cudf::set_current_device_resource(managed_pool_mr.get());
+    // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free
+    // memory.
+  }
 
-  auto partsupp = cudf::datagen::generate_partsupp(
-    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  std::unordered_set<std::string> const requested_table_names = [&table_names]() {
+    if (table_names.empty()) {
+      return std::unordered_set<std::string>{
+        "orders", "lineitem", "part", "partsupp", "supplier", "customer", "nation", "region"};
+    }
+    return std::unordered_set(table_names.begin(), table_names.end());
+  }();
+  // Create a host-buffer source/sink pair per requested table; existing entries are left as is
+  for (auto const& table_name : requested_table_names) {
+    sources.emplace(table_name, cuio_source_sink_pair(io_type::HOST_BUFFER));
+  }
+
+  // orders, lineitem and part are generated together; each is freed once it has been written
+  if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) {
+    auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part(
+      scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+    if (sources.count("orders")) {
+      write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders"));
+      orders = {};
+    }
+    if (sources.count("part")) {
+      write_to_parquet_device_buffer(part, SCHEMAS.at("part"), sources.at("part"));
+      part = {};
+    }
+    if (sources.count("lineitem")) {
+      write_to_parquet_device_buffer(lineitem, SCHEMAS.at("lineitem"), sources.at("lineitem"));
+      lineitem = {};
+    }
+  }
+
+  if (sources.count("partsupp")) {
+    auto partsupp = cudf::datagen::generate_partsupp(
+      scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+    write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp"));
+  }
 
-  auto supplier = cudf::datagen::generate_supplier(
-    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  if (sources.count("supplier")) {
+    auto supplier = cudf::datagen::generate_supplier(
+      scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+    write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier"));
+  }
 
-  auto customer = cudf::datagen::generate_customer(
-    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  if (sources.count("customer")) {
+    auto customer = cudf::datagen::generate_customer(
+      scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+    write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer"));
+  }
 
-  auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(),
-                                               cudf::get_current_device_resource_ref());
+  if (sources.count("nation")) {
+    auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(),
+                                                 cudf::get_current_device_resource_ref());
+    write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation"));
+  }
 
-  auto region = cudf::datagen::generate_region(cudf::get_default_stream(),
-                                               cudf::get_current_device_resource_ref());
+  if (sources.count("region")) {
+    auto region = cudf::datagen::generate_region(cudf::get_default_stream(),
+                                                 cudf::get_current_device_resource_ref());
+    write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region"));
+  }
 
-  write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]);
-  write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]);
-  write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]);
-  write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]);
-  write_to_parquet_device_buffer(std::move(customer), CUSTOMER_SCHEMA, sources["customer"]);
-  write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]);
-  write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]);
-  write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]);
+  // Restore the original memory resource
+  if (!is_managed) { cudf::set_current_device_resource(old_mr); }
 }
diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp
index 762e43deccf..cae07f86a98 100644
--- a/cpp/benchmarks/ndsh/utilities.hpp
+++ b/cpp/benchmarks/ndsh/utilities.hpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "io/cuio_common.hpp"
+
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/io/parquet.hpp>
@@ -196,24 +198,15 @@ std::tm make_tm(int year, int month, int day);
 int32_t days_since_epoch(int year, int month, int day);
 
 /**
- * @brief Struct representing a parquet device buffer
- */
-struct parquet_device_buffer {
-  parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {};
-  cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); }
-  rmm::device_uvector<std::byte> d_buffer;
-};
-
-/**
- * @brief Write a `cudf::table` to a parquet device buffer
+ * @brief Write a `cudf::table` to a parquet cuio sink
  *
  * @param table The `cudf::table` to write
  * @param col_names The column names of the table
- * @param parquet_device_buffer The parquet device buffer to write the table to
+ * @param source The source sink pair to write the table to
  */
 void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
                                     std::vector<std::string> const& col_names,
-                                    parquet_device_buffer& source);
+                                    cuio_source_sink_pair& source);
 
 /**
  * @brief Generate NDS-H tables and write to parquet device buffers
@@ -224,4 +217,4 @@ void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
  */
 void generate_parquet_data_sources(double scale_factor,
                                    std::vector<std::string> const& table_names,
-                                   std::unordered_map<std::string, parquet_device_buffer>& sources);
+                                   std::unordered_map<std::string, cuio_source_sink_pair>& sources);
diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp
index 7acfb1ffb0d..d6ccfae63e8 100644
--- a/cpp/benchmarks/string/combine.cpp
+++ b/cpp/benchmarks/string/combine.cpp
@@ -14,57 +14,41 @@
  * limitations under the License.
  */
 
-#include "string_bench_args.hpp"
-
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-class StringCombine : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-static void BM_combine(benchmark::State& state)
+static void bench_combine(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const table = create_random_table(
-    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile);
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, profile);
   cudf::strings_column_view input1(table->view().column(0));
   cudf::strings_column_view input2(table->view().column(1));
   cudf::string_scalar separator("+");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::concatenate(table->view(), separator);
-  }
-
-  state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) +
-                                                input2.chars_size(cudf::get_default_stream())));
-}
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size =
+    input1.chars_size(stream) + input2.chars_size(stream) + (num_rows * separator.size());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // inputs + separators are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 4;
-  int const max_rowlen = 1 << 11;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = cudf::strings::concatenate(table->view(), separator);
+  });
 }
 
-#define STRINGS_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringCombine, name)       \
-  (::benchmark::State & st) { BM_combine(st); } \
-  BENCHMARK_REGISTER_F(StringCombine, name)     \
-    ->Apply(generate_bench_args)                \
-    ->UseManualTime()                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(concat)
+NVBENCH_BENCH(bench_combine)
+  .set_name("concat")
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
index ae6c8b844c8..a73017dda18 100644
--- a/cpp/benchmarks/string/contains.cpp
+++ b/cpp/benchmarks/string/contains.cpp
@@ -17,10 +17,6 @@
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/filling.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -28,57 +24,6 @@
 
 #include <nvbench/nvbench.cuh>
 
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate)
-{
-  // build input table using the following data
-  auto raw_data = cudf::test::strings_column_wrapper(
-                    {
-                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
-                      "012345 6789 01234 56789 0123 456",  // the rest do not match
-                      "abc 4567890 DEFGHI 0987 Wxyz 123",
-                      "abcdefghijklmnopqrstuvwxyz 01234",
-                      "",
-                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "123 édf 4567890 DéFG 0987 X5",
-                      "1",
-                    })
-                    .release();
-
-  if (row_width / 32 > 1) {
-    std::vector<cudf::column_view> columns;
-    for (int i = 0; i < row_width / 32; ++i) {
-      columns.push_back(raw_data->view());
-    }
-    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
-  }
-  auto data_view = raw_data->view();
-
-  // compute number of rows in n_rows that should match
-  auto matches = static_cast<int32_t>(n_rows * hit_rate) / 100;
-
-  // Create a randomized gather-map to build a column out of the strings in data.
-  data_profile gather_profile =
-    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
-      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
-  auto gather_table =
-    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
-  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
-
-  // Create scatter map by placing 0-index values throughout the gather-map
-  auto scatter_data = cudf::sequence(
-    matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
-  auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
-  auto table       = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
-  auto gather_map  = table->view().column(0);
-  table            = cudf::gather(cudf::table_view({data_view}), gather_map);
-
-  return std::move(table->release().front());
-}
-
 // longer pattern lengths demand more working memory per string
 std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};
 
@@ -94,7 +39,7 @@ static void bench_contains(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  auto col   = build_input_column(n_rows, row_width, hit_rate);
+  auto col   = create_string_column(n_rows, row_width, hit_rate);
   auto input = cudf::strings_column_view(col->view());
 
   auto pattern = patterns[pattern_index];
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index a9c620e4bf0..996bdcf0332 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -19,7 +19,6 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/find.hpp>
@@ -29,10 +28,6 @@
 
 #include <nvbench/nvbench.cuh>
 
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate);
-
 static void bench_find_string(nvbench::state& state)
 {
   auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -46,7 +41,7 @@ static void bench_find_string(nvbench::state& state)
   }
 
   auto const stream = cudf::get_default_stream();
-  auto const col    = build_input_column(n_rows, row_width, hit_rate);
+  auto const col    = create_string_column(n_rows, row_width, hit_rate);
   auto const input  = cudf::strings_column_view(col->view());
 
   std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp
index 99cef640dc3..105ae65cbe8 100644
--- a/cpp/benchmarks/string/like.cpp
+++ b/cpp/benchmarks/string/like.cpp
@@ -18,68 +18,12 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/copying.hpp>
-#include <cudf/filling.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <nvbench/nvbench.cuh>
 
-namespace {
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate)
-{
-  // build input table using the following data
-  auto raw_data = cudf::test::strings_column_wrapper(
-                    {
-                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches always;
-                      "012345 6789 01234 56789 0123 456",  // the rest do not match
-                      "abc 4567890 DEFGHI 0987 Wxyz 123",
-                      "abcdefghijklmnopqrstuvwxyz 01234",
-                      "",
-                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "123 édf 4567890 DéFG 0987 X5",
-                      "1",
-                    })
-                    .release();
-  if (row_width / 32 > 1) {
-    std::vector<cudf::column_view> columns;
-    for (int i = 0; i < row_width / 32; ++i) {
-      columns.push_back(raw_data->view());
-    }
-    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
-  }
-  auto data_view = raw_data->view();
-
-  // compute number of rows in n_rows that should match
-  auto matches = static_cast<int32_t>(n_rows * hit_rate) / 100;
-
-  // Create a randomized gather-map to build a column out of the strings in data.
-  data_profile gather_profile =
-    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
-      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
-  auto gather_table =
-    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
-  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
-
-  // Create scatter map by placing 0-index values throughout the gather-map
-  auto scatter_data = cudf::sequence(
-    matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
-  auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
-  auto table       = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
-  auto gather_map  = table->view().column(0);
-  table            = cudf::gather(cudf::table_view({data_view}), gather_map);
-
-  return std::move(table->release().front());
-}
-
-}  // namespace
-
 static void bench_like(nvbench::state& state)
 {
   auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -91,7 +35,7 @@ static void bench_like(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  auto col   = build_input_column(n_rows, row_width, hit_rate);
+  auto col   = create_string_column(n_rows, row_width, hit_rate);
   auto input = cudf::strings_column_view(col->view());
 
   // This pattern forces reading the entire target string (when matched expected)
diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp
index 8e48f8e9a05..43d57201b20 100644
--- a/cpp/benchmarks/text/ngrams.cpp
+++ b/cpp/benchmarks/text/ngrams.cpp
@@ -15,58 +15,45 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/string/string_bench_args.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/generate_ngrams.hpp>
 
-class TextNGrams : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-enum class ngrams_type { tokens, characters };
-
-static void BM_ngrams(benchmark::State& state, ngrams_type nt)
+static void bench_ngrams(nvbench::state& state)
 {
-  auto const n_rows          = static_cast<cudf::size_type>(state.range(0));
-  auto const max_str_length  = static_cast<cudf::size_type>(state.range(1));
+  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const ngram_type = state.get_string("type");
+
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
   auto const separator = cudf::string_scalar("_");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    switch (nt) {
-      case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
-      case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
-    }
-  }
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size * 2);
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 5;
-  int const max_rowlen = 40;
-  int const len_mult   = 2;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  if (ngram_type == "chars") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::generate_character_ngrams(input);
+    });
+  } else {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::generate_ngrams(input, 2, separator);
+    });
+  }
 }
 
-#define NVTEXT_BENCHMARK_DEFINE(name)                             \
-  BENCHMARK_DEFINE_F(TextNGrams, name)                            \
-  (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \
-  BENCHMARK_REGISTER_F(TextNGrams, name)                          \
-    ->Apply(generate_bench_args)                                  \
-    ->UseManualTime()                                             \
-    ->Unit(benchmark::kMillisecond);
-
-NVTEXT_BENCHMARK_DEFINE(tokens)
-NVTEXT_BENCHMARK_DEFINE(characters)
+NVBENCH_BENCH(bench_ngrams)
+  .set_name("ngrams")
+  .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576})  // 2^17 .. 2^20
+  .add_int64_axis("row_width", {10, 20, 40, 100})
+  .add_string_axis("type", {"chars", "tokens"});
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index fce8adb4c06..1c1052487f2 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -370,7 +370,7 @@ any type that cudf supports. For example, a `list_scalar` representing a list of
 |Value type|Scalar class|Notes|
 |-|-|-|
 |fixed-width|`fixed_width_scalar<T>`| `T` can be any fixed-width type|
-|numeric|`numeric_scalar<T>` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`|
+|numeric|`numeric_scalar<T>` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`|
 |fixed-point|`fixed_point_scalar<T>` | `T` can be `numeric::decimal32` or `numeric::decimal64`|
 |timestamp|`timestamp_scalar<T>` | `T` can be `timestamp_D`, `timestamp_s`, etc.|
 |duration|`duration_scalar<T>` | `T` can be `duration_D`, `duration_s`, etc.|
@@ -1483,6 +1483,17 @@ struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` o
 
 `cudf::type_dispatcher` dispatches to the `struct_view` data type when invoked on a `STRUCT` column.
 
+# Empty Columns
+
+libcudf supports empty, typed columns. These columns have no data and no validity mask.
+Empty strings or lists columns may or may not contain a child offsets column.
+It is undefined behavior (UB) to access the offsets child of an empty strings or lists column.
+Nested columns like lists and structs may require other child columns to describe the
+nested structure of the empty type.
+
+Use `cudf::make_empty_column()` to create empty fixed-width and strings columns.
+Use `cudf::empty_like()` to create an empty column from an existing `cudf::column_view`.
+
 # cuIO: file reading and writing
 
 cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index a254171ef11..f4cce8e6da6 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -17,12 +17,8 @@
 
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
-
-#include <thrust/scan.h>
 
 #include <functional>
 #include <numeric>
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index 6bbe32de134..e72661ce49a 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -24,8 +24,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <thrust/pair.h>
-
 namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_factories
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 48f89b8be25..6db5c8b3c7b 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -16,7 +16,6 @@
 #pragma once
 
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/prefetch.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 10be5e1d36f..204eee49a2a 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
 #include <cudf/aggregation.hpp>
@@ -29,12 +28,31 @@
 #include <cuda/std/type_traits>
 
 namespace cudf::detail {
+/// Checks if an aggregation kind needs to operate on the underlying storage type
+template <aggregation::Kind k>
+__device__ constexpr bool uses_underlying_type()
+{
+  return k == aggregation::MIN or k == aggregation::MAX or k == aggregation::SUM;
+}
+
+/// Gets the underlying target type for the given source type and aggregation kind
+template <typename Source, aggregation::Kind k>
+using underlying_target_t =
+  cuda::std::conditional_t<uses_underlying_type<k>(),
+                           cudf::device_storage_type_t<cudf::detail::target_type_t<Source, k>>,
+                           cudf::detail::target_type_t<Source, k>>;
+
+/// Gets the underlying source type for the given source type and aggregation kind
+template <typename Source, aggregation::Kind k>
+using underlying_source_t =
+  cuda::std::conditional_t<uses_underlying_type<k>(), cudf::device_storage_type_t<Source>, Source>;
+
 template <typename Source, aggregation::Kind k, typename Enable = void>
 struct update_target_element {
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
+  __device__ void operator()(mutable_column_device_view,
+                             size_type,
+                             column_device_view,
+                             size_type) const noexcept
   {
     CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
   }
@@ -51,8 +69,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::MIN>;
     cudf::detail::atomic_min(&target.element<Target>(target_index),
                              static_cast<Target>(source.element<Source>(source_index)));
@@ -72,8 +88,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target       = target_type_t<Source, aggregation::MIN>;
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
@@ -96,8 +110,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::MAX>;
     cudf::detail::atomic_max(&target.element<Target>(target_index),
                              static_cast<Target>(source.element<Source>(source_index)));
@@ -117,8 +129,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target       = target_type_t<Source, aggregation::MAX>;
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
@@ -141,8 +151,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::SUM>;
     cudf::detail::atomic_add(&target.element<Target>(target_index),
                              static_cast<Target>(source.element<Source>(source_index)));
@@ -162,8 +170,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target       = target_type_t<Source, aggregation::SUM>;
     using DeviceTarget = device_storage_type_t<Target>;
     using DeviceSource = device_storage_type_t<Source>;
@@ -197,10 +203,10 @@ struct update_target_from_dictionary {
   template <typename Source,
             aggregation::Kind k,
             cuda::std::enable_if_t<is_dictionary<Source>()>* = nullptr>
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
+  __device__ void operator()(mutable_column_device_view,
+                             size_type,
+                             column_device_view,
+                             size_type) const noexcept
   {
   }
 };
@@ -227,8 +233,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     dispatch_type_and_aggregation(
       source.child(cudf::dictionary_column_view::keys_column_index).type(),
       k,
@@ -249,8 +253,6 @@ struct update_target_element<Source,
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
     auto value   = static_cast<Target>(source.element<Source>(source_index));
     cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
@@ -267,8 +269,6 @@ struct update_target_element<Source,
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::PRODUCT>;
     cudf::detail::atomic_mul(&target.element<Target>(target_index),
                              static_cast<Target>(source.element<Source>(source_index)));
@@ -286,8 +286,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::COUNT_VALID>;
     cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});
 
@@ -323,8 +321,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::ARGMAX>;
     auto old     = cudf::detail::atomic_cas(
       &target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
@@ -349,8 +345,6 @@ struct update_target_element<
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    if (source.is_null(source_index)) { return; }
-
     using Target = target_type_t<Source, aggregation::ARGMIN>;
     auto old     = cudf::detail::atomic_cas(
       &target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
@@ -376,6 +370,9 @@ struct elementwise_aggregator {
                              column_device_view source,
                              size_type source_index) const noexcept
   {
+    if constexpr (k != cudf::aggregation::COUNT_ALL) {
+      if (source.is_null(source_index)) { return; }
+    }
     update_target_element<Source, k>{}(target, target_index, source, source_index);
   }
 };
diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp
index ec5a511bb7c..486808ebe18 100644
--- a/cpp/include/cudf/detail/aggregation/result_cache.hpp
+++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp
@@ -19,7 +19,6 @@
 #include <cudf/column/column.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
-#include <cudf/types.hpp>
 
 #include <unordered_map>
 
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index dfb646c66c4..4159e324472 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -19,6 +19,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -36,7 +37,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -256,7 +256,7 @@ struct scatter_gather_functor {
 
     cudf::detail::grid_1d grid{input.size(), block_size, per_thread};
 
-    rmm::device_scalar<cudf::size_type> null_count{0, stream};
+    cudf::detail::device_scalar<cudf::size_type> null_count{0, stream};
     if (output.nullable()) {
       // Have to initialize the output mask to all zeros because we may update
       // it with atomicOr().
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index a70cd5a0661..5dc75b1a3fb 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -19,12 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <rmm/device_scalar.hpp>
-
 #include <cuda/std/optional>
 #include <thrust/iterator/iterator_traits.h>
 
@@ -171,7 +170,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
 
   // if we have validity in the output
   if (nullable) {
-    rmm::device_scalar<size_type> valid_count{0, stream};
+    cudf::detail::device_scalar<size_type> valid_count{0, stream};
 
     // call the kernel
     copy_if_else_kernel<block_size, Element, LeftIter, RightIter, FilterFn, true>
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 3aa136d630b..fcb80fe45f7 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -18,6 +18,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -27,7 +28,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda_runtime.h>
@@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin,
   auto grid = cudf::detail::grid_1d{num_items, block_size, 1};
 
   if (target.nullable()) {
-    rmm::device_scalar<size_type> null_count(target.null_count(), stream);
+    cudf::detail::device_scalar<size_type> null_count(target.null_count(), stream);
 
     auto kernel =
       copy_range_kernel<block_size, SourceValueIterator, SourceValidityIterator, T, true>;
diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp
new file mode 100644
index 00000000000..16ca06c6561
--- /dev/null
+++ b/cpp/include/cudf/detail/device_scalar.hpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+template <typename T>
+class device_scalar : public rmm::device_scalar<T> {  // adds a 1-element host staging buffer
+ public:
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
+  ~device_scalar() = default;
+
+// Hand-written move constructor; it does exactly what the compiler-generated one would do.
+// `= default` cannot be used here because the 11.8 compiler fails to generate it.
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
+  device_scalar(device_scalar&& other) noexcept
+    : rmm::device_scalar<T>{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)}
+  {
+  }
+  device_scalar& operator=(device_scalar&&) noexcept = default;
+
+  device_scalar(device_scalar const&)            = delete;
+  device_scalar& operator=(device_scalar const&) = delete;
+
+  device_scalar() = delete;  // every constructor below requires a stream
+
+  explicit device_scalar(
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
+    : rmm::device_scalar<T>(stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
+  {
+  }
+
+  explicit device_scalar(
+    T const& initial_value,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
+    : rmm::device_scalar<T>(stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
+  {
+    bounce_buffer[0] = initial_value;  // stage the value in the host buffer first
+    cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
+  }
+
+  device_scalar(device_scalar const& other,
+                rmm::cuda_stream_view stream,
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
+    : rmm::device_scalar<T>(other, stream, mr), bounce_buffer{make_host_vector<T>(1, stream)}
+  {
+  }
+
+  [[nodiscard]] T value(rmm::cuda_stream_view stream) const
+  {
+    cuda_memcpy<T>(bounce_buffer, device_span<T const>{this->data(), 1}, stream);
+    return bounce_buffer[0];  // host-side copy of the device value fetched above
+  }
+
+  void set_value_async(T const& value, rmm::cuda_stream_view stream)
+  {
+    bounce_buffer[0] = value;
+    cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
+  }
+
+  void set_value_async(T&& value, rmm::cuda_stream_view stream)
+  {
+    bounce_buffer[0] = std::move(value);
+    cuda_memcpy_async<T>(device_span<T>{this->data(), 1}, bounce_buffer, stream);
+  }
+
+  void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); }
+
+ private:
+  mutable cudf::detail::host_vector<T> bounce_buffer;  // mutable: const value() writes to it
+};
+
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp
index 4b74d12f306..26b1bec2ced 100644
--- a/cpp/include/cudf/detail/is_element_valid.hpp
+++ b/cpp/include/cudf/detail/is_element_valid.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 4349e1b70fd..30f36d6a5da 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -38,18 +38,19 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 
 #include <cuda/std/optional>
+#include <cuda/std/type_traits>
+#include <cuda/std/utility>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
 
-#include <utility>
-
 namespace cudf {
 namespace detail {
 /**
  * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a
- * `thrust::counting_iterator`.
+ * `thrust::counting_iterator` within the range [0, INT_MAX].
+ *
  *
  * Example:
  * @code{.cpp}
@@ -62,14 +63,21 @@ namespace detail {
  * iter[n] == n * n
  * @endcode
  *
- * @param start The starting value of the counting iterator
+ * @param start The starting value of the counting iterator (must be size_type or a smaller type).
  * @param f The unary function to apply to the counting iterator.
  * @return A transform iterator that applies `f` to a counting iterator
  */
-template <typename UnaryFunction>
-CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start,
+template <typename CountingIterType, typename UnaryFunction>
+CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start,
                                                               UnaryFunction f)
 {
+  // Check if the `start` for counting_iterator is of size_type or a smaller integral type
+  static_assert(
+    cuda::std::is_integral_v<CountingIterType> and
+      cuda::std::numeric_limits<CountingIterType>::digits <=
+        cuda::std::numeric_limits<cudf::size_type>::digits,
+    "The `start` for the counting_transform_iterator must be size_type or smaller type");
+
   return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f);
 }
 
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index 327c732716c..025e2ccc3ec 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/valid_if.cuh>
@@ -25,7 +26,6 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <cub/block/block_reduce.cuh>
@@ -165,17 +165,10 @@ size_type inplace_bitmask_binop(Binop op,
                "Mask pointer cannot be null");
 
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
-  rmm::device_scalar<size_type> d_counter{0, stream, mr};
-  rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
-  rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyDefault, stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
-                                masks_begin_bits.data(),
-                                masks_begin_bits.size_bytes(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::device_scalar<size_type> d_counter{0, stream, mr};
+
+  auto d_masks      = cudf::detail::make_device_uvector_async(masks, stream, mr);
+  auto d_begin_bits = cudf::detail::make_device_uvector_async(masks_begin_bits, stream, mr);
 
   auto constexpr block_size = 256;
   cudf::detail::grid_1d config(dest_mask.size(), block_size);
diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp
index 18b1e9b2d2e..0f852db0c54 100644
--- a/cpp/include/cudf/detail/unary.hpp
+++ b/cpp/include/cudf/detail/unary.hpp
@@ -59,7 +59,7 @@ std::unique_ptr<column> true_if(InputIterator begin,
   auto output_mutable_view = output->mutable_view();
   auto output_data         = output_mutable_view.data<bool>();
 
-  thrust::transform(rmm::exec_policy(stream), begin, end, output_data, p);
+  thrust::transform(rmm::exec_policy_nosync(stream), begin, end, output_data, p);
 
   return output;
 }
diff --git a/cpp/include/cudf/detail/utilities/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp
index 75f738f7529..78be5b91248 100644
--- a/cpp/include/cudf/detail/utilities/batched_memset.hpp
+++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp
@@ -53,8 +53,8 @@ void batched_memset(std::vector<cudf::device_span<T>> const& bufs,
     cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref());
 
   // get a vector with the sizes of all buffers
-  auto sizes = cudf::detail::make_counting_transform_iterator(
-    static_cast<std::size_t>(0),
+  auto sizes = thrust::make_transform_iterator(
+    thrust::counting_iterator<std::size_t>(0),
     cuda::proclaim_return_type<std::size_t>(
       [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); }));
 
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index cfb2e70bfed..af182b69c3a 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/types.hpp>
@@ -25,7 +26,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 
 #include <thrust/distance.h>
 
@@ -101,7 +101,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(InputIterator begin,
 
   size_type null_count{0};
   if (size > 0) {
-    rmm::device_scalar<size_type> valid_count{0, stream};
+    cudf::detail::device_scalar<size_type> valid_count{0, stream};
 
     constexpr size_type block_size{256};
     grid_1d grid{size, block_size};
diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
index 5596f78a90b..0a799f27d00 100644
--- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 
 /**
diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
similarity index 99%
rename from cpp/include/cudf/fixed_point/floating_conversion.hpp
rename to cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
index f0d50edccd1..fce08b4a5c4 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
@@ -26,14 +26,6 @@
 #include <cstring>
 
 namespace CUDF_EXPORT numeric {
-
-/**
- * @addtogroup floating_conversion
- * @{
- * @file
- * @brief fixed_point <--> floating-point conversion functions.
- */
-
 namespace detail {
 
 /**
@@ -1141,6 +1133,4 @@ CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& val
 }
 
 }  // namespace detail
-
-/** @} */  // end of group
 }  // namespace CUDF_EXPORT numeric
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 0c5327edb91..307a52cd242 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -22,26 +22,27 @@
 
 namespace CUDF_EXPORT cudf {
 
-/**
- * @addtogroup column_hash
- * @{
- * @file
- */
-
 /**
  * @brief Type of hash value
- *
+ * @ingroup column_hash
  */
 using hash_value_type = uint32_t;
 
 /**
  * @brief The default seed value for hash functions
+ * @ingroup column_hash
  */
 static constexpr uint32_t DEFAULT_HASH_SEED = 0;
 
 //! Hash APIs
 namespace hashing {
 
+/**
+ * @addtogroup column_hash
+ * @{
+ * @file
+ */
+
 /**
  * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table
  *
@@ -183,7 +184,8 @@ std::unique_ptr<column> xxhash_64(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/** @} */  // end of group
+
 }  // namespace hashing
 
-/** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp
index 1827ba0e3e6..13a76d50346 100644
--- a/cpp/include/cudf/io/config_utils.hpp
+++ b/cpp/include/cudf/io/config_utils.hpp
@@ -18,7 +18,8 @@
 #include <cudf/utilities/export.hpp>
 
 namespace CUDF_EXPORT cudf {
-namespace io::cufile_integration {
+namespace io {
+namespace cufile_integration {
 
 /**
  * @brief Returns true if cuFile and its compatibility mode are enabled.
@@ -35,9 +36,15 @@ bool is_gds_enabled();
  */
 bool is_kvikio_enabled();
 
-}  // namespace io::cufile_integration
+/**
+ * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If
+ * KVIKIO_NTHREADS is not set, use 8 threads by default.
+ */
+void set_thread_pool_nthreads_from_env();
+
+}  // namespace cufile_integration
 
-namespace io::nvcomp_integration {
+namespace nvcomp_integration {
 
 /**
  * @brief Returns true if all nvCOMP uses are enabled.
@@ -49,5 +56,6 @@ bool is_all_enabled();
  */
 bool is_stable_enabled();
 
-}  // namespace io::nvcomp_integration
+}  // namespace nvcomp_integration
+}  // namespace io
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index 7d2cc4ad493..7bec40893fd 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -79,7 +79,7 @@ class datasource {
     template <typename Container>
     static std::unique_ptr<buffer> create(Container&& data_owner)
     {
-      return std::make_unique<owning_buffer<Container>>(std::move(data_owner));
+      return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
     }
   };
 
@@ -335,13 +335,19 @@ class datasource {
   template <typename Container>
   class owning_buffer : public buffer {
    public:
+    // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
+    // reference).
+    static_assert(std::is_rvalue_reference_v<Container&&>,
+                  "The container argument passed to the constructor must be an rvalue.");
+
     /**
      * @brief Moves the input container into the newly created object.
      *
-     * @param data_owner The container to construct the buffer from (ownership is transferred)
+     * @param moved_data_owner The container to construct the buffer from. Callers should explicitly
+     * pass std::move(data_owner) to this function to transfer the ownership.
      */
-    owning_buffer(Container&& data_owner)
-      : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size())
+    owning_buffer(Container&& moved_data_owner)
+      : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
     {
     }
 
@@ -349,12 +355,13 @@ class datasource {
      * @brief Moves the input container into the newly created object, and exposes a subspan of the
      * buffer.
      *
-     * @param data_owner The container to construct the buffer from (ownership is transferred)
+     * @param moved_data_owner The container to construct the buffer from. Callers should explicitly
+     * pass std::move(data_owner) to this function to transfer the ownership.
      * @param data_ptr Pointer to the start of the subspan
      * @param size The size of the subspan
      */
-    owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size)
-      : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size)
+    owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
+      : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
     {
     }
 
diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
index 11eb4518210..5659f86b0c4 100644
--- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
+++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
@@ -16,16 +16,10 @@
 
 #pragma once
 
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <zlib.h>
-
-#include <algorithm>
-#include <array>
 #include <fstream>
-#include <limits>
 
 namespace CUDF_EXPORT cudf {
 namespace io::text::detail::bgzip {
diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index 385da993262..f9a68e4fffc 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -70,6 +70,7 @@ enum class hash_id {
  * @param partition_map Non-nullable column of integer values that map each row
  * in `t` to it's partition.
  * @param num_partitions The total number of partitions
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return Pair containing the reordered table and vector of `num_partitions +
  * 1` offsets to each partition such that the size of partition `i` is
@@ -79,6 +80,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
@@ -242,6 +244,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
  * @param[in] input The input table to be round-robin partitioned
  * @param[in] num_partitions Number of partitions for the table
  * @param[in] start_partition Index of the 1st partition
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  *
  * @return A std::pair consisting of a unique_ptr to the partitioned table
@@ -251,6 +254,7 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robi
   table_view const& input,
   cudf::size_type num_partitions,
   cudf::size_type start_partition   = 0,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp
index ba56ff34b97..158e6df7e5f 100644
--- a/cpp/include/cudf/round.hpp
+++ b/cpp/include/cudf/round.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
@@ -66,6 +67,7 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN };
  * @param decimal_places Number of decimal places to round to (default 0). If negative, this
  * specifies the number of positions to the left of the decimal point.
  * @param method         Rounding method
+ * @param stream         CUDA stream used for device memory operations and kernel launches
  * @param mr             Device memory resource used to allocate the returned column's device memory
  *
  * @return Column with each of the values rounded
@@ -74,6 +76,7 @@ std::unique_ptr<column> round(
   column_view const& input,
   int32_t decimal_places            = 0,
   rounding_method method            = rounding_method::HALF_UP,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index 66be2a12fbe..360dde11fc0 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -94,8 +95,8 @@ class scalar {
   [[nodiscard]] bool const* validity_data() const;
 
  protected:
-  data_type _type{type_id::EMPTY};     ///< Logical type of value in the scalar
-  rmm::device_scalar<bool> _is_valid;  ///< Device bool signifying validity
+  data_type _type{type_id::EMPTY};              ///< Logical type of value in the scalar
+  cudf::detail::device_scalar<bool> _is_valid;  ///< Device bool signifying validity
 
   /**
    * @brief Move constructor for scalar.
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index fb0b25cf9f1..de2f1770e28 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -65,19 +65,20 @@ rmm::device_uvector<char> make_chars_buffer(column_view const& offsets,
   auto chars_data      = rmm::device_uvector<char>(chars_size, stream, mr);
   auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
 
-  auto const src_ptrs = cudf::detail::make_counting_transform_iterator(
-    0u, cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
+  auto const src_ptrs = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<uint32_t>(0),
+    cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
       // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586),
       // we have to use `const_cast` to remove `const` qualifier from the source pointer.
       // This should be fine as long as we only read but not write anything to the source.
       return reinterpret_cast<void*>(const_cast<char*>(begin[idx].first));
     }));
-  auto const src_sizes = cudf::detail::make_counting_transform_iterator(
-    0u, cuda::proclaim_return_type<size_type>([begin] __device__(uint32_t idx) {
-      return begin[idx].second;
-    }));
-  auto const dst_ptrs = cudf::detail::make_counting_transform_iterator(
-    0u,
+  auto const src_sizes = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<uint32_t>(0),
+    cuda::proclaim_return_type<size_type>(
+      [begin] __device__(uint32_t idx) { return begin[idx].second; }));
+  auto const dst_ptrs = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<uint32_t>(0),
     cuda::proclaim_return_type<char*>([offsets = d_offsets, output = chars_data.data()] __device__(
                                         uint32_t idx) { return output + offsets[idx]; }));
 
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 53e0f3a15d2..046e9745a71 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
+#include <cudf/fixed_point/detail/floating_conversion.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/fixed_point/floating_conversion.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp
index 97a42243250..3e740b81cc9 100644
--- a/cpp/include/cudf/utilities/default_stream.hpp
+++ b/cpp/include/cudf/utilities/default_stream.hpp
@@ -16,10 +16,8 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace CUDF_EXPORT cudf {
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index d558cfb5e85..21ee4fa9e9b 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -425,21 +425,21 @@ class base_2dspan {
    *
    * @return A pointer to the first element of the span
    */
-  constexpr auto data() const noexcept { return _flat.data(); }
+  [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); }
 
   /**
    * @brief Returns the size in the span as pair.
    *
    * @return pair representing rows and columns size of the span
    */
-  constexpr auto size() const noexcept { return _size; }
+  [[nodiscard]] constexpr auto size() const noexcept { return _size; }
 
   /**
    * @brief Returns the number of elements in the span.
    *
    * @return Number of elements in the span
    */
-  constexpr auto count() const noexcept { return _flat.size(); }
+  [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); }
 
   /**
    * @brief Checks if the span is empty.
@@ -467,7 +467,7 @@ class base_2dspan {
    *
    * @return A flattened span of the 2D span
    */
-  constexpr RowType<T, dynamic_extent> flat_view() const { return _flat; }
+  [[nodiscard]] constexpr RowType<T, dynamic_extent> flat_view() const { return _flat; }
 
   /**
    * @brief Construct a 2D span from another 2D span of convertible type
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 3f37ae02151..cf8413b597f 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -22,8 +22,6 @@
 #include <cudf/wrappers/durations.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <cuda/std/type_traits>
-
 namespace CUDF_EXPORT cudf {
 
 /**
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 15b5f921c1b..6351a84e38f 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 7c909f1a948..42124461cdf 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext {
  *
  * This function uses MurmurHash3_x86_32 for the hash algorithm.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if the width < 2
  *
  * @param input Strings column to compute minhash
@@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext {
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Minhash values for each string in input
  */
-std::unique_ptr<cudf::column> minhash(
+[[deprecated]] std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
   cudf::numeric_scalar<uint32_t> seed = 0,
   cudf::size_type width               = 4,
@@ -71,6 +73,8 @@ std::unique_ptr<cudf::column> minhash(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12 - to be replaced in a future release
+ *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
@@ -83,7 +87,7 @@ std::unique_ptr<cudf::column> minhash(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> minhash(
+[[deprecated]] std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
   cudf::device_span<uint32_t const> seeds,
   cudf::size_type width             = 4,
@@ -102,6 +106,8 @@ std::unique_ptr<cudf::column> minhash(
  * The hash function returns 2 uint64 values but only the first value
  * is used with the minhash calculation.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if the width < 2
  *
  * @param input Strings column to compute minhash
@@ -112,7 +118,7 @@ std::unique_ptr<cudf::column> minhash(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Minhash values as UINT64 for each string in input
  */
-std::unique_ptr<cudf::column> minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> minhash64(
   cudf::strings_column_view const& input,
   cudf::numeric_scalar<uint64_t> seed = 0,
   cudf::size_type width               = 4,
@@ -132,6 +138,8 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12 - to be replaced in a future release
+ *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
@@ -144,7 +152,7 @@ std::unique_ptr<cudf::column> minhash64(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> minhash64(
   cudf::strings_column_view const& input,
   cudf::device_span<uint64_t const> seeds,
   cudf::size_type width             = 4,
@@ -164,6 +172,8 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
@@ -173,7 +183,7 @@ std::unique_ptr<cudf::column> minhash64(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> word_minhash(
+[[deprecated]] std::unique_ptr<cudf::column> word_minhash(
   cudf::lists_column_view const& input,
   cudf::device_span<uint32_t const> seeds,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -193,6 +203,8 @@ std::unique_ptr<cudf::column> word_minhash(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
@@ -202,7 +214,7 @@ std::unique_ptr<cudf::column> word_minhash(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> word_minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> word_minhash64(
   cudf::lists_column_view const& input,
   cudf::device_span<uint64_t const> seeds,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp
index bbd0503379b..822edcbdb43 100644
--- a/cpp/include/nvtext/replace.hpp
+++ b/cpp/include/nvtext/replace.hpp
@@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext {
  *                  The default of empty string will identify tokens using whitespace.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New strings columns of with replaced strings
+ * @return New strings column with replaced strings
  */
 std::unique_ptr<cudf::column> replace_tokens(
   cudf::strings_column_view const& input,
@@ -131,7 +131,7 @@ std::unique_ptr<cudf::column> replace_tokens(
  *                  The default of empty string will identify tokens using whitespace.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New strings columns of with replaced strings
+ * @return New strings column of filtered strings
  */
 std::unique_ptr<cudf::column> filter_tokens(
   cudf::strings_column_view const& input,
diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp
index 55a4124bfd0..e5b2a4cc21b 100644
--- a/cpp/include/nvtext/stemmer.hpp
+++ b/cpp/include/nvtext/stemmer.hpp
@@ -51,7 +51,7 @@ enum class letter_type {
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * b1 = is_letter(st, VOWEL, 1)
  * b1 is now [false, true, true]
  * @endcode
@@ -62,7 +62,7 @@ enum class letter_type {
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string
  * b2 is now [false, true, false]
  * @endcode
@@ -99,7 +99,7 @@ std::unique_ptr<cudf::column> is_letter(
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * ix = [3, 1, 4]
  * b1 = is_letter(st, VOWEL, ix)
  * b1 is now [true, true, false]
@@ -111,7 +111,7 @@ std::unique_ptr<cudf::column> is_letter(
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * ix = [3, -2, 4] // 2nd to last character in st[1] is checked
  * b2 = is_letter(st, CONSONANT, ix)
  * b2 is now [false, false, true]
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index e61601c6fea..e345587f88b 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -292,7 +292,7 @@ std::unique_ptr<tokenize_vocabulary> load_vocabulary(
  * @throw cudf::logic_error if `delimiter` is invalid
  *
  * @param input Strings column to tokenize
- * @param vocabulary Used to lookup tokens within
+ * @param vocabulary Used to lookup tokens within `input`
  * @param delimiter Used to identify tokens within `input`
  * @param default_id The token id to be used for tokens not found in the `vocabulary`;
  *                   Default is -1
diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp
index 3b650d791aa..5815ce33e33 100644
--- a/cpp/src/ast/expression_parser.cpp
+++ b/cpp/src/ast/expression_parser.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,6 @@
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp
index b45b9d0c78c..4c2b56dd4f5 100644
--- a/cpp/src/ast/expressions.cpp
+++ b/cpp/src/ast/expressions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,9 +17,6 @@
 #include <cudf/ast/detail/expression_transformer.hpp>
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index a6c878efbbc..1b23ea12a5e 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -27,15 +27,10 @@
 #include <cudf/detail/binaryop.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 4ca05f9c335..e6659f76c7c 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -32,7 +33,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <cub/cub.cuh>
@@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask,
 
   cudf::detail::grid_1d grid(num_words, block_size);
 
-  rmm::device_scalar<size_type> non_zero_count(0, stream);
+  cudf::detail::device_scalar<size_type> non_zero_count(0, stream);
 
   count_set_bits_kernel<block_size>
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
index 482413d0ccb..972f97e8668 100644
--- a/cpp/src/column/column_factories.cpp
+++ b/cpp/src/column/column_factories.cpp
@@ -15,19 +15,13 @@
  */
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/fill.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/strings/detail/fill.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <thrust/iterator/constant_iterator.h>
-
 namespace cudf {
 namespace {
 struct size_of_helper {
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index 386c5ebe478..e831aa9645d 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf/column/column_view.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -27,9 +26,7 @@
 #include <thrust/iterator/transform_iterator.h>
 
 #include <algorithm>
-#include <exception>
 #include <numeric>
-#include <string>
 #include <vector>
 
 namespace cudf {
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b8e140f1fa5..d8419760120 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -19,6 +19,7 @@
 #include <cudf/concatenate.hpp>
 #include <cudf/detail/concatenate_masks.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -162,7 +163,7 @@ size_type concatenate_masks(device_span<column_device_view const> d_views,
                             size_type output_size,
                             rmm::cuda_stream_view stream)
 {
-  rmm::device_scalar<size_type> d_valid_count(0, stream);
+  cudf::detail::device_scalar<size_type> d_valid_count(0, stream);
   constexpr size_type block_size{256};
   cudf::detail::grid_1d config(output_size, block_size);
   concatenate_masks_kernel<block_size>
@@ -265,7 +266,7 @@ std::unique_ptr<column> fused_concatenate(host_span<column_view const> views,
   auto out_view     = out_col->mutable_view();
   auto d_out_view   = mutable_column_device_view::create(out_view, stream);
 
-  rmm::device_scalar<size_type> d_valid_count(0, stream);
+  cudf::detail::device_scalar<size_type> d_valid_count(0, stream);
 
   // Launch kernel
   constexpr size_type block_size{256};
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index d60fb5ce110..5e2065ba844 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -20,16 +20,11 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <thrust/iterator/transform_iterator.h>
-
 #include <algorithm>
 
 namespace cudf {
diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu
index 29a28f81d1a..80b0bd5242f 100644
--- a/cpp/src/copying/get_element.cu
+++ b/cpp/src/copying/get_element.cu
@@ -17,6 +17,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/copying.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/is_element_valid.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -71,7 +72,7 @@ struct get_element_functor {
     auto device_col = column_device_view::create(input, stream);
 
     rmm::device_scalar<string_view> temp_data(stream, mr);
-    rmm::device_scalar<bool> temp_valid(stream, mr);
+    cudf::detail::device_scalar<bool> temp_valid(stream, mr);
 
     device_single_thread(
       [buffer   = temp_data.data(),
@@ -155,8 +156,8 @@ struct get_element_functor {
 
     auto device_col = column_device_view::create(input, stream);
 
-    rmm::device_scalar<Type> temp_data(stream, mr);
-    rmm::device_scalar<bool> temp_valid(stream, mr);
+    cudf::detail::device_scalar<Type> temp_data(stream, mr);
+    cudf::detail::device_scalar<bool> temp_valid(stream, mr);
 
     device_single_thread(
       [buffer   = temp_data.data(),
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index 1282eec6c44..a001807c82b 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -18,7 +18,6 @@
 #include <cudf/detail/contiguous_split.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp
index 832a72ed5b0..116e3516460 100644
--- a/cpp/src/copying/split.cpp
+++ b/cpp/src/copying/split.cpp
@@ -14,10 +14,8 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 2196ee97fee..f786624680c 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -13,12 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/timezone.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <algorithm>
 #include <filesystem>
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
new file mode 100644
index 00000000000..59457bea694
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_groupby.hpp"
+#include "compute_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "sparse_to_dense_results.hpp"
+
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/null_mask.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <cuco/static_set.cuh>
+
+#include <iterator>
+#include <memory>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
+                                                      size_type num_keys,
+                                                      rmm::cuda_stream_view stream)
+{
+  rmm::device_uvector<size_type> populated_keys(num_keys, stream);
+  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
+
+  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
+  return populated_keys;
+}
+
+template <typename Equal, typename Hash>
+std::unique_ptr<table> compute_groupby(table_view const& keys,
+                                       host_span<aggregation_request const> requests,
+                                       bool skip_rows_with_nulls,
+                                       Equal const& d_row_equal,
+                                       Hash const& d_row_hash,
+                                       cudf::detail::result_cache* cache,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+{
+  // convert to int64_t to avoid potential overflow with large `keys`
+  auto const num_keys = static_cast<int64_t>(keys.num_rows());
+
+  // Cache of sparse results where the location of aggregate value in each
+  // column is indexed by the hash set
+  cudf::detail::result_cache sparse_results(requests.size());
+
+  auto const set = cuco::static_set{
+    num_keys,
+    cudf::detail::CUCO_DESIRED_LOAD_FACTOR,  // 50% load factor
+    cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+    d_row_equal,
+    probing_scheme_t{d_row_hash},
+    cuco::thread_scope_device,
+    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
+
+  auto row_bitmask =
+    skip_rows_with_nulls
+      ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
+      : rmm::device_buffer{};
+
+  // Compute all single pass aggs first
+  compute_single_pass_aggs(num_keys,
+                           skip_rows_with_nulls,
+                           static_cast<bitmask_type*>(row_bitmask.data()),
+                           set.ref(cuco::insert_and_find),
+                           requests,
+                           &sparse_results,
+                           stream);
+
+  // Extract the populated indices from the hash set and create a gather map.
+  // Gathering using this map from sparse results will give dense results.
+  auto gather_map = extract_populated_keys(set, keys.num_rows(), stream);
+
+  // Compact all results from sparse_results and insert into cache
+  sparse_to_dense_results(requests,
+                          &sparse_results,
+                          cache,
+                          gather_map,
+                          set.ref(cuco::find),
+                          static_cast<bitmask_type*>(row_bitmask.data()),
+                          stream,
+                          mr);
+
+  return cudf::detail::gather(keys,
+                              gather_map,
+                              out_of_bounds_policy::DONT_CHECK,
+                              cudf::detail::negative_index_policy::NOT_ALLOWED,
+                              stream,
+                              mr);
+}
+
+template rmm::device_uvector<size_type> extract_populated_keys<global_set_t>(
+  global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
+
+template rmm::device_uvector<size_type> extract_populated_keys<nullable_global_set_t>(
+  nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream);
+
+template std::unique_ptr<table> compute_groupby<row_comparator_t, row_hash_t>(
+  table_view const& keys,
+  host_span<aggregation_request const> requests,
+  bool skip_rows_with_nulls,
+  row_comparator_t const& d_row_equal,
+  row_hash_t const& d_row_hash,
+  cudf::detail::result_cache* cache,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<table> compute_groupby<nullable_row_comparator_t, row_hash_t>(
+  table_view const& keys,
+  host_span<aggregation_request const> requests,
+  bool skip_rows_with_nulls,
+  nullable_row_comparator_t const& d_row_equal,
+  row_hash_t const& d_row_hash,
+  cudf::detail::result_cache* cache,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp
new file mode 100644
index 00000000000..7bb3a60ff07
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_groupby.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <memory>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes and returns a device vector containing all populated keys in
+ * `key_set`.
+ *
+ * @tparam SetType Type of key hash set
+ *
+ * @param key_set Key hash set
+ * @param num_keys Number of input keys
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return An array of unique keys contained in `key_set`
+ */
+template <typename SetType>
+rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
+                                                      size_type num_keys,
+                                                      rmm::cuda_stream_view stream);
+
+/**
+ * @brief Computes groupby using hash table.
+ *
+ * First, we create a hash set that stores the indices of unique rows in
+ * `keys`. The upper limit on the number of entries in this set is the number
+ * of rows in `keys`.
+ *
+ * To store the results of aggregations, we create temporary sparse columns
+ * which have the same size as input value columns. Using the hash set, we
+ * determine the location within the sparse column to write the result of the
+ * aggregation into.
+ *
+ * The sparse column results of all aggregations are stored into the cache
+ * `sparse_results`. This enables the use of previously calculated results in
+ * other aggregations.
+ *
+ * All the aggregations which can be computed in a single pass are computed
+ * first, in a combined kernel. Then, using these results, the aggregations
+ * that require multiple passes are computed.
+ *
+ * Finally, using the hash set, we generate a vector of indices of populated
+ * values in sparse result columns. Then, for each aggregation originally
+ * requested in `requests`, we gather sparse results into a column of dense
+ * results using the aforementioned index vector. Dense results are stored into
+ * the in/out parameter `cache`.
+ *
+ * @tparam Equal Device row comparator type
+ * @tparam Hash Device row hasher type
+ *
+ * @param keys Table whose rows act as the groupby keys
+ * @param requests The set of columns to aggregate and the aggregations to perform
+ * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not
+ * @param d_row_equal Device row comparator
+ * @param d_row_hash Device row hasher
+ * @param cache Dense aggregation results
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table
+ * @return Table of unique keys
+ */
+template <typename Equal, typename Hash>
+std::unique_ptr<cudf::table> compute_groupby(table_view const& keys,
+                                             host_span<aggregation_request const> requests,
+                                             bool skip_rows_with_nulls,
+                                             Equal const& d_row_equal,
+                                             Hash const& d_row_hash,
+                                             cudf::detail::result_cache* cache,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu
new file mode 100644
index 00000000000..519d7cd2f1c
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_mapping_indices.cuh"
+#include "compute_mapping_indices.hpp"
+
+namespace cudf::groupby::detail::hash {
+template cudf::size_type max_occupancy_grid_size<hash_set_ref_t<cuco::insert_and_find_tag>>(
+  cudf::size_type n);
+
+template void compute_mapping_indices<hash_set_ref_t<cuco::insert_and_find_tag>>(
+  cudf::size_type grid_size,
+  cudf::size_type num,
+  hash_set_ref_t<cuco::insert_and_find_tag> global_set,
+  bitmask_type const* row_bitmask,
+  bool skip_rows_with_nulls,
+  cudf::size_type* local_mapping_index,
+  cudf::size_type* global_mapping_index,
+  cudf::size_type* block_cardinality,
+  cuda::std::atomic_flag* needs_global_memory_fallback,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
new file mode 100644
index 00000000000..d353830780f
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_mapping_indices.hpp"
+#include "helpers.cuh"
+
+#include <cudf/detail/cuco_helpers.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cooperative_groups.h>
+#include <cuco/static_set_ref.cuh>
+#include <cuda/std/atomic>
+#include <cuda/std/utility>
+
+#include <algorithm>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+__device__ void find_local_mapping(cooperative_groups::thread_block const& block,
+                                   cudf::size_type idx,
+                                   cudf::size_type num_input_rows,
+                                   SetType shared_set,
+                                   bitmask_type const* row_bitmask,
+                                   bool skip_rows_with_nulls,
+                                   cudf::size_type* cardinality,
+                                   cudf::size_type* local_mapping_index,
+                                   cudf::size_type* shared_set_indices)
+{
+  auto const is_valid_input =
+    idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx));
+  auto const [result_idx, inserted] = [&]() {
+    if (is_valid_input) {
+      auto const result      = shared_set.insert_and_find(idx);
+      auto const matched_idx = *result.first;
+      auto const inserted    = result.second;
+      // first insertion of this key in the block: assign it the next block-local rank
+      if (result.second) {
+        auto const shared_set_index          = atomicAdd(cardinality, 1);
+        shared_set_indices[shared_set_index] = idx;
+        local_mapping_index[idx]             = shared_set_index;
+      }
+      return cuda::std::pair{matched_idx, inserted};
+    }
+    return cuda::std::pair{0, false};  // placeholder; not read when the input is invalid
+  }();
+  // Sync the block so that the `local_mapping_index` entries written above by inserting threads
+  // are visible to the threads that read them below.
+  block.sync();
+  if (is_valid_input) {
+    // key already present: reuse the rank recorded for the row that inserted it
+    if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; }
+  }
+}
+
+template <typename SetRef>
+__device__ void find_global_mapping(cooperative_groups::thread_block const& block,
+                                    cudf::size_type cardinality,
+                                    SetRef global_set,
+                                    cudf::size_type* shared_set_indices,
+                                    cudf::size_type* global_mapping_index)
+{
+  // For each unique key recorded in `shared_set_indices`, insert it into the global hash set and
+  // store the matching global-set entry in this block's section of `global_mapping_index`
+  for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+    auto const input_idx = shared_set_indices[idx];
+    global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] =
+      *global_set.insert_and_find(input_idx).first;
+  }
+}
+
+/**
+ * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given
+ * row index in `local_mapping_index`. If the number of unique keys found in a threadblock reaches
+ * `GROUPBY_CARDINALITY_THRESHOLD`, that block sets `needs_global_memory_fallback` and skips
+ * updating `global_set` and `global_mapping_index`. Otherwise, the unique keys are inserted into
+ * the global hash set and the matching global-set entries are saved in `global_mapping_index`.
+ */
+template <class SetRef>
+CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
+                                        SetRef global_set,
+                                        bitmask_type const* row_bitmask,
+                                        bool skip_rows_with_nulls,
+                                        cudf::size_type* local_mapping_index,
+                                        cudf::size_type* global_mapping_index,
+                                        cudf::size_type* block_cardinality,
+                                        cuda::std::atomic_flag* needs_global_memory_fallback)
+{
+  __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
+
+  // Block-local hash set backed by shared memory, reusing the global set's equality and hash
+  __shared__ cuco::window<cudf::size_type, GROUPBY_WINDOW_SIZE> windows[window_extent.value()];
+
+  auto raw_set = cuco::static_set_ref{
+    cuco::empty_key<cudf::size_type>{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+    global_set.key_eq(),
+    probing_scheme_t{global_set.hash_function()},
+    cuco::thread_scope_block,
+    cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, decltype(window_extent)>{
+      window_extent, windows}};
+  auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);
+
+  auto const block = cooperative_groups::this_thread_block();
+  shared_set.initialize(block);
+
+  __shared__ cudf::size_type cardinality;
+  if (block.thread_rank() == 0) { cardinality = 0; }
+  block.sync();
+
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+
+  for (auto idx = cudf::detail::grid_1d::global_thread_id();
+       idx - block.thread_rank() < num_input_rows;
+       idx += stride) {
+    find_local_mapping(block,
+                       idx,
+                       num_input_rows,
+                       shared_set,
+                       row_bitmask,
+                       skip_rows_with_nulls,
+                       &cardinality,
+                       local_mapping_index,
+                       shared_set_indices);
+
+    block.sync();
+    // NOTE(review): no sync follows this read of `cardinality`; confirm the break is block-uniform
+    if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) {
+      if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); }
+      break;
+    }
+  }
+
+  // Insert unique keys from the shared into the global hash set only if the block
+  // cardinality stayed below the threshold
+  if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) {
+    find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index);
+  }
+
+  if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; }
+}
+
+template <class SetRef>
+cudf::size_type max_occupancy_grid_size(cudf::size_type n)
+{
+  cudf::size_type max_active_blocks{-1};
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &max_active_blocks, mapping_indices_kernel<SetRef>, GROUPBY_BLOCK_SIZE, 0));
+  auto const grid_size  = max_active_blocks * cudf::detail::num_multiprocessors();
+  auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE);
+  return std::min(grid_size, num_blocks);
+}
+
+template <class SetRef>
+void compute_mapping_indices(cudf::size_type grid_size,
+                             cudf::size_type num,
+                             SetRef global_set,
+                             bitmask_type const* row_bitmask,
+                             bool skip_rows_with_nulls,
+                             cudf::size_type* local_mapping_index,
+                             cudf::size_type* global_mapping_index,
+                             cudf::size_type* block_cardinality,
+                             cuda::std::atomic_flag* needs_global_memory_fallback,
+                             rmm::cuda_stream_view stream)
+{
+  mapping_indices_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, 0, stream>>>(
+    num,
+    global_set,
+    row_bitmask,
+    skip_rows_with_nulls,
+    local_mapping_index,
+    global_mapping_index,
+    block_cardinality,
+    needs_global_memory_fallback);
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp
new file mode 100644
index 00000000000..473ad99e650
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cuda/std/atomic>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes the grid size for the mapping-indices kernel: the maximum number of blocks that
+ * can be simultaneously active on the device, capped at the number of blocks needed for `n`
+ */
+template <class SetRef>
+[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n);
+
+template <class SetRef>
+void compute_mapping_indices(cudf::size_type grid_size,
+                             cudf::size_type num,
+                             SetRef global_set,
+                             bitmask_type const* row_bitmask,
+                             bool skip_rows_with_nulls,
+                             cudf::size_type* local_mapping_index,
+                             cudf::size_type* global_mapping_index,
+                             cudf::size_type* block_cardinality,
+                             cuda::std::atomic_flag* needs_global_memory_fallback,
+                             rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu
new file mode 100644
index 00000000000..81c3c9e456f
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_mapping_indices.cuh"
+#include "compute_mapping_indices.hpp"
+
+namespace cudf::groupby::detail::hash {
+template cudf::size_type
+max_occupancy_grid_size<nullable_hash_set_ref_t<cuco::insert_and_find_tag>>(cudf::size_type n);
+
+template void compute_mapping_indices<nullable_hash_set_ref_t<cuco::insert_and_find_tag>>(
+  cudf::size_type grid_size,
+  cudf::size_type num,
+  nullable_hash_set_ref_t<cuco::insert_and_find_tag> global_set,
+  bitmask_type const* row_bitmask,
+  bool skip_rows_with_nulls,
+  cudf::size_type* local_mapping_index,
+  cudf::size_type* global_mapping_index,
+  cudf::size_type* block_cardinality,
+  cuda::std::atomic_flag* needs_global_memory_fallback,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
new file mode 100644
index 00000000000..12c02a1865e
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_shared_memory_aggs.hpp"
+#include "global_memory_aggregator.cuh"
+#include "helpers.cuh"
+#include "shared_memory_aggregator.cuh"
+#include "single_pass_functors.cuh"
+
+#include <cudf/aggregation.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+#include <cudf/utilities/bit.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cooperative_groups.h>
+#include <cuda/std/cstddef>
+
+namespace cudf::groupby::detail::hash {
+namespace {
+/// Type-dispatched functor that returns `sizeof(T)`, the byte size of the dispatched C++ type
+struct size_of_functor {
+  template <typename T>
+  __device__ constexpr cudf::size_type operator()()
+  {
+    return sizeof(T);
+  }
+};
+
+/// Byte alignment that shared memory region sizes in this file are rounded to
+CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;
+
+// Assigns shared-memory offsets for the result and validity arrays of output columns, starting at
+// `col_end`, until `total_agg_size` bytes are used up or no columns remain. Hits CUDF_UNREACHABLE
+// when the next column does not fit in the remaining space.
+__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
+                                               cudf::size_type& col_end,
+                                               cudf::mutable_table_device_view output_values,
+                                               cudf::size_type output_size,
+                                               cudf::size_type* shmem_agg_res_offsets,
+                                               cudf::size_type* shmem_agg_mask_offsets,
+                                               cudf::size_type cardinality,
+                                               cudf::size_type total_agg_size)
+{
+  col_start                       = col_end;
+  cudf::size_type bytes_allocated = 0;
+  // Bytes taken by one column's validity flags: one `bool` per `cardinality` entry, padded
+  auto const valid_col_size =
+    cudf::util::round_up_safe(static_cast<cudf::size_type>(sizeof(bool) * cardinality), ALIGNMENT);
+
+  while (bytes_allocated < total_agg_size && col_end < output_size) {
+    auto const col_idx = col_end;
+    auto const next_col_size =
+      cudf::util::round_up_safe(cudf::type_dispatcher<cudf::dispatch_storage_type>(
+                                  output_values.column(col_idx).type(), size_of_functor{}) *
+                                  cardinality,
+                                ALIGNMENT);
+    auto const next_col_total_size = next_col_size + valid_col_size;
+
+    if (bytes_allocated + next_col_total_size > total_agg_size) {
+      CUDF_UNREACHABLE("Not enough memory for shared memory aggregations");
+    }
+    // The validity flags are placed right after the column's result values
+    shmem_agg_res_offsets[col_end]  = bytes_allocated;
+    shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
+
+    bytes_allocated += next_col_total_size;
+    ++col_end;
+  }
+}
+
+// Each block initializes `cardinality` result/validity slots per column in [col_start, col_end)
+__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block,
+                                              cudf::size_type col_start,
+                                              cudf::size_type col_end,
+                                              cudf::mutable_table_device_view output_values,
+                                              cuda::std::byte* shmem_agg_storage,
+                                              cudf::size_type* shmem_agg_res_offsets,
+                                              cudf::size_type* shmem_agg_mask_offsets,
+                                              cudf::size_type cardinality,
+                                              cudf::aggregation::Kind const* d_agg_kinds)
+{
+  for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+    for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+      auto target =
+        reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+      auto target_mask =
+        reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
+      cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(),
+                                                  d_agg_kinds[col_idx],
+                                                  initialize_shmem{},
+                                                  target,
+                                                  target_mask,
+                                                  idx);
+    }
+  }
+  block.sync();  // every slot is initialized before any thread of the block continues
+}
+
+__device__ void compute_pre_aggregrations(cudf::size_type col_start,
+                                          cudf::size_type col_end,
+                                          bitmask_type const* row_bitmask,
+                                          bool skip_rows_with_nulls,
+                                          cudf::table_device_view source,
+                                          cudf::size_type num_input_rows,
+                                          cudf::size_type* local_mapping_index,
+                                          cuda::std::byte* shmem_agg_storage,
+                                          cudf::size_type* shmem_agg_res_offsets,
+                                          cudf::size_type* shmem_agg_mask_offsets,
+                                          cudf::aggregation::Kind const* d_agg_kinds)
+{
+  // Grid-stride loop folding each selected input row into its slot of the shared-memory targets
+  for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows;
+       source_idx += cudf::detail::grid_1d::grid_stride()) {
+    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) {
+      auto const target_idx = local_mapping_index[source_idx];
+      for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+        auto const source_col = source.column(col_idx);
+        // This column's value and validity arrays inside the shared-memory storage
+        cuda::std::byte* target =
+          reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+        bool* target_mask =
+          reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
+
+        cudf::detail::dispatch_type_and_aggregation(source_col.type(),
+                                                    d_agg_kinds[col_idx],
+                                                    shmem_element_aggregator{},
+                                                    target,
+                                                    target_mask,
+                                                    target_idx,
+                                                    source_col,
+                                                    source_idx);
+      }
+    }
+  }
+}
+
+__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block,
+                                           cudf::size_type col_start,
+                                           cudf::size_type col_end,
+                                           cudf::table_device_view input_values,
+                                           cudf::mutable_table_device_view target,
+                                           cudf::size_type cardinality,
+                                           cudf::size_type* global_mapping_index,
+                                           cuda::std::byte* shmem_agg_storage,
+                                           cudf::size_type* agg_res_offsets,
+                                           cudf::size_type* agg_mask_offsets,
+                                           cudf::aggregation::Kind const* d_agg_kinds)
+{
+  // Merges shared-memory partials into global targets found via this block's mapping slice
+  for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+    auto const target_idx =
+      global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx];
+    for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+      auto target_col = target.column(col_idx);
+      // This column's value and validity arrays inside the shared-memory storage
+      cuda::std::byte* source =
+        reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + agg_res_offsets[col_idx]);
+      bool* source_mask = reinterpret_cast<bool*>(shmem_agg_storage + agg_mask_offsets[col_idx]);
+
+      cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(),
+                                                  d_agg_kinds[col_idx],
+                                                  gmem_element_aggregator{},
+                                                  target_col,
+                                                  target_idx,
+                                                  input_values.column(col_idx),
+                                                  source,
+                                                  source_mask,
+                                                  idx);
+    }
+  }
+  block.sync();
+}
+
+/* Uses local_mapping_index and global_mapping_index to compute per-block pre-aggregates in shared
+ * memory and then merge them into the final aggregates in global memory */
+CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
+                                               bitmask_type const* row_bitmask,
+                                               bool skip_rows_with_nulls,
+                                               cudf::size_type* local_mapping_index,
+                                               cudf::size_type* global_mapping_index,
+                                               cudf::size_type* block_cardinality,
+                                               cudf::table_device_view input_values,
+                                               cudf::mutable_table_device_view output_values,
+                                               cudf::aggregation::Kind const* d_agg_kinds,
+                                               cudf::size_type total_agg_size,
+                                               cudf::size_type offsets_size)
+{
+  auto const block       = cooperative_groups::this_thread_block();
+  auto const cardinality = block_cardinality[block.group_index().x];
+  if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; }  // too many keys for shmem
+
+  auto const num_cols = output_values.num_columns();
+
+  __shared__ cudf::size_type col_start;
+  __shared__ cudf::size_type col_end;
+  extern __shared__ cuda::std::byte shmem_agg_storage[];
+  // Dynamic shared memory layout: [aggregation storage | result offsets | mask offsets]
+  cudf::size_type* shmem_agg_res_offsets =
+    reinterpret_cast<cudf::size_type*>(shmem_agg_storage + total_agg_size);
+  cudf::size_type* shmem_agg_mask_offsets =
+    reinterpret_cast<cudf::size_type*>(shmem_agg_storage + total_agg_size + offsets_size);
+
+  if (block.thread_rank() == 0) {
+    col_start = 0;
+    col_end   = 0;
+  }
+  block.sync();
+  // Output columns are handled in batches [col_start, col_end) until all are processed
+  while (col_end < num_cols) {
+    if (block.thread_rank() == 0) {
+      calculate_columns_to_aggregate(col_start,
+                                     col_end,
+                                     output_values,
+                                     num_cols,
+                                     shmem_agg_res_offsets,
+                                     shmem_agg_mask_offsets,
+                                     cardinality,
+                                     total_agg_size);
+    }
+    block.sync();
+
+    initialize_shmem_aggregations(block,
+                                  col_start,
+                                  col_end,
+                                  output_values,
+                                  shmem_agg_storage,
+                                  shmem_agg_res_offsets,
+                                  shmem_agg_mask_offsets,
+                                  cardinality,
+                                  d_agg_kinds);
+
+    compute_pre_aggregrations(col_start,
+                              col_end,
+                              row_bitmask,
+                              skip_rows_with_nulls,
+                              input_values,
+                              num_rows,
+                              local_mapping_index,
+                              shmem_agg_storage,
+                              shmem_agg_res_offsets,
+                              shmem_agg_mask_offsets,
+                              d_agg_kinds);
+    block.sync();
+
+    compute_final_aggregations(block,
+                               col_start,
+                               col_end,
+                               input_values,
+                               output_values,
+                               cardinality,
+                               global_mapping_index,
+                               shmem_agg_storage,
+                               shmem_agg_res_offsets,
+                               shmem_agg_mask_offsets,
+                               d_agg_kinds);
+  }
+}
+}  // namespace
+
+std::size_t available_shared_memory_size(cudf::size_type grid_size)
+{
+  auto const blocks_per_sm =
+    cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
+  size_t shmem_bytes = 0;
+  CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
+    &shmem_bytes, single_pass_shmem_aggs_kernel, blocks_per_sm, GROUPBY_BLOCK_SIZE));
+  // Hand out only half of the reported amount, rounded down to the shared memory alignment
+  auto const half_shmem = static_cast<cudf::size_type>(0.5 * shmem_bytes);
+  return cudf::util::round_down_safe(half_shmem, ALIGNMENT);
+}
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream)
+{
+  // Each output column needs two offsets: one locating its aggregation results in shared
+  // memory and one locating the matching validity flags
+  auto const offsets_bytes       = output_values.num_columns() * sizeof(cudf::size_type);
+  auto const total_offsets_bytes = offsets_bytes * 2;
+  CUDF_EXPECTS(available_shmem_size > total_offsets_bytes,
+               "No enough space for shared memory aggregations");
+  // Whatever is left after the two offset arrays holds the aggregation arrays themselves
+  single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, available_shmem_size, stream>>>(
+    num_input_rows,
+    row_bitmask,
+    skip_rows_with_nulls,
+    local_mapping_index,
+    global_mapping_index,
+    block_cardinality,
+    input_values,
+    output_values,
+    d_agg_kinds,
+    available_shmem_size - total_offsets_bytes,
+    offsets_bytes);
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
new file mode 100644
index 00000000000..653821fd53b
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::groupby::detail::hash {
+
+std::size_t available_shared_memory_size(cudf::size_type grid_size);
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream);
+
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
new file mode 100644
index 00000000000..e292543e6e9
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_single_pass_aggs.hpp"
+#include "create_sparse_results_table.hpp"
+#include "flatten_single_pass_aggs.hpp"
+#include "helpers.cuh"
+#include "single_pass_functors.cuh"
+#include "var_hash_functor.cuh"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/null_mask.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Runs every single-pass aggregation in `requests` over the key rows and caches each
+ * resulting column in `sparse_results`
+ */
+template <typename SetType>
+void compute_single_pass_aggs(int64_t num_keys,
+                              bool skip_rows_with_nulls,
+                              bitmask_type const* row_bitmask,
+                              SetType set,
+                              host_span<aggregation_request const> requests,
+                              cudf::detail::result_cache* sparse_results,
+                              rmm::cuda_stream_view stream)
+{
+  // Flatten `requests` into parallel lists of value columns, aggregation kinds and aggregations
+  auto const [values, kinds, flattened_aggs] = flatten_single_pass_aggs(requests);
+
+  // Allocate the identity-initialized output table, then build the device-side views and the
+  // device copy of the aggregation kinds consumed by the aggregation functor
+  table results_table = create_sparse_results_table(values, kinds, stream);
+  auto d_results      = mutable_table_device_view::create(results_table, stream);
+  auto d_inputs       = table_device_view::create(values, stream);
+  auto const d_kinds  = cudf::detail::make_device_uvector_async(
+    kinds, stream, cudf::get_current_device_resource_ref());
+
+  // One functor invocation per index in [0, num_keys)
+  auto const agg_fn = hash::compute_single_pass_aggs_fn{
+    set, *d_inputs, *d_results, d_kinds.data(), row_bitmask, skip_rows_with_nulls};
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), num_keys, agg_fn);
+
+  // Hand every result column over to the `sparse_results` cache
+  auto result_columns = results_table.release();
+  for (size_t i = 0; i < flattened_aggs.size(); i++) {
+    // Note that the cache will make a copy of this temporary aggregation
+    sparse_results->add_result(
+      values.column(i), *flattened_aggs[i], std::move(result_columns[i]));
+  }
+}
+
+template void compute_single_pass_aggs<hash_set_ref_t<cuco::insert_and_find_tag>>(
+  int64_t num_keys,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  hash_set_ref_t<cuco::insert_and_find_tag> set,
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+
+template void compute_single_pass_aggs<nullable_hash_set_ref_t<cuco::insert_and_find_tag>>(
+  int64_t num_keys,
+  bool skip_rows_with_nulls,
+  bitmask_type const* row_bitmask,
+  nullable_hash_set_ref_t<cuco::insert_and_find_tag> set,
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
new file mode 100644
index 00000000000..a7434bdf61a
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Computes all aggregations from `requests` that require a single pass
+ * over the data and stores the results in `sparse_results`
+ */
+template <typename SetType>
+void compute_single_pass_aggs(int64_t num_keys,
+                              bool skip_rows_with_nulls,
+                              bitmask_type const* row_bitmask,
+                              SetType set,
+                              cudf::host_span<cudf::groupby::aggregation_request const> requests,
+                              cudf::detail::result_cache* sparse_results,
+                              rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
new file mode 100644
index 00000000000..22fa4fc584c
--- /dev/null
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "create_sparse_results_table.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+namespace cudf::groupby::detail::hash {
+// Builds the table that will hold the sparse aggregation results
+cudf::table create_sparse_results_table(table_view const& flattened_values,
+                                        std::vector<aggregation::Kind> aggs,
+                                        rmm::cuda_stream_view stream)
+{
+  // TODO single allocation - room for performance improvement
+  std::vector<std::unique_ptr<column>> result_columns;
+  result_columns.reserve(flattened_values.num_columns());
+  std::transform(
+    flattened_values.begin(),
+    flattened_values.end(),
+    aggs.begin(),
+    std::back_inserter(result_columns),
+    [stream](auto const& col, auto const& agg) {
+      // Count results never carry a null mask; VARIANCE and STD results always do
+      bool const is_count  = agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL;
+      bool const nullable  =
+        not is_count and
+        (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD);
+      auto const mask_flag = nullable ? mask_state::ALL_NULL : mask_state::UNALLOCATED;
+      // Dictionary columns use the type of their keys as the source type
+      auto const col_type = cudf::is_dictionary(col.type())
+                              ? cudf::dictionary_column_view(col).keys().type()
+                              : col.type();
+      return make_fixed_width_column(
+        cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
+    });
+
+  table sparse_table(std::move(result_columns));
+  mutable_table_view sparse_view = sparse_table.mutable_view();
+  cudf::detail::initialize_with_identity(sparse_view, aggs, stream);
+  return sparse_table;
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp
new file mode 100644
index 00000000000..c1d4e0d3f20
--- /dev/null
+++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/groupby.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <vector>
+
+namespace cudf::groupby::detail::hash {
+// make table that will hold sparse results
+cudf::table create_sparse_results_table(table_view const& flattened_values,
+                                        std::vector<aggregation::Kind> aggs_kinds,
+                                        rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
index 2bf983e5e90..dfad51f27d4 100644
--- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
@@ -17,7 +17,6 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <memory>
diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh
new file mode 100644
index 00000000000..50e89c727ff
--- /dev/null
+++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/aggregation/device_aggregators.cuh>
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/utilities/traits.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/type_traits>
+
+namespace cudf::groupby::detail::hash {
+template <typename Source, cudf::aggregation::Kind k, typename Enable = void>
+struct update_target_element_gmem {
+  __device__ void operator()(cudf::mutable_column_device_view,
+                             cudf::size_type,
+                             cudf::column_device_view,
+                             cuda::std::byte*,
+                             cudf::size_type) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::MIN,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using DeviceType          = cudf::detail::underlying_target_t<Source, aggregation::MIN>;
+    DeviceType* source_casted = reinterpret_cast<DeviceType*>(source);
+    cudf::detail::atomic_min(&target.element<DeviceType>(target_index),
+                             static_cast<DeviceType>(source_casted[source_index]));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::MAX,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using DeviceType          = cudf::detail::underlying_target_t<Source, aggregation::MAX>;
+    DeviceType* source_casted = reinterpret_cast<DeviceType*>(source);
+    cudf::detail::atomic_max(&target.element<DeviceType>(target_index),
+                             static_cast<DeviceType>(source_casted[source_index]));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::SUM,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !cudf::is_timestamp<Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using DeviceType          = cudf::detail::underlying_target_t<Source, aggregation::SUM>;
+    DeviceType* source_casted = reinterpret_cast<DeviceType*>(source);
+    cudf::detail::atomic_add(&target.element<DeviceType>(target_index),
+                             static_cast<DeviceType>(source_casted[source_index]));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+// The value in shared memory is already squared, so it is only accumulated here
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::SUM_OF_SQUARES,
+  cuda::std::enable_if_t<cudf::detail::is_product_supported<Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target = cudf::detail::target_type_t<Source, cudf::aggregation::SUM_OF_SQUARES>;
+
+    Target* source_casted = reinterpret_cast<Target*>(source);
+    Target value          = static_cast<Target>(source_casted[source_index]);
+
+    cudf::detail::atomic_add(&target.element<Target>(target_index), value);
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::PRODUCT,
+  cuda::std::enable_if_t<cudf::detail::is_product_supported<Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target = cudf::detail::target_type_t<Source, cudf::aggregation::PRODUCT>;
+
+    Target* source_casted = reinterpret_cast<Target*>(source);
+    cudf::detail::atomic_mul(&target.element<Target>(target_index),
+                             static_cast<Target>(source_casted[source_index]));
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+// Assumes that the target columns of COUNT_VALID and COUNT_ALL are fixed-width and not
+// fixed-point columns
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::COUNT_VALID,
+  cuda::std::enable_if_t<
+    cudf::detail::is_valid_aggregation<Source, cudf::aggregation::COUNT_VALID>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target = cudf::detail::target_type_t<Source, cudf::aggregation::COUNT_VALID>;
+
+    Target* source_casted = reinterpret_cast<Target*>(source);
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source_casted[source_index]));
+
+    // It is assumed the output for COUNT_VALID is initialized to be all valid
+  }
+};
+
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::COUNT_ALL,
+  cuda::std::enable_if_t<
+    cudf::detail::is_valid_aggregation<Source, cudf::aggregation::COUNT_ALL>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target = cudf::detail::target_type_t<Source, cudf::aggregation::COUNT_ALL>;
+
+    Target* source_casted = reinterpret_cast<Target*>(source);
+    cudf::detail::atomic_add(&target.element<Target>(target_index),
+                             static_cast<Target>(source_casted[source_index]));
+
+    // It is assumed the output for COUNT_ALL is initialized to be all valid
+  }
+};
+
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::ARGMAX,
+  cuda::std::enable_if_t<cudf::detail::is_valid_aggregation<Source, cudf::aggregation::ARGMAX>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target             = cudf::detail::target_type_t<Source, cudf::aggregation::ARGMAX>;
+    Target* source_casted    = reinterpret_cast<Target*>(source);
+    auto source_argmax_index = source_casted[source_index];
+    auto old                 = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index);
+    if (old != cudf::detail::ARGMAX_SENTINEL) {
+      while (source_column.element<Source>(source_argmax_index) >
+             source_column.element<Source>(old)) {
+        old =
+          cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_argmax_index);
+      }
+    }
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+template <typename Source>
+struct update_target_element_gmem<
+  Source,
+  cudf::aggregation::ARGMIN,
+  cuda::std::enable_if_t<cudf::detail::is_valid_aggregation<Source, cudf::aggregation::ARGMIN>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target             = cudf::detail::target_type_t<Source, cudf::aggregation::ARGMIN>;
+    Target* source_casted    = reinterpret_cast<Target*>(source);
+    auto source_argmin_index = source_casted[source_index];
+    auto old                 = cudf::detail::atomic_cas(
+      &target.element<Target>(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index);
+    if (old != cudf::detail::ARGMIN_SENTINEL) {
+      while (source_column.element<Source>(source_argmin_index) <
+             source_column.element<Source>(old)) {
+        old =
+          cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_argmin_index);
+      }
+    }
+
+    if (target.is_null(target_index)) { target.set_valid(target_index); }
+  }
+};
+
+/**
+ * @brief A functor that updates a single element in the target column stored in global memory by
+ * applying an aggregation operation to a corresponding element from a source column in shared
+ * memory.
+ *
+ * This functor can NOT be used for dictionary columns.
+ *
+ * This functor duplicates the behavior of `elementwise_aggregator` from
+ * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts
+ * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from
+ * shared memory.
+ */
+struct gmem_element_aggregator {
+  template <typename Source, cudf::aggregation::Kind k>
+  __device__ void operator()(cudf::mutable_column_device_view target,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source_column,
+                             cuda::std::byte* source,
+                             bool* source_mask,
+                             cudf::size_type source_index) const noexcept
+  {
+    // Skip entries whose shared memory result is null. This is safe for every aggregation kind,
+    // including `COUNT_ALL`, whose shared memory result is always valid.
+    if (!source_mask[source_index]) { return; }
+
+    update_target_element_gmem<Source, k>{}(
+      target, target_index, source_column, source, source_index);
+  }
+};
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 0432b9d120a..30e1d52fdbf 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -14,60 +14,32 @@
  * limitations under the License.
  */
 
-#include "flatten_single_pass_aggs.hpp"
+#include "compute_groupby.hpp"
 #include "groupby/common/utils.hpp"
-#include "groupby_kernels.cuh"
-#include "var_hash_functor.cuh"
+#include "helpers.cuh"
 
 #include <cudf/aggregation.hpp>
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/column/column_view.hpp>
-#include <cudf/detail/aggregation/aggregation.cuh>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/aggregation/result_cache.hpp>
-#include <cudf/detail/binaryop.hpp>
-#include <cudf/detail/cuco_helpers.hpp>
-#include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/unary.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/utilities/cuda.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/hashing/detail/default_hash.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
-#include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.cuh>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <cuco/static_set.cuh>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-
+#include <algorithm>
 #include <memory>
-#include <unordered_set>
 #include <utility>
+#include <vector>
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
+namespace cudf::groupby::detail::hash {
 namespace {
-
-// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested
-// types and `cg_size = 1`for flat data to improve performance
-using probing_scheme_type = cuco::linear_probing<
-  1,  ///< Number of threads used to handle each input key
-  cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
-                                                   cudf::nullate::DYNAMIC>>;
-
 /**
  * @brief List of aggregation operations that can be computed with a hash-based
  * implementation.
@@ -112,413 +84,33 @@ bool constexpr is_hash_aggregation(aggregation::Kind t)
   return array_contains(hash_aggregations, t);
 }
 
-template <typename SetType>
-class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer {
-  column_view col;
-  data_type result_type;
-  cudf::detail::result_cache* sparse_results;
-  cudf::detail::result_cache* dense_results;
-  device_span<size_type const> gather_map;
-  SetType set;
-  bitmask_type const* __restrict__ row_bitmask;
-  rmm::cuda_stream_view stream;
-  rmm::device_async_resource_ref mr;
-
- public:
-  using cudf::detail::aggregation_finalizer::visit;
-
-  hash_compound_agg_finalizer(column_view col,
-                              cudf::detail::result_cache* sparse_results,
-                              cudf::detail::result_cache* dense_results,
-                              device_span<size_type const> gather_map,
-                              SetType set,
-                              bitmask_type const* row_bitmask,
-                              rmm::cuda_stream_view stream,
-                              rmm::device_async_resource_ref mr)
-    : col(col),
-      sparse_results(sparse_results),
-      dense_results(dense_results),
-      gather_map(gather_map),
-      set(set),
-      row_bitmask(row_bitmask),
-      stream(stream),
-      mr(mr)
-  {
-    result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type()
-                                                  : col.type();
-  }
-
-  auto to_dense_agg_result(cudf::aggregation const& agg)
-  {
-    auto s                  = sparse_results->get_result(col, agg);
-    auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}),
-                                                   gather_map,
-                                                   out_of_bounds_policy::DONT_CHECK,
-                                                   cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                                   stream,
-                                                   mr);
-    return std::move(dense_result_table->release()[0]);
-  }
-
-  // Enables conversion of ARGMIN/ARGMAX into MIN/MAX
-  auto gather_argminmax(aggregation const& agg)
-  {
-    auto arg_result = to_dense_agg_result(agg);
-    // We make a view of ARG(MIN/MAX) result without a null mask and gather
-    // using this map. The values in data buffer of ARG(MIN/MAX) result
-    // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL
-    // which is an out of bounds index value (-1) and causes the gathered
-    // value to be null.
-    column_view null_removed_map(
-      data_type(type_to_id<size_type>()),
-      arg_result->size(),
-      static_cast<void const*>(arg_result->view().template data<size_type>()),
-      nullptr,
-      0);
-    auto gather_argminmax =
-      cudf::detail::gather(table_view({col}),
-                           null_removed_map,
-                           arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY
-                                                  : cudf::out_of_bounds_policy::DONT_CHECK,
-                           cudf::detail::negative_index_policy::NOT_ALLOWED,
-                           stream,
-                           mr);
-    return std::move(gather_argminmax->release()[0]);
-  }
-
-  // Declare overloads for each kind of aggregation to dispatch
-  void visit(cudf::aggregation const& agg) override
-  {
-    if (dense_results->has_result(col, agg)) return;
-    dense_results->add_result(col, agg, to_dense_agg_result(agg));
-  }
-
-  void visit(cudf::detail::min_aggregation const& agg) override
-  {
-    if (dense_results->has_result(col, agg)) return;
-    if (result_type.id() == type_id::STRING) {
-      auto transformed_agg = make_argmin_aggregation();
-      dense_results->add_result(col, agg, gather_argminmax(*transformed_agg));
-    } else {
-      dense_results->add_result(col, agg, to_dense_agg_result(agg));
-    }
-  }
-
-  void visit(cudf::detail::max_aggregation const& agg) override
-  {
-    if (dense_results->has_result(col, agg)) return;
-
-    if (result_type.id() == type_id::STRING) {
-      auto transformed_agg = make_argmax_aggregation();
-      dense_results->add_result(col, agg, gather_argminmax(*transformed_agg));
-    } else {
-      dense_results->add_result(col, agg, to_dense_agg_result(agg));
-    }
-  }
-
-  void visit(cudf::detail::mean_aggregation const& agg) override
-  {
-    if (dense_results->has_result(col, agg)) return;
-
-    auto sum_agg   = make_sum_aggregation();
-    auto count_agg = make_count_aggregation();
-    this->visit(*sum_agg);
-    this->visit(*count_agg);
-    column_view sum_result   = dense_results->get_result(col, *sum_agg);
-    column_view count_result = dense_results->get_result(col, *count_agg);
-
-    auto result =
-      cudf::detail::binary_operation(sum_result,
-                                     count_result,
-                                     binary_operator::DIV,
-                                     cudf::detail::target_type(result_type, aggregation::MEAN),
-                                     stream,
-                                     mr);
-    dense_results->add_result(col, agg, std::move(result));
-  }
-
-  void visit(cudf::detail::var_aggregation const& agg) override
-  {
-    if (dense_results->has_result(col, agg)) return;
-
-    auto sum_agg   = make_sum_aggregation();
-    auto count_agg = make_count_aggregation();
-    this->visit(*sum_agg);
-    this->visit(*count_agg);
-    column_view sum_result   = sparse_results->get_result(col, *sum_agg);
-    column_view count_result = sparse_results->get_result(col, *count_agg);
-
-    auto values_view = column_device_view::create(col, stream);
-    auto sum_view    = column_device_view::create(sum_result, stream);
-    auto count_view  = column_device_view::create(count_result, stream);
-
-    auto var_result = make_fixed_width_column(
-      cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream);
-    auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream);
-    mutable_table_view var_table_view{{var_result->mutable_view()}};
-    cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream);
-
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      col.size(),
-      var_hash_functor{
-        set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof});
-    sparse_results->add_result(col, agg, std::move(var_result));
-    dense_results->add_result(col, agg, to_dense_agg_result(agg));
-  }
-
-  void visit(cudf::detail::std_aggregation const& agg) override
-  {
-    if (dense_results->has_result(col, agg)) return;
-    auto var_agg = make_variance_aggregation(agg._ddof);
-    this->visit(*dynamic_cast<cudf::detail::var_aggregation*>(var_agg.get()));
-    column_view variance = dense_results->get_result(col, *var_agg);
-
-    auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr);
-    dense_results->add_result(col, agg, std::move(result));
-  }
-};
-
-/**
- * @brief Gather sparse results into dense using `gather_map` and add to
- * `dense_cache`
- *
- * @see groupby_null_templated()
- */
-template <typename SetType>
-void sparse_to_dense_results(table_view const& keys,
-                             host_span<aggregation_request const> requests,
-                             cudf::detail::result_cache* sparse_results,
-                             cudf::detail::result_cache* dense_results,
-                             device_span<size_type const> gather_map,
-                             SetType set,
-                             bool keys_have_nulls,
-                             null_policy include_null_keys,
-                             rmm::cuda_stream_view stream,
-                             rmm::device_async_resource_ref mr)
-{
-  auto row_bitmask =
-    cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
-  bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
-  bitmask_type const* row_bitmask_ptr =
-    skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
-
-  for (auto const& request : requests) {
-    auto const& agg_v = request.aggregations;
-    auto const& col   = request.values;
-
-    // Given an aggregation, this will get the result from sparse_results and
-    // convert and return dense, compacted result
-    auto finalizer = hash_compound_agg_finalizer(
-      col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr);
-    for (auto&& agg : agg_v) {
-      agg->finalize(finalizer);
-    }
-  }
-}
-
-// make table that will hold sparse results
-auto create_sparse_results_table(table_view const& flattened_values,
-                                 std::vector<aggregation::Kind> aggs,
-                                 rmm::cuda_stream_view stream)
-{
-  // TODO single allocation - room for performance improvement
-  std::vector<std::unique_ptr<column>> sparse_columns;
-  std::transform(
-    flattened_values.begin(),
-    flattened_values.end(),
-    aggs.begin(),
-    std::back_inserter(sparse_columns),
-    [stream](auto const& col, auto const& agg) {
-      bool nullable =
-        (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL)
-          ? false
-          : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD);
-      auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED;
-
-      auto col_type = cudf::is_dictionary(col.type())
-                        ? cudf::dictionary_column_view(col).keys().type()
-                        : col.type();
-
-      return make_fixed_width_column(
-        cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream);
-    });
-
-  table sparse_table(std::move(sparse_columns));
-  mutable_table_view table_view = sparse_table.mutable_view();
-  cudf::detail::initialize_with_identity(table_view, aggs, stream);
-  return sparse_table;
-}
-
-/**
- * @brief Computes all aggregations from `requests` that require a single pass
- * over the data and stores the results in `sparse_results`
- */
-template <typename SetType>
-void compute_single_pass_aggs(table_view const& keys,
-                              host_span<aggregation_request const> requests,
-                              cudf::detail::result_cache* sparse_results,
-                              SetType set,
-                              bool keys_have_nulls,
-                              null_policy include_null_keys,
-                              rmm::cuda_stream_view stream)
-{
-  // flatten the aggs to a table that can be operated on by aggregate_row
-  auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
-
-  // make table that will hold sparse results
-  table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream);
-  // prepare to launch kernel to do the actual aggregation
-  auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
-  auto d_values       = table_device_view::create(flattened_values, stream);
-  auto const d_aggs   = cudf::detail::make_device_uvector_async(
-    agg_kinds, stream, cudf::get_current_device_resource_ref());
-  auto const skip_key_rows_with_nulls =
-    keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
-
-  auto row_bitmask =
-    skip_key_rows_with_nulls
-      ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
-      : rmm::device_buffer{};
-
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator(0),
-    keys.num_rows(),
-    hash::compute_single_pass_aggs_fn{set,
-                                      *d_values,
-                                      *d_sparse_table,
-                                      d_aggs.data(),
-                                      static_cast<bitmask_type*>(row_bitmask.data()),
-                                      skip_key_rows_with_nulls});
-  // Add results back to sparse_results cache
-  auto sparse_result_cols = sparse_table.release();
-  for (size_t i = 0; i < aggs.size(); i++) {
-    // Note that the cache will make a copy of this temporary aggregation
-    sparse_results->add_result(
-      flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i]));
-  }
-}
-
-/**
- * @brief Computes and returns a device vector containing all populated keys in
- * `map`.
- */
-template <typename SetType>
-rmm::device_uvector<size_type> extract_populated_keys(SetType const& key_set,
-                                                      size_type num_keys,
-                                                      rmm::cuda_stream_view stream)
-{
-  rmm::device_uvector<size_type> populated_keys(num_keys, stream);
-  auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value());
-
-  populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream);
-  return populated_keys;
-}
-
-/**
- * @brief Computes groupby using hash table.
- *
- * First, we create a hash table that stores the indices of unique rows in
- * `keys`. The upper limit on the number of values in this map is the number
- * of rows in `keys`.
- *
- * To store the results of aggregations, we create temporary sparse columns
- * which have the same size as input value columns. Using the hash map, we
- * determine the location within the sparse column to write the result of the
- * aggregation into.
- *
- * The sparse column results of all aggregations are stored into the cache
- * `sparse_results`. This enables the use of previously calculated results in
- * other aggregations.
- *
- * All the aggregations which can be computed in a single pass are computed
- * first, in a combined kernel. Then using these results, aggregations that
- * require multiple passes, will be computed.
- *
- * Finally, using the hash map, we generate a vector of indices of populated
- * values in sparse result columns. Then, for each aggregation originally
- * requested in `requests`, we gather sparse results into a column of dense
- * results using the aforementioned index vector. Dense results are stored into
- * the in/out parameter `cache`.
- */
-std::unique_ptr<table> groupby(table_view const& keys,
-                               host_span<aggregation_request const> requests,
-                               cudf::detail::result_cache* cache,
-                               bool const keys_have_nulls,
-                               null_policy const include_null_keys,
-                               rmm::cuda_stream_view stream,
-                               rmm::device_async_resource_ref mr)
+std::unique_ptr<table> dispatch_groupby(table_view const& keys,
+                                        host_span<aggregation_request const> requests,
+                                        cudf::detail::result_cache* cache,
+                                        bool const keys_have_nulls,
+                                        null_policy const include_null_keys,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
 {
-  // convert to int64_t to avoid potential overflow with large `keys`
-  auto const num_keys            = static_cast<int64_t>(keys.num_rows());
-  auto const null_keys_are_equal = null_equality::EQUAL;
-  auto const has_null            = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
+  auto const null_keys_are_equal  = null_equality::EQUAL;
+  auto const has_null             = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
+  auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
 
   auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream);
   auto const comparator  = cudf::experimental::row::equality::self_comparator{preprocessed_keys};
   auto const row_hash    = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)};
   auto const d_row_hash  = row_hash.device_hasher(has_null);
 
-  // Cache of sparse results where the location of aggregate value in each
-  // column is indexed by the hash set
-  cudf::detail::result_cache sparse_results(requests.size());
-
-  auto const comparator_helper = [&](auto const d_key_equal) {
-    auto const set = cuco::static_set{
-      num_keys,
-      0.5,  // desired load factor
-      cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-      d_key_equal,
-      probing_scheme_type{d_row_hash},
-      cuco::thread_scope_device,
-      cuco::storage<1>{},
-      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-      stream.value()};
-
-    // Compute all single pass aggs first
-    compute_single_pass_aggs(keys,
-                             requests,
-                             &sparse_results,
-                             set.ref(cuco::insert_and_find),
-                             keys_have_nulls,
-                             include_null_keys,
-                             stream);
-
-    // Extract the populated indices from the hash set and create a gather map.
-    // Gathering using this map from sparse results will give dense results.
-    auto gather_map = extract_populated_keys(set, keys.num_rows(), stream);
-
-    // Compact all results from sparse_results and insert into cache
-    sparse_to_dense_results(keys,
-                            requests,
-                            &sparse_results,
-                            cache,
-                            gather_map,
-                            set.ref(cuco::find),
-                            keys_have_nulls,
-                            include_null_keys,
-                            stream,
-                            mr);
-
-    return cudf::detail::gather(keys,
-                                gather_map,
-                                out_of_bounds_policy::DONT_CHECK,
-                                cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                stream,
-                                mr);
-  };
-
   if (cudf::detail::has_nested_columns(keys)) {
-    auto const d_key_equal = comparator.equal_to<true>(has_null, null_keys_are_equal);
-    return comparator_helper(d_key_equal);
+    auto const d_row_equal = comparator.equal_to<true>(has_null, null_keys_are_equal);
+    return compute_groupby<nullable_row_comparator_t>(
+      keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr);
   } else {
-    auto const d_key_equal = comparator.equal_to<false>(has_null, null_keys_are_equal);
-    return comparator_helper(d_key_equal);
+    auto const d_row_equal = comparator.equal_to<false>(has_null, null_keys_are_equal);
+    return compute_groupby<row_comparator_t>(
+      keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr);
   }
 }
-
 }  // namespace
 
 /**
@@ -559,11 +151,8 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   cudf::detail::result_cache cache(requests.size());
 
   std::unique_ptr<table> unique_keys =
-    groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr);
+    dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr);
 
   return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr));
 }
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
new file mode 100644
index 00000000000..37a61c1a22c
--- /dev/null
+++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hash_compound_agg_finalizer.hpp"
+#include "helpers.cuh"
+#include "var_hash_functor.cuh"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/detail/binaryop.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/unary.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <memory>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+hash_compound_agg_finalizer<SetType>::hash_compound_agg_finalizer(
+  column_view col,
+  cudf::detail::result_cache* sparse_results,
+  cudf::detail::result_cache* dense_results,
+  device_span<size_type const> gather_map,
+  SetType set,
+  bitmask_type const* row_bitmask,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+  : col(col),
+    // For dictionary columns the result type is taken from the dictionary keys
+    result_type(cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type()
+                                                : col.type()),
+    sparse_results(sparse_results),
+    dense_results(dense_results),
+    gather_map(gather_map),
+    set(set),
+    row_bitmask(row_bitmask),
+    stream(stream),
+    mr(mr)
+{}
+
+template <typename SetType>
+auto hash_compound_agg_finalizer<SetType>::to_dense_agg_result(cudf::aggregation const& agg)
+{
+  // Gather the sparse result stored for `agg` through `gather_map` into a one-column dense table
+  auto gathered = cudf::detail::gather(table_view({sparse_results->get_result(col, agg)}),
+                                       gather_map,
+                                       out_of_bounds_policy::DONT_CHECK,
+                                       cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                       stream,
+                                       mr);
+  return std::move(gathered->release()[0]);
+}
+
+template <typename SetType>
+auto hash_compound_agg_finalizer<SetType>::gather_argminmax(aggregation const& agg)
+{
+  auto arg_indices = to_dense_agg_result(agg);
+  // Build a view over the ARG(MIN/MAX) indices that drops their null mask and
+  // use it as the gather map. Entries that correspond to null results were
+  // initialized to ARG(MIN/MAX)_SENTINEL, an out-of-bounds index (-1), so
+  // gathering them with the NULLIFY policy yields nulls in the output.
+  // (When the index column is not nullable, no bounds check is requested.)
+  column_view null_removed_map(
+    data_type(type_to_id<size_type>()),
+    arg_indices->size(),
+    static_cast<void const*>(arg_indices->view().template data<size_type>()),
+    nullptr,
+    0);
+  auto const bounds_policy = arg_indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY
+                                                     : cudf::out_of_bounds_policy::DONT_CHECK;
+  auto gathered_values = cudf::detail::gather(table_view({col}),
+                                              null_removed_map,
+                                              bounds_policy,
+                                              cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                              stream,
+                                              mr);
+  return std::move(gathered_values->release()[0]);
+}
+
+template <typename SetType>
+void hash_compound_agg_finalizer<SetType>::visit(cudf::aggregation const& agg)
+{
+  if (dense_results->has_result(col, agg)) return;  // dense result already cached
+  dense_results->add_result(col, agg, to_dense_agg_result(agg));  // gather sparse into dense
+}
+
+template <typename SetType>
+void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::min_aggregation const& agg)
+{
+  if (dense_results->has_result(col, agg)) { return; }
+  if (result_type.id() != type_id::STRING) {
+    dense_results->add_result(col, agg, to_dense_agg_result(agg));
+    return;
+  }
+  // String MIN is resolved through ARGMIN: gather the selected rows of `col`
+  dense_results->add_result(col, agg, gather_argminmax(*make_argmin_aggregation()));
+}
+
+template <typename SetType>
+void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::max_aggregation const& agg)
+{
+  if (dense_results->has_result(col, agg)) { return; }
+  if (result_type.id() != type_id::STRING) {
+    // Every other result type is gathered straight from the sparse results
+    dense_results->add_result(col, agg, to_dense_agg_result(agg));
+    return;
+  }
+  // String MAX is resolved through ARGMAX: gather the selected rows of `col`
+  dense_results->add_result(col, agg, gather_argminmax(*make_argmax_aggregation()));
+}
+
+template <typename SetType>
+void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::mean_aggregation const& agg)
+{
+  if (dense_results->has_result(col, agg)) return;
+  // MEAN is derived as SUM / COUNT: make sure both dense inputs exist, then divide
+  auto sum_agg   = make_sum_aggregation();
+  auto count_agg = make_count_aggregation();
+  this->visit(*sum_agg);
+  this->visit(*count_agg);
+  column_view sum_result   = dense_results->get_result(col, *sum_agg);
+  column_view count_result = dense_results->get_result(col, *count_agg);
+  // Element-wise division; the output type is the MEAN target type for `result_type`
+  auto result =
+    cudf::detail::binary_operation(sum_result,
+                                   count_result,
+                                   binary_operator::DIV,
+                                   cudf::detail::target_type(result_type, aggregation::MEAN),
+                                   stream,
+                                   mr);
+  dense_results->add_result(col, agg, std::move(result));
+}
+
+template <typename SetType>
+void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::var_aggregation const& agg)
+{
+  if (dense_results->has_result(col, agg)) return;
+  // VARIANCE is computed from SUM and COUNT; both are read back from the sparse results below
+  auto sum_agg   = make_sum_aggregation();
+  auto count_agg = make_count_aggregation();
+  this->visit(*sum_agg);
+  this->visit(*count_agg);
+  column_view sum_result   = sparse_results->get_result(col, *sum_agg);
+  column_view count_result = sparse_results->get_result(col, *count_agg);
+  // Device views over the input values and the sparse SUM / COUNT columns
+  auto values_view = column_device_view::create(col, stream);
+  auto sum_view    = column_device_view::create(sum_result, stream);
+  auto count_view  = column_device_view::create(count_result, stream);
+  // Sparse output with one slot per input row: created all-null, then set to the identity
+  auto var_result = make_fixed_width_column(
+    cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream);
+  auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream);
+  mutable_table_view var_table_view{{var_result->mutable_view()}};
+  cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream);
+  // Run var_hash_functor once per input row index in [0, col.size()) on `stream`
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator(0),
+    col.size(),
+    var_hash_functor{
+      set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof});
+  sparse_results->add_result(col, agg, std::move(var_result));
+  dense_results->add_result(col, agg, to_dense_agg_result(agg));
+}
+
+template <typename SetType>
+void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::std_aggregation const& agg)
+{
+  if (dense_results->has_result(col, agg)) return;
+  auto var_agg = make_variance_aggregation(agg._ddof);
+  this->visit(*dynamic_cast<cudf::detail::var_aggregation*>(var_agg.get()));
+  column_view variance = dense_results->get_result(col, *var_agg);
+  // STD is the element-wise square root of the dense VARIANCE (same ddof) produced above
+  auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr);
+  dense_results->add_result(col, agg, std::move(result));
+}
+
+template class hash_compound_agg_finalizer<hash_set_ref_t<cuco::find_tag>>;
+template class hash_compound_agg_finalizer<nullable_hash_set_ref_t<cuco::find_tag>>;
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
new file mode 100644
index 00000000000..8bee1a92c40
--- /dev/null
+++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>  // SetType: hash set ref type forwarded to var_hash_functor
+class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer {
+  column_view col;  // input values column the aggregations apply to
+  data_type result_type;  // type of `col`, or of its keys for a dictionary column
+  cudf::detail::result_cache* sparse_results;  // cache holding the sparse (ungathered) results
+  cudf::detail::result_cache* dense_results;  // cache receiving the dense (gathered) results
+  device_span<size_type const> gather_map;  // indices used to gather sparse results into dense
+  SetType set;  // hash set ref forwarded to var_hash_functor
+  bitmask_type const* __restrict__ row_bitmask;  // forwarded to var_hash_functor
+  rmm::cuda_stream_view stream;  // CUDA stream used for the operations issued here
+  rmm::device_async_resource_ref mr;  // memory resource passed to gather/binary/unary operations
+
+ public:
+  using cudf::detail::aggregation_finalizer::visit;
+
+  hash_compound_agg_finalizer(column_view col,
+                              cudf::detail::result_cache* sparse_results,
+                              cudf::detail::result_cache* dense_results,
+                              device_span<size_type const> gather_map,
+                              SetType set,
+                              bitmask_type const* row_bitmask,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr);
+
+  auto to_dense_agg_result(cudf::aggregation const& agg);  // gathers the sparse result of `agg`
+
+  // Enables conversion of ARGMIN/ARGMAX into MIN/MAX
+  auto gather_argminmax(cudf::aggregation const& agg);
+
+  // Declare overloads for each kind of aggregation to dispatch
+  void visit(cudf::aggregation const& agg) override;
+
+  void visit(cudf::detail::min_aggregation const& agg) override;
+
+  void visit(cudf::detail::max_aggregation const& agg) override;
+
+  void visit(cudf::detail::mean_aggregation const& agg) override;
+
+  void visit(cudf::detail::var_aggregation const& agg) override;
+
+  void visit(cudf::detail::std_aggregation const& agg) override;
+};
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
new file mode 100644
index 00000000000..00836567b4f
--- /dev/null
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/cuco_helpers.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/types.hpp>
+
+#include <cuco/static_set.cuh>
+
+namespace cudf::groupby::detail::hash {
+// TODO: similar to `contains_table`, use a larger CG size like 2 or 4 for nested
+// types and `cg_size = 1` for flat data to improve performance
+/// Number of threads to handle each input element
+CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
+
+/// Number of slots per thread
+CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
+
+/// Thread block size
+CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128;
+
+/// Threshold cardinality to switch between shared memory aggregations and global memory
+/// aggregations
+CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128;
+
+// We add additional `block_size`, because after the number of elements in the local hash set
+// exceeds the threshold, all threads in the thread block can still insert one more element.
+/// The maximum number of elements handled per block
+CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS =
+  GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE;
+
+// GROUPBY_SHM_MAX_ELEMENTS scaled by 1.43 (~1 / 0.7) to target 0.7 occupancy
+/// Shared memory hash set extent type
+using shmem_extent_t =
+  cuco::extent<cudf::size_type,
+               static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>;
+
+/// Number of windows needed by each shared memory hash set
+CUDF_HOST_DEVICE auto constexpr window_extent =
+  cuco::make_window_extent<GROUPBY_CG_SIZE, GROUPBY_WINDOW_SIZE>(shmem_extent_t{});
+
+using row_hash_t =
+  cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                   cudf::nullate::DYNAMIC>;
+
+/// Probing scheme type used by groupby hash table
+using probing_scheme_t = cuco::linear_probing<GROUPBY_CG_SIZE, row_hash_t>;
+
+using row_comparator_t = cudf::experimental::row::equality::device_row_comparator<
+  false,  // paired with `comparator.equal_to<false>`: keys without nested columns
+  cudf::nullate::DYNAMIC,
+  cudf::experimental::row::equality::nan_equal_physical_equality_comparator>;
+
+using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator<
+  true,  // paired with `comparator.equal_to<true>`: keys with nested columns
+  cudf::nullate::DYNAMIC,
+  cudf::experimental::row::equality::nan_equal_physical_equality_comparator>;
+
+using global_set_t = cuco::static_set<cudf::size_type,  // elements: indices of unique key rows
+                                      cuco::extent<int64_t>,
+                                      cuda::thread_scope_device,
+                                      row_comparator_t,
+                                      probing_scheme_t,
+                                      cudf::detail::cuco_allocator<char>,
+                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
+
+using nullable_global_set_t = cuco::static_set<cudf::size_type,
+                                               cuco::extent<int64_t>,
+                                               cuda::thread_scope_device,
+                                               nullable_row_comparator_t,  // nested-key comparator
+                                               probing_scheme_t,
+                                               cudf::detail::cuco_allocator<char>,
+                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+
+template <typename Op>  // Op: cuco operator tag (instantiated elsewhere with cuco::find_tag)
+using hash_set_ref_t = cuco::static_set_ref<
+  cudf::size_type,
+  cuda::thread_scope_device,
+  row_comparator_t,
+  probing_scheme_t,
+  cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
+  Op>;
+
+template <typename Op>  // same as hash_set_ref_t but with the nested-key comparator
+using nullable_hash_set_ref_t = cuco::static_set_ref<
+  cudf::size_type,
+  cuda::thread_scope_device,
+  nullable_row_comparator_t,
+  probing_scheme_t,
+  cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
+  Op>;
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh
new file mode 100644
index 00000000000..9cbeeb34b86
--- /dev/null
+++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/aggregation/device_aggregators.cuh>
+#include <cudf/detail/utilities/assert.cuh>
+#include <cudf/detail/utilities/device_atomics.cuh>
+#include <cudf/utilities/traits.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/type_traits>
+
+namespace cudf::groupby::detail::hash {
+template <typename Source, cudf::aggregation::Kind k, typename Enable = void>
+struct update_target_element_shmem {  // primary template: fallback for unsupported (Source, k)
+  __device__ void operator()(
+    cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::MIN,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using DeviceTarget = cudf::detail::underlying_target_t<Source, aggregation::MIN>;
+    using DeviceSource = cudf::detail::underlying_source_t<Source, aggregation::MIN>;
+    // MIN: atomic_min of the source element into target[target_index]
+    DeviceTarget* target_casted = reinterpret_cast<DeviceTarget*>(target);
+    cudf::detail::atomic_min(&target_casted[target_index],
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // Flag the slot as valid; the mask store is a plain (non-atomic) write of `true`
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::MAX,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using DeviceTarget = cudf::detail::underlying_target_t<Source, aggregation::MAX>;
+    using DeviceSource = cudf::detail::underlying_source_t<Source, aggregation::MAX>;
+    // MAX: atomic_max of the source element into target[target_index]
+    DeviceTarget* target_casted = reinterpret_cast<DeviceTarget*>(target);
+    cudf::detail::atomic_max(&target_casted[target_index],
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // Flag the slot as valid; the mask store is a plain (non-atomic) write of `true`
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::SUM,
+  cuda::std::enable_if_t<cudf::is_fixed_width<Source>() && cudf::has_atomic_support<Source>() &&
+                         !cudf::is_timestamp<Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using DeviceTarget = cudf::detail::underlying_target_t<Source, aggregation::SUM>;
+    using DeviceSource = cudf::detail::underlying_source_t<Source, aggregation::SUM>;
+    // SUM: atomic_add of the source element into target[target_index]
+    DeviceTarget* target_casted = reinterpret_cast<DeviceTarget*>(target);
+    cudf::detail::atomic_add(&target_casted[target_index],
+                             static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
+    // Flag the slot as valid; the mask store is a plain (non-atomic) write of `true`
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::SUM_OF_SQUARES,
+  cuda::std::enable_if_t<cudf::detail::is_product_supported<Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target          = cudf::detail::target_type_t<Source, cudf::aggregation::SUM_OF_SQUARES>;
+    Target* target_casted = reinterpret_cast<Target*>(target);
+    auto value            = static_cast<Target>(source.element<Source>(source_index));
+    cudf::detail::atomic_add(&target_casted[target_index], value * value);
+    // The square (computed in the Target type) was added above; now flag the slot as valid
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::PRODUCT,
+  cuda::std::enable_if_t<cudf::detail::is_product_supported<Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target          = cudf::detail::target_type_t<Source, cudf::aggregation::PRODUCT>;
+    Target* target_casted = reinterpret_cast<Target*>(target);
+    cudf::detail::atomic_mul(&target_casted[target_index],
+                             static_cast<Target>(source.element<Source>(source_index)));
+    // The source element was multiplied into the slot above; now flag the slot as valid
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::COUNT_VALID,
+  cuda::std::enable_if_t<
+    cudf::detail::is_valid_aggregation<Source, cudf::aggregation::COUNT_VALID>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    // The nullability was checked prior to this call in the `shmem_element_aggregator` functor
+    using Target          = cudf::detail::target_type_t<Source, cudf::aggregation::COUNT_VALID>;
+    Target* target_casted = reinterpret_cast<Target*>(target);
+    cudf::detail::atomic_add(&target_casted[target_index], Target{1});  // mask is left untouched
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::COUNT_ALL,
+  cuda::std::enable_if_t<
+    cudf::detail::is_valid_aggregation<Source, cudf::aggregation::COUNT_ALL>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target          = cudf::detail::target_type_t<Source, cudf::aggregation::COUNT_ALL>;
+    Target* target_casted = reinterpret_cast<Target*>(target);
+    cudf::detail::atomic_add(&target_casted[target_index], Target{1});
+    // Every row is counted here regardless of `source`; target_mask is not written.
+    // Assumes target is already set to be valid
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::ARGMAX,
+  cuda::std::enable_if_t<cudf::detail::is_valid_aggregation<Source, cudf::aggregation::ARGMAX>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target          = cudf::detail::target_type_t<Source, cudf::aggregation::ARGMAX>;
+    Target* target_casted = reinterpret_cast<Target*>(target);
+    auto old              = cudf::detail::atomic_cas(
+      &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index);
+    if (old != cudf::detail::ARGMAX_SENTINEL) {  // slot was taken: swap in while ours is larger
+      while (source.element<Source>(source_index) > source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index);
+      }
+    }
+    // The slot now holds a row index into `source`; flag it valid (plain, non-atomic store)
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+template <typename Source>
+struct update_target_element_shmem<
+  Source,
+  cudf::aggregation::ARGMIN,
+  cuda::std::enable_if_t<cudf::detail::is_valid_aggregation<Source, cudf::aggregation::ARGMIN>() and
+                         cudf::is_relationally_comparable<Source, Source>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    using Target          = cudf::detail::target_type_t<Source, cudf::aggregation::ARGMIN>;
+    Target* target_casted = reinterpret_cast<Target*>(target);
+    auto old              = cudf::detail::atomic_cas(
+      &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index);
+    if (old != cudf::detail::ARGMIN_SENTINEL) {  // slot was taken: swap in while ours is smaller
+      while (source.element<Source>(source_index) < source.element<Source>(old)) {
+        old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index);
+      }
+    }
+    // The slot now holds a row index into `source`; flag it valid (plain, non-atomic store)
+    if (!target_mask[target_index]) { target_mask[target_index] = true; }
+  }
+};
+
+/**
+ * @brief A functor that updates a single element in the target column stored in shared memory by
+ * applying an aggregation operation to a corresponding element from a source column in global
+ * memory.
+ *
+ * This functor can NOT be used for dictionary columns.
+ *
+ * This is a redundant copy replicating the behavior of `elementwise_aggregator` from
+ * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts
+ * a pointer to raw bytes as the target, as `column_device_view` cannot yet be constructed from
+ * shared memory.
+ */
+struct shmem_element_aggregator {
+  template <typename Source, cudf::aggregation::Kind k>
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type target_index,
+                             cudf::column_device_view source,
+                             cudf::size_type source_index) const noexcept
+  {
+    // Check nullability for all aggregation kinds but `COUNT_ALL`
+    if constexpr (k != cudf::aggregation::COUNT_ALL) {
+      if (source.is_null(source_index)) { return; }  // null source rows leave the target as is
+    }
+    update_target_element_shmem<Source, k>{}(
+      target, target_mask, target_index, source, source_index);
+  }
+};
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
similarity index 55%
rename from cpp/src/groupby/hash/groupby_kernels.cuh
rename to cpp/src/groupby/hash/single_pass_functors.cuh
index 86f4d76487f..28a5b578e00 100644
--- a/cpp/src/groupby/hash/groupby_kernels.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #pragma once
 
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -21,12 +20,90 @@
 #include <cudf/groupby.hpp>
 #include <cudf/utilities/bit.hpp>
 
-#include <thrust/pair.h>
+#include <cuco/static_set_ref.cuh>
+
+namespace cudf::groupby::detail::hash {
+// Gates `initialize_target_element<T, k>`. TODO: TO BE REMOVED issue tracked via #17171
+template <typename T, cudf::aggregation::Kind k>
+__device__ constexpr bool is_supported()
+{
+  return cudf::is_fixed_width<T>() and
+         ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::SUM_OF_SQUARES) or
+          (k == cudf::aggregation::MIN) or (k == cudf::aggregation::MAX) or
+          (k == cudf::aggregation::COUNT_VALID) or (k == cudf::aggregation::COUNT_ALL) or
+          (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::ARGMAX) or
+          (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or
+          ((k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported<T>()));
+}
+
+template <typename T, cudf::aggregation::Kind k>
+__device__ std::enable_if_t<not std::is_same_v<cudf::detail::corresponding_operator_t<k>, void>, T>
+identity_from_operator()  // enabled when kind `k` maps to a (non-void) device operator
+{
+  using DeviceType = cudf::device_storage_type_t<T>;
+  return cudf::detail::corresponding_operator_t<k>::template identity<DeviceType>();
+}
+
+template <typename T, cudf::aggregation::Kind k, typename Enable = void>
+__device__ std::enable_if_t<std::is_same_v<cudf::detail::corresponding_operator_t<k>, void>, T>
+identity_from_operator()  // enabled when `k` has no device operator (it maps to `void`)
+{
+  CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator");
+}
+
+template <typename T, cudf::aggregation::Kind k>
+__device__ T get_identity()
+{
+  if ((k == cudf::aggregation::ARGMAX) or (k == cudf::aggregation::ARGMIN)) {  // sentinel kinds
+    if constexpr (cudf::is_timestamp<T>()) {  // timestamps are built from their duration type
+      return k == cudf::aggregation::ARGMAX
+               ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)}
+               : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)};
+    } else {
+      using DeviceType = cudf::device_storage_type_t<T>;
+      return k == cudf::aggregation::ARGMAX
+               ? static_cast<DeviceType>(cudf::detail::ARGMAX_SENTINEL)
+               : static_cast<DeviceType>(cudf::detail::ARGMIN_SENTINEL);
+    }
+  }
+  return identity_from_operator<T, k>();  // all other kinds: identity of the device operator
+}
+
+template <typename Target, cudf::aggregation::Kind k, typename Enable = void>
+struct initialize_target_element {  // primary template: used when is_supported<Target, k>() fails
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Target, cudf::aggregation::Kind k>
+struct initialize_target_element<Target, k, std::enable_if_t<is_supported<Target, k>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    using DeviceType          = cudf::device_storage_type_t<Target>;
+    DeviceType* target_casted = reinterpret_cast<DeviceType*>(target);
+    // Start the slot at the identity (or the ARGMIN/ARGMAX sentinel) for kind `k`
+    target_casted[idx] = get_identity<DeviceType, k>();
+    // Only COUNT_ALL / COUNT_VALID slots start out valid; every other kind starts invalid
+    target_mask[idx] = (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::COUNT_VALID);
+  }
+};
+
+struct initialize_shmem {  // functor that forwards to initialize_target_element<Target, k>
+  template <typename Target, cudf::aggregation::Kind k>
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    initialize_target_element<Target, k>{}(target, target_mask, idx);
+  }
+};
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
 /**
  * @brief Computes single-pass aggregations and store results into a sparse `output_values` table,
  * and populate `set` with indices of unique keys
@@ -102,8 +179,4 @@ struct compute_single_pass_aggs_fn {
     }
   }
 };
-
-}  // namespace hash
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu
new file mode 100644
index 00000000000..e1c2cd22309
--- /dev/null
+++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hash_compound_agg_finalizer.hpp"
+#include "helpers.cuh"
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetRef>  // SetRef: type of the `set` argument handed to the finalizer
+void sparse_to_dense_results(host_span<aggregation_request const> requests,
+                             cudf::detail::result_cache* sparse_results,
+                             cudf::detail::result_cache* dense_results,
+                             device_span<size_type const> gather_map,
+                             SetRef set,
+                             bitmask_type const* row_bitmask,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
+{
+  for (auto const& request : requests) {  // a separate finalizer is built for each request
+    auto const& agg_v = request.aggregations;  // aggregations to finalize for this request
+    auto const& col   = request.values;        // input values of this request
+
+    // Given an aggregation, the finalizer takes its result from `sparse_results` and converts
+    // it into a dense, compacted result.
+    auto finalizer = hash_compound_agg_finalizer(
+      col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr);
+    for (auto&& agg : agg_v) {
+      agg->finalize(finalizer);  // all aggregations of one request share the same finalizer
+    }
+  }
+}
+// Explicit instantiations of the template defined above, for the two set-ref types below.
+template void sparse_to_dense_results<hash_set_ref_t<cuco::find_tag>>(
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  cudf::detail::result_cache* dense_results,
+  device_span<size_type const> gather_map,
+  hash_set_ref_t<cuco::find_tag> set,
+  bitmask_type const* row_bitmask,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template void sparse_to_dense_results<nullable_hash_set_ref_t<cuco::find_tag>>(
+  host_span<aggregation_request const> requests,
+  cudf::detail::result_cache* sparse_results,
+  cudf::detail::result_cache* dense_results,
+  device_span<size_type const> gather_map,
+  nullable_hash_set_ref_t<cuco::find_tag> set,
+  bitmask_type const* row_bitmask,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp
new file mode 100644
index 00000000000..3a2b3090b99
--- /dev/null
+++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+namespace cudf::groupby::detail::hash {
+/**
+ * @brief Gather sparse aggregation results into dense form using `gather_map` and add them to
+ * `dense_results`
+ *
+ * @tparam SetRef Device hash set ref type
+ *
+ * @param[in] requests The set of columns to aggregate and the aggregations to perform
+ * @param[in] sparse_results Sparse aggregation results
+ * @param[out] dense_results Dense aggregation results
+ * @param[in] gather_map Gather map indicating valid elements in `sparse_results`
+ * @param[in] set Device hash set ref
+ * @param[in] row_bitmask Bitmask indicating the validity of input keys
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
+ * @param[in] mr Device memory resource used to allocate the results added to `dense_results`
+ */
+template <typename SetRef>
+void sparse_to_dense_results(host_span<aggregation_request const> requests,
+                             cudf::detail::result_cache* sparse_results,
+                             cudf::detail::result_cache* dense_results,
+                             device_span<size_type const> gather_map,
+                             SetRef set,
+                             bitmask_type const* row_bitmask,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index a9085a1f1fd..3041e261945 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -26,7 +26,6 @@
 #include <cudf/detail/binaryop.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu
index 82d557b9f7e..d6c900fb689 100644
--- a/cpp/src/groupby/sort/group_quantiles.cu
+++ b/cpp/src/groupby/sort/group_quantiles.cu
@@ -21,6 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
@@ -108,7 +109,7 @@ struct quantiles_functor {
     auto values_view     = column_device_view::create(values, stream);
     auto group_size_view = column_device_view::create(group_sizes, stream);
     auto result_view     = mutable_column_device_view::create(result->mutable_view(), stream);
-    auto null_count      = rmm::device_scalar<cudf::size_type>(0, stream, mr);
+    auto null_count      = cudf::detail::device_scalar<cudf::size_type>(0, stream, mr);
 
     // For each group, calculate quantile
     if (!cudf::is_dictionary(values.type())) {
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu
index 86ee20dbbe2..c3dfac46502 100644
--- a/cpp/src/groupby/sort/group_std.cu
+++ b/cpp/src/groupby/sort/group_std.cu
@@ -20,6 +20,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -27,7 +28,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -134,7 +134,7 @@ struct var_functor {
 
     // set nulls
     auto result_view  = mutable_column_device_view::create(*result, stream);
-    auto null_count   = rmm::device_scalar<cudf::size_type>(0, stream, mr);
+    auto null_count   = cudf::detail::device_scalar<cudf::size_type>(0, stream, mr);
     auto d_null_count = null_count.data();
     thrust::for_each_n(
       rmm::exec_policy(stream),
diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu
index c7bfd4aecf4..a0c51940c87 100644
--- a/cpp/src/hash/md5_hash.cu
+++ b/cpp/src/hash/md5_hash.cu
@@ -302,7 +302,8 @@ std::unique_ptr<column> md5(table_view const& input,
                              }
                              return md5_leaf_type_check(col.type());
                            }),
-               "Unsupported column type for hash function.");
+               "Unsupported column type for hash function.",
+               cudf::data_type_error);
 
   // Digest size in bytes
   auto constexpr digest_size = 32;
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index ebaec8e2775..eb002cf9c6f 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -513,7 +513,8 @@ std::unique_ptr<column> sha_hash(table_view const& input,
   CUDF_EXPECTS(
     std::all_of(
       input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }),
-    "Unsupported column type for hash function.");
+    "Unsupported column type for hash function.",
+    cudf::data_type_error);
 
   // Result column allocation and creation
   auto begin = thrust::make_constant_iterator(Hasher::digest_size);
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index a99262fb3bf..c69ebe12d2c 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -20,11 +20,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-
 #include <nanoarrow/nanoarrow.h>
 
 namespace cudf {
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
index 1b79fbf9eda..e4bdedf6603 100644
--- a/cpp/src/interop/arrow_utilities.hpp
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index a1be6aade4e..4395b741e53 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -16,11 +16,8 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/lists/list_view.hpp>
-#include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index a2874b46b06..fc1b0226a48 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -19,6 +19,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
@@ -35,7 +36,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/for_each.h>
@@ -60,7 +60,7 @@ template <typename>
 struct is_device_scalar : public std::false_type {};
 
 template <typename T>
-struct is_device_scalar<rmm::device_scalar<T>> : public std::true_type {};
+struct is_device_scalar<cudf::detail::device_scalar<T>> : public std::true_type {};
 
 template <typename>
 struct is_device_uvector : public std::false_type {};
@@ -232,10 +232,10 @@ int dispatch_to_arrow_device::operator()<cudf::string_view>(cudf::column&& colum
     // in the offsets buffer. While some arrow implementations may accept a zero-sized
     // offsets buffer, best practices would be to allocate the buffer with the single value.
     if (nanoarrow_type == NANOARROW_TYPE_STRING) {
-      auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
+      auto zero = std::make_unique<cudf::detail::device_scalar<int32_t>>(0, stream, mr);
       NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
     } else {
-      auto zero = std::make_unique<rmm::device_scalar<int64_t>>(0, stream, mr);
+      auto zero = std::make_unique<cudf::detail::device_scalar<int64_t>>(0, stream, mr);
       NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
     }
 
@@ -466,10 +466,10 @@ int dispatch_to_arrow_device_view::operator()<cudf::string_view>(ArrowArray* out
   if (column.size() == 0) {
     // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552
     if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) {
-      auto zero = std::make_unique<rmm::device_scalar<int64_t>>(0, stream, mr);
+      auto zero = std::make_unique<cudf::detail::device_scalar<int64_t>>(0, stream, mr);
       NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
     } else {
-      auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
+      auto zero = std::make_unique<cudf::detail::device_scalar<int32_t>>(0, stream, mr);
       NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
     }
 
diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp
index d5caa4720ac..b3fcca62314 100644
--- a/cpp/src/io/avro/avro.cpp
+++ b/cpp/src/io/avro/avro.cpp
@@ -17,7 +17,6 @@
 #include "avro.hpp"
 
 #include <array>
-#include <cstring>
 #include <unordered_map>
 
 namespace cudf {
diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp
index 2e992546ccc..fd2c781b8a1 100644
--- a/cpp/src/io/avro/avro.hpp
+++ b/cpp/src/io/avro/avro.hpp
@@ -18,11 +18,9 @@
 
 #include "avro_common.hpp"
 
-#include <algorithm>
 #include <array>
 #include <cstddef>
 #include <cstdint>
-#include <cstdio>
 #include <cstring>
 #include <map>
 #include <string>
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 583bd6a3523..2e1cda2d6b7 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -18,9 +18,7 @@
 
 #include "gpuinflate.hpp"
 
-#include <cudf/io/config_utils.hpp>
 #include <cudf/io/nvcomp_adapter.hpp>
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 1af45b41d8e..fb8c308065d 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -24,8 +24,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <cuda_runtime.h>
-
 #include <zlib.h>  // uncompress
 
 #include <cstring>  // memset
@@ -538,8 +536,10 @@ size_t decompress_zstd(host_span<uint8_t const> src,
   CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed");
 
   // Copy temporary output to `dst`
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    dst.data(), d_dst.data(), hd_stats[0].bytes_written, cudaMemcpyDefault, stream.value()));
+  cudf::detail::cuda_memcpy_async(
+    dst.subspan(0, hd_stats[0].bytes_written),
+    device_span<uint8_t const>{d_dst.data(), hd_stats[0].bytes_written},
+    stream);
 
   return hd_stats[0].bytes_written;
 }
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 8c32fc85f78..72fca75c56b 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -21,6 +21,7 @@
 
 #include "csv_common.hpp"
 #include "csv_gpu.hpp"
+#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/io_uncomp.hpp"
 #include "io/utilities/column_buffer.hpp"
 #include "io/utilities/hostdevice_vector.hpp"
@@ -275,11 +276,10 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
     auto const read_offset = byte_range_offset + input_pos + previous_data_size;
     auto const read_size   = target_pos - input_pos - previous_data_size;
     if (data.has_value()) {
-      CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size,
-                                    data->data() + read_offset,
-                                    target_pos - input_pos - previous_data_size,
-                                    cudaMemcpyDefault,
-                                    stream.value()));
+      cudf::detail::cuda_memcpy_async(
+        device_span<char>{d_data.data() + previous_data_size, read_size},
+        data->subspan(read_offset, read_size),
+        stream);
     } else {
       if (source->is_device_read_preferred(read_size)) {
         source->device_read(read_offset,
@@ -288,12 +288,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
                             stream);
       } else {
         auto const buffer = source->host_read(read_offset, read_size);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size,
-                                      buffer->data(),
-                                      buffer->size(),
-                                      cudaMemcpyDefault,
-                                      stream.value()));
-        stream.synchronize();  // To prevent buffer going out of scope before we copy the data.
+        // Use sync version to prevent buffer going out of scope before we copy the data.
+        cudf::detail::cuda_memcpy(
+          device_span<char>{d_data.data() + previous_data_size, read_size},
+          host_span<char const>{reinterpret_cast<char const*>(buffer->data()), buffer->size()},
+          stream);
       }
     }
 
@@ -311,12 +310,10 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
                                                                    range_end,
                                                                    skip_rows,
                                                                    stream);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                  row_ctx.device_ptr(),
-                                  num_blocks * sizeof(uint64_t),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    stream.synchronize();
+
+    cudf::detail::cuda_memcpy(host_span<uint64_t>{row_ctx}.subspan(0, num_blocks),
+                              device_span<uint64_t const>{row_ctx}.subspan(0, num_blocks),
+                              stream);
 
     // Sum up the rows in each character block, selecting the row count that
     // corresponds to the current input context. Also stores the now known input
@@ -331,11 +328,9 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
       // At least one row in range in this batch
       all_row_offsets.resize(total_rows - skip_rows, stream);
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
-                                    row_ctx.host_ptr(),
-                                    num_blocks * sizeof(uint64_t),
-                                    cudaMemcpyDefault,
-                                    stream.value()));
+      cudf::detail::cuda_memcpy_async(device_span<uint64_t>{row_ctx}.subspan(0, num_blocks),
+                                      host_span<uint64_t const>{row_ctx}.subspan(0, num_blocks),
+                                      stream);
 
       // Pass 2: Output row offsets
       cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
@@ -352,12 +347,9 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
                                              stream);
       // With byte range, we want to keep only one row out of the specified range
       if (range_end < data_size) {
-        CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                      row_ctx.device_ptr(),
-                                      num_blocks * sizeof(uint64_t),
-                                      cudaMemcpyDefault,
-                                      stream.value()));
-        stream.synchronize();
+        cudf::detail::cuda_memcpy(host_span<uint64_t>{row_ctx}.subspan(0, num_blocks),
+                                  device_span<uint64_t const>{row_ctx}.subspan(0, num_blocks),
+                                  stream);
 
         size_t rows_out_of_range = 0;
         for (uint32_t i = 0; i < num_blocks; i++) {
@@ -401,12 +393,9 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
   // Remove header rows and extract header
   auto const header_row_index = std::max<size_t>(header_rows, 1) - 1;
   if (header_row_index + 1 < row_offsets.size()) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                  row_offsets.data() + header_row_index,
-                                  2 * sizeof(uint64_t),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    stream.synchronize();
+    cudf::detail::cuda_memcpy(host_span<uint64_t>{row_ctx}.subspan(0, 2),
+                              device_span<uint64_t const>{row_offsets.data() + header_row_index, 2},
+                              stream);
 
     auto const header_start = input_pos + row_ctx[0];
     auto const header_end   = input_pos + row_ctx[1];
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index b84446b5f3e..2bbe05ced84 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -27,6 +27,7 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/fill.hpp>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/detail/csv.hpp>
 #include <cudf/null_mask.hpp>
@@ -405,13 +406,8 @@ void write_chunked(data_sink* out_sink,
     out_sink->device_write(ptr_all_bytes, total_num_bytes, stream);
   } else {
     // copy the bytes to host to write them out
-    thrust::host_vector<char> h_bytes(total_num_bytes);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
-                                  ptr_all_bytes,
-                                  total_num_bytes * sizeof(char),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    stream.synchronize();
+    auto const h_bytes = cudf::detail::make_host_vector_sync(
+      device_span<char const>{ptr_all_bytes, total_num_bytes}, stream);
 
     out_sink->host_write(h_bytes.data(), total_num_bytes);
   }
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index a8682e6a760..ceaeb5d8f85 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -32,10 +32,8 @@
 #include <cudf/io/orc_metadata.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/io/parquet_metadata.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <algorithm>
 #include <utility>
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index f7e8134b68d..570a00cbfc2 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -24,6 +24,7 @@
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
@@ -134,12 +135,13 @@ std::vector<std::string> copy_strings_to_host_sync(
     // build std::string vector from chars and offsets
     std::vector<std::string> host_data;
     host_data.reserve(col.size());
-    std::transform(
-      std::begin(h_offsets),
-      std::end(h_offsets) - 1,
-      std::begin(h_offsets) + 1,
-      std::back_inserter(host_data),
-      [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
+    std::transform(std::begin(h_offsets),
+                   std::end(h_offsets) - 1,
+                   std::begin(h_offsets) + 1,
+                   std::back_inserter(host_data),
+                   [&h_chars](auto start, auto end) {
+                     return std::string(h_chars.data() + start, end - start);
+                   });
     return host_data;
   };
   return to_host(d_column_names->view());
@@ -170,636 +172,78 @@ rmm::device_uvector<uint8_t> is_all_nulls_each_column(device_span<SymbolT const>
   rmm::device_uvector<uint8_t> is_all_nulls(num_cols, stream);
   thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true);
 
-  auto parse_opt = parsing_options(options, stream);
-  thrust::for_each_n(
-    rmm::exec_policy_nosync(stream),
-    thrust::counting_iterator<size_type>(0),
-    num_nodes,
-    [options           = parse_opt.view(),
-     data              = input.data(),
-     column_categories = d_column_tree.node_categories.begin(),
-     col_ids           = col_ids.begin(),
-     range_begin       = tree.node_range_begin.begin(),
-     range_end         = tree.node_range_end.begin(),
-     is_all_nulls      = is_all_nulls.begin()] __device__(size_type i) {
-      auto const node_category = column_categories[col_ids[i]];
-      if (node_category == NC_STR or node_category == NC_VAL) {
-        auto const is_null_literal = serialized_trie_contains(
-          options.trie_na,
-          {data + range_begin[i], static_cast<size_t>(range_end[i] - range_begin[i])});
-        if (!is_null_literal) is_all_nulls[col_ids[i]] = false;
-      }
-    });
-  return is_all_nulls;
-}
-
-NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT const> col_ids,
-                                       bool is_enabled_lines,
-                                       rmm::cuda_stream_view stream)
-{
-  NodeIndexT value = parent_node_sentinel;
-  if (!col_ids.empty()) {
-    auto const list_node_index = is_enabled_lines ? 0 : 1;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
-                                  col_ids.data() + list_node_index,
-                                  sizeof(NodeIndexT),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    stream.synchronize();
-  }
-  return value;
-}
-/**
- * @brief Holds member data pointers of `d_json_column`
- *
- */
-struct json_column_data {
-  using row_offset_t = json_column::row_offset_t;
-  row_offset_t* string_offsets;
-  row_offset_t* string_lengths;
-  row_offset_t* child_offsets;
-  bitmask_type* validity;
-};
-
-using hashmap_of_device_columns =
-  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;
-
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT const> d_unique_col_ids,
-  device_span<size_type const> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-void scatter_offsets(tree_meta_t const& tree,
-                     device_span<NodeIndexT const> col_ids,
-                     device_span<size_type const> row_offsets,
-                     device_span<size_type> node_ids,
-                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
-                     tree_meta_t const& d_column_tree,
-                     host_span<const bool> ignore_vals,
-                     hashmap_of_device_columns const& columns,
-                     rmm::cuda_stream_view stream);
-
-/**
- * @brief Constructs `d_json_column` from node tree representation
- * Newly constructed columns are inserted into `root`'s children.
- * `root` must be a list type.
- *
- * @param input Input JSON string device data
- * @param tree Node tree representation of the JSON string
- * @param col_ids Column ids of the nodes in the tree
- * @param row_offsets Row offsets of the nodes in the tree
- * @param root Root node of the `d_json_column` tree
- * @param is_array_of_arrays Whether the tree is an array of arrays
- * @param options Parsing options specifying the parsing behaviour
- * options affecting behaviour are
- *   is_enabled_lines: Whether the input is a line-delimited JSON
- *   is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the device memory
- * of child_offets and validity members of `d_json_column`
- */
-void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t const& tree,
-                             device_span<NodeIndexT const> col_ids,
-                             device_span<size_type const> row_offsets,
-                             device_json_column& root,
-                             bool is_array_of_arrays,
-                             cudf::io::json_reader_options const& options,
-                             rmm::cuda_stream_view stream,
-                             rmm::device_async_resource_ref mr)
-{
-  bool const is_enabled_lines                 = options.is_enabled_lines();
-  bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
-  // make a copy
-  auto sorted_col_ids = cudf::detail::make_device_uvector_async(
-    col_ids, stream, cudf::get_current_device_resource_ref());
-
-  // sort by {col_id} on {node_ids} stable
-  rmm::device_uvector<NodeIndexT> node_ids(col_ids.size(), stream);
-  thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end());
-  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
-                             sorted_col_ids.begin(),
-                             sorted_col_ids.end(),
-                             node_ids.begin());
-
-  NodeIndexT const row_array_parent_col_id =
-    get_row_array_parent_col_id(col_ids, is_enabled_lines, stream);
-
-  // 1. gather column information.
-  auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] =
-    reduce_to_column_tree(tree,
-                          col_ids,
-                          sorted_col_ids,
-                          node_ids,
-                          row_offsets,
-                          is_array_of_arrays,
-                          row_array_parent_col_id,
-                          stream);
-  auto num_columns                      = d_unique_col_ids.size();
-  std::vector<std::string> column_names = copy_strings_to_host_sync(
-    input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
-  // array of arrays column names
-  if (is_array_of_arrays) {
-    auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
-    auto const column_parent_ids =
-      cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
-    TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
-    auto values_column_indices =
-      get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
-    auto h_values_column_indices =
-      cudf::detail::make_host_vector_sync(values_column_indices, stream);
-    std::transform(unique_col_ids.begin(),
-                   unique_col_ids.end(),
-                   column_names.cbegin(),
-                   column_names.begin(),
-                   [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id](
-                     auto col_id, auto name) mutable {
-                     return column_parent_ids[col_id] == row_array_parent_col_id
-                              ? std::to_string(h_values_column_indices[col_id])
-                              : name;
-                   });
-  }
-
-  auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() {
-    if (is_enabled_mixed_types_as_string) {
-      return cudf::detail::make_std_vector_sync(
-        is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream);
-    }
-    return std::vector<uint8_t>();
-  }();
-  auto const [ignore_vals, columns] = build_tree(root,
-                                                 is_str_column_all_nulls,
-                                                 d_column_tree,
-                                                 d_unique_col_ids,
-                                                 d_max_row_offsets,
-                                                 column_names,
-                                                 row_array_parent_col_id,
-                                                 is_array_of_arrays,
-                                                 options,
-                                                 stream,
-                                                 mr);
-
-  scatter_offsets(tree,
-                  col_ids,
-                  row_offsets,
-                  node_ids,
-                  sorted_col_ids,
-                  d_column_tree,
-                  ignore_vals,
-                  columns,
-                  stream);
-}
-
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT const> d_unique_col_ids,
-  device_span<size_type const> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
-  auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
-  auto column_categories =
-    cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream);
-  auto const column_parent_ids =
-    cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
-  auto column_range_beg =
-    cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
-  auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
-  auto num_columns           = d_unique_col_ids.size();
-  stream.synchronize();
-
-  auto to_json_col_type = [](auto category) {
-    switch (category) {
-      case NC_STRUCT: return json_col_t::StructColumn;
-      case NC_LIST: return json_col_t::ListColumn;
-      case NC_STR: [[fallthrough]];
-      case NC_VAL: return json_col_t::StringColumn;
-      default: return json_col_t::Unknown;
-    }
-  };
-  auto init_to_zero = [stream](auto& v) {
-    thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0);
-  };
-
-  auto initialize_json_columns = [&](auto i, auto& col, auto column_category) {
-    if (column_category == NC_ERR || column_category == NC_FN) {
-      return;
-    } else if (column_category == NC_VAL || column_category == NC_STR) {
-      col.string_offsets.resize(max_row_offsets[i] + 1, stream);
-      col.string_lengths.resize(max_row_offsets[i] + 1, stream);
-      init_to_zero(col.string_offsets);
-      init_to_zero(col.string_lengths);
-    } else if (column_category == NC_LIST) {
-      col.child_offsets.resize(max_row_offsets[i] + 2, stream);
-      init_to_zero(col.child_offsets);
-    }
-    col.num_rows = max_row_offsets[i] + 1;
-    col.validity =
-      cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
-    col.type = to_json_col_type(column_category);
-  };
-
-  auto reinitialize_as_string = [&](auto i, auto& col) {
-    col.string_offsets.resize(max_row_offsets[i] + 1, stream);
-    col.string_lengths.resize(max_row_offsets[i] + 1, stream);
-    init_to_zero(col.string_offsets);
-    init_to_zero(col.string_lengths);
-    col.num_rows = max_row_offsets[i] + 1;
-    col.validity =
-      cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
-    col.type = json_col_t::StringColumn;
-    // destroy references of all child columns after this step, by calling remove_child_columns
-  };
-
-  path_from_tree tree_path{column_categories,
-                           column_parent_ids,
-                           column_names,
-                           is_array_of_arrays,
-                           row_array_parent_col_id};
-
-  // 2. generate nested columns tree and its device_memory
-  // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
-  auto h_range_col_id_it =
-    thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin());
-  std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
-    return thrust::get<0>(a) < thrust::get<0>(b);
-  });
-
-  // use hash map because we may skip field name's col_ids
-  hashmap_of_device_columns columns;
-  // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking
-  std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
-  // find column_ids which are values, but should be ignored in validity
-  auto ignore_vals = cudf::detail::make_host_vector<bool>(num_columns, stream);
-  std::fill(ignore_vals.begin(), ignore_vals.end(), false);
-  std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
-  std::vector<uint8_t> is_pruned(num_columns, 0);
-  // for columns that are not mixed type but have been forced as string
-  std::vector<bool> forced_as_string_column(num_columns);
-  columns.try_emplace(parent_node_sentinel, std::ref(root));
-
-  std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
-    [&](NodeIndexT this_col_id, device_json_column& col) {
-      for (auto const& col_name : col.column_order) {
-        auto child_id                  = mapped_columns[{this_col_id, col_name}];
-        is_mixed_type_column[child_id] = 1;
-        remove_child_columns(child_id, col.child_columns.at(col_name));
-        mapped_columns.erase({this_col_id, col_name});
-        columns.erase(child_id);
-      }
-      col.child_columns.clear();  // their references are deleted above.
-      col.column_order.clear();
-    };
-
-  auto name_and_parent_index = [&is_array_of_arrays,
-                                &row_array_parent_col_id,
-                                &column_parent_ids,
-                                &column_categories,
-                                &column_names](auto this_col_id) {
-    std::string name   = "";
-    auto parent_col_id = column_parent_ids[this_col_id];
-    if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
-      if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
-        name = column_names[this_col_id];
-      } else {
-        name = list_child_name;
-      }
-    } else if (column_categories[parent_col_id] == NC_FN) {
-      auto field_name_col_id = parent_col_id;
-      parent_col_id          = column_parent_ids[parent_col_id];
-      name                   = column_names[field_name_col_id];
-    } else {
-      CUDF_FAIL("Unexpected parent column category");
-    }
-    return std::pair{name, parent_col_id};
-  };
-
-  // Prune columns that are not required to be parsed.
-  if (options.is_enabled_prune_columns()) {
-    for (auto const this_col_id : unique_col_ids) {
-      if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
-        continue;
-      }
-      // Struct, List, String, Value
-      auto [name, parent_col_id] = name_and_parent_index(this_col_id);
-      // get path of this column, and get its dtype if present in options
-      auto const nt                             = tree_path.get_path(this_col_id);
-      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
-      if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) {
-        is_pruned[this_col_id] = 1;
-        continue;
-      } else {
-        // make sure all its parents are not pruned.
-        while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) {
-          is_pruned[parent_col_id] = 0;
-          parent_col_id            = column_parent_ids[parent_col_id];
-        }
-      }
-    }
-  }
-
-  // Build the column tree, also, handles mixed types.
-  for (auto const this_col_id : unique_col_ids) {
-    if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
-      continue;
-    }
-    // Struct, List, String, Value
-    auto [name, parent_col_id] = name_and_parent_index(this_col_id);
-
-    // if parent is mixed type column or this column is pruned or if parent
-    // has been forced as string, ignore this column.
-    if (parent_col_id != parent_node_sentinel &&
-          (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) ||
-        forced_as_string_column[parent_col_id]) {
-      ignore_vals[this_col_id] = true;
-      if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
-      if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; }
-      continue;
-    }
-
-    // If the child is already found,
-    // replace if this column is a nested column and the existing was a value column
-    // ignore this column if this column is a value column and the existing was a nested column
-    auto it = columns.find(parent_col_id);
-    CUDF_EXPECTS(it != columns.end(), "Parent column not found");
-    auto& parent_col = it->second.get();
-    bool replaced    = false;
-    if (mapped_columns.count({parent_col_id, name}) > 0) {
-      auto const old_col_id = mapped_columns[{parent_col_id, name}];
-      // If mixed type as string is enabled, make both of them strings and merge them.
-      // All child columns will be ignored when parsing.
-      if (is_enabled_mixed_types_as_string) {
-        bool const is_mixed_type = [&]() {
-          // If new or old is STR and they are all not null, make it mixed type, else ignore.
-          if (column_categories[this_col_id] == NC_VAL ||
-              column_categories[this_col_id] == NC_STR) {
-            if (is_str_column_all_nulls[this_col_id]) return false;
-          }
-          if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) {
-            if (is_str_column_all_nulls[old_col_id]) return false;
-          }
-          return true;
-        }();
-        if (is_mixed_type) {
-          is_mixed_type_column[this_col_id] = 1;
-          is_mixed_type_column[old_col_id]  = 1;
-          // if old col type (not cat) is list or struct, replace with string.
-          auto& col = columns.at(old_col_id).get();
-          if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
-            reinitialize_as_string(old_col_id, col);
-            remove_child_columns(old_col_id, col);
-            // all its children (which are already inserted) are ignored later.
-          }
-          col.forced_as_string_column = true;
-          columns.try_emplace(this_col_id, columns.at(old_col_id));
-          continue;
-        }
-      }
-
-      if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
-        ignore_vals[this_col_id] = true;
-        continue;
-      }
-      if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) {
-        // remap
-        ignore_vals[old_col_id] = true;
-        mapped_columns.erase({parent_col_id, name});
-        columns.erase(old_col_id);
-        parent_col.child_columns.erase(name);
-        replaced = true;  // to skip duplicate name in column_order
-      } else {
-        // If this is a nested column but we're trying to insert either (a) a list node into a
-        // struct column or (b) a struct node into a list column, we fail
-        CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and
-                          column_categories[this_col_id] == NC_STRUCT) or
-                         (column_categories[old_col_id] == NC_STRUCT and
-                          column_categories[this_col_id] == NC_LIST)),
-                     "A mix of lists and structs within the same column is not supported");
-      }
-    }
-
-    auto this_column_category = column_categories[this_col_id];
-    // get path of this column, check if it is a struct/list forced as string, and enforce it
-    auto const nt                             = tree_path.get_path(this_col_id);
-    std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
-    if ((column_categories[this_col_id] == NC_STRUCT or
-         column_categories[this_col_id] == NC_LIST) and
-        user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
-      this_column_category = NC_STR;
-    }
-
-    CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
-    // move into parent
-    device_json_column col(stream, mr);
-    initialize_json_columns(this_col_id, col, this_column_category);
-    if ((column_categories[this_col_id] == NC_STRUCT or
-         column_categories[this_col_id] == NC_LIST) and
-        user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
-      col.forced_as_string_column          = true;
-      forced_as_string_column[this_col_id] = true;
-    }
-
-    auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second;
-    CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
-    if (not replaced) parent_col.column_order.push_back(name);
-    columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name)));
-    mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id);
-  }
-
-  if (is_enabled_mixed_types_as_string) {
-    // ignore all children of mixed type columns
-    for (auto const this_col_id : unique_col_ids) {
-      auto parent_col_id = column_parent_ids[this_col_id];
-      if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) {
-        is_mixed_type_column[this_col_id] = 1;
-        ignore_vals[this_col_id]          = true;
-        columns.erase(this_col_id);
-      }
-      // Convert only mixed type columns as string (so to copy), but not its children
-      if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and
-          is_mixed_type_column[this_col_id] == 1)
-        column_categories[this_col_id] = NC_STR;
-    }
-    cudf::detail::cuda_memcpy_async<NodeT>(
-      d_column_tree.node_categories, column_categories, stream);
-  }
-
-  // ignore all children of columns forced as string
-  for (auto const this_col_id : unique_col_ids) {
-    auto parent_col_id = column_parent_ids[this_col_id];
-    if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) {
-      forced_as_string_column[this_col_id] = true;
-      ignore_vals[this_col_id]             = true;
-    }
-    // Convert only mixed type columns as string (so to copy), but not its children
-    if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and
-        forced_as_string_column[this_col_id])
-      column_categories[this_col_id] = NC_STR;
-  }
-  cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, column_categories, stream);
-
-  // restore unique_col_ids order
-  std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
-    return thrust::get<1>(a) < thrust::get<1>(b);
-  });
-  return {ignore_vals, columns};
-}
-
-void scatter_offsets(tree_meta_t const& tree,
-                     device_span<NodeIndexT const> col_ids,
-                     device_span<size_type const> row_offsets,
-                     device_span<size_type> node_ids,
-                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
-                     tree_meta_t const& d_column_tree,
-                     host_span<const bool> ignore_vals,
-                     hashmap_of_device_columns const& columns,
-                     rmm::cuda_stream_view stream)
-{
-  auto const num_nodes   = col_ids.size();
-  auto const num_columns = d_column_tree.node_categories.size();
-  // move columns data to device.
-  auto columns_data = cudf::detail::make_host_vector<json_column_data>(num_columns, stream);
-  for (auto& [col_id, col_ref] : columns) {
-    if (col_id == parent_node_sentinel) continue;
-    auto& col            = col_ref.get();
-    columns_data[col_id] = json_column_data{col.string_offsets.data(),
-                                            col.string_lengths.data(),
-                                            col.child_offsets.data(),
-                                            static_cast<bitmask_type*>(col.validity.data())};
-  }
-
-  auto d_ignore_vals = cudf::detail::make_device_uvector_async(
-    ignore_vals, stream, cudf::get_current_device_resource_ref());
-  auto d_columns_data = cudf::detail::make_device_uvector_async(
-    columns_data, stream, cudf::get_current_device_resource_ref());
-
-  // 3. scatter string offsets to respective columns, set validity bits
-  thrust::for_each_n(
-    rmm::exec_policy_nosync(stream),
-    thrust::counting_iterator<size_type>(0),
-    num_nodes,
-    [column_categories = d_column_tree.node_categories.begin(),
-     col_ids           = col_ids.begin(),
-     row_offsets       = row_offsets.begin(),
-     range_begin       = tree.node_range_begin.begin(),
-     range_end         = tree.node_range_end.begin(),
-     d_ignore_vals     = d_ignore_vals.begin(),
-     d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
-      if (d_ignore_vals[col_ids[i]]) return;
-      auto const node_category = column_categories[col_ids[i]];
-      switch (node_category) {
-        case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
-        case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
-        case NC_STR: [[fallthrough]];
-        case NC_VAL:
-          if (d_ignore_vals[col_ids[i]]) break;
-          set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]);
-          d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i];
-          d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i];
-          break;
-        default: break;
-      }
-    });
-
-  // 4. scatter List offset
-  // copy_if only node's whose parent is list, (node_id, parent_col_id)
-  // stable_sort by parent_col_id of {node_id}.
-  // For all unique parent_node_id of (i==0, i-1!=i), write start offset.
-  //                                  (i==last, i+1!=i), write end offset.
-  //    unique_copy_by_key {parent_node_id} {row_offset} to
-  //    col[parent_col_id].child_offsets[row_offset[parent_node_id]]
-
-  auto& parent_col_ids = sorted_col_ids;  // reuse sorted_col_ids
-  auto parent_col_id   = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_type>(0),
-    cuda::proclaim_return_type<NodeIndexT>(
-      [col_ids         = col_ids.begin(),
-       parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) {
-        return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel
-                                                                  : col_ids[parent_node_ids[node_id]];
-      }));
-  auto const list_children_end = thrust::copy_if(
-    rmm::exec_policy_nosync(stream),
-    thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id),
-    thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id) +
-      num_nodes,
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
-    [d_ignore_vals     = d_ignore_vals.begin(),
-     parent_node_ids   = tree.parent_node_ids.begin(),
-     column_categories = d_column_tree.node_categories.begin(),
-     col_ids           = col_ids.begin()] __device__(size_type node_id) {
-      auto parent_node_id = parent_node_ids[node_id];
-      return parent_node_id != parent_node_sentinel and
-             column_categories[col_ids[parent_node_id]] == NC_LIST and
-             (!d_ignore_vals[col_ids[parent_node_id]]);
-    });
-
-  auto const num_list_children =
-    list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
-  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
-                             parent_col_ids.begin(),
-                             parent_col_ids.begin() + num_list_children,
-                             node_ids.begin());
+  auto parse_opt = parsing_options(options, stream);
   thrust::for_each_n(
     rmm::exec_policy_nosync(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    num_list_children,
-    [node_ids        = node_ids.begin(),
-     parent_node_ids = tree.parent_node_ids.begin(),
-     parent_col_ids  = parent_col_ids.begin(),
-     row_offsets     = row_offsets.begin(),
-     d_columns_data  = d_columns_data.begin(),
-     num_list_children] __device__(size_type i) {
-      auto const node_id        = node_ids[i];
-      auto const parent_node_id = parent_node_ids[node_id];
-      // scatter to list_offset
-      if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) {
-        d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] =
-          row_offsets[node_id];
-      }
-      // last value of list child_offset is its size.
-      if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) {
-        d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] =
-          row_offsets[node_id] + 1;
+    thrust::counting_iterator<size_type>(0),
+    num_nodes,
+    [options           = parse_opt.view(),
+     data              = input.data(),
+     column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin(),
+     range_begin       = tree.node_range_begin.begin(),
+     range_end         = tree.node_range_end.begin(),
+     is_all_nulls      = is_all_nulls.begin()] __device__(size_type i) {
+      auto const node_category = column_categories[col_ids[i]];
+      if (node_category == NC_STR or node_category == NC_VAL) {
+        auto const is_null_literal = serialized_trie_contains(
+          options.trie_na,
+          {data + range_begin[i], static_cast<size_t>(range_end[i] - range_begin[i])});
+        if (!is_null_literal) is_all_nulls[col_ids[i]] = false;
       }
     });
+  return is_all_nulls;
+}
 
-  // 5. scan on offsets.
-  for (auto& [id, col_ref] : columns) {
-    auto& col = col_ref.get();
-    if (col.type == json_col_t::StringColumn) {
-      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
-                             col.string_offsets.begin(),
-                             col.string_offsets.end(),
-                             col.string_offsets.begin(),
-                             thrust::maximum<json_column::row_offset_t>{});
-    } else if (col.type == json_col_t::ListColumn) {
-      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
-                             col.child_offsets.begin(),
-                             col.child_offsets.end(),
-                             col.child_offsets.begin(),
-                             thrust::maximum<json_column::row_offset_t>{});
-    }
-  }
-  stream.synchronize();
+NodeIndexT get_row_array_parent_col_id(device_span<NodeIndexT const> col_ids,
+                                       bool is_enabled_lines,
+                                       rmm::cuda_stream_view stream)
+{
+  if (col_ids.empty()) { return parent_node_sentinel; }
+  // Pick the entry to read: index 0 for line-delimited JSON input, index 1 otherwise.
+  auto const list_node_index = is_enabled_lines ? 0 : 1;
+  auto const value           = cudf::detail::make_host_vector_sync(
+    device_span<NodeIndexT const>{col_ids.data() + list_node_index, 1}, stream);
+  // NOTE(review): only emptiness is checked; index 1 presumes col_ids.size() >= 2 - confirm.
+  return value[0];
 }
+/**
+ * @brief Holds member data pointers of `d_json_column`
+ * Plain aggregate of four raw pointers; the struct itself declares no ownership or cleanup.
+ */
+struct json_column_data {
+  using row_offset_t = json_column::row_offset_t;
+  row_offset_t* string_offsets;
+  row_offset_t* string_lengths;
+  row_offset_t* child_offsets;
+  bitmask_type* validity;
+};
+
+using hashmap_of_device_columns =
+  std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;
+
+std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
+  device_json_column& root,
+  host_span<uint8_t const> is_str_column_all_nulls,
+  tree_meta_t& d_column_tree,  // non-const: the definition overwrites its node_categories
+  device_span<NodeIndexT const> d_unique_col_ids,
+  device_span<size_type const> d_max_row_offsets,
+  std::vector<std::string> const& column_names,
+  NodeIndexT row_array_parent_col_id,
+  bool is_array_of_arrays,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);  // forward declaration; defined later in this file
 
-namespace experimental {
+void scatter_offsets(tree_meta_t const& tree,
+                     device_span<NodeIndexT const> col_ids,
+                     device_span<size_type const> row_offsets,
+                     device_span<size_type> node_ids,
+                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+                     tree_meta_t const& d_column_tree,
+                     host_span<const bool> ignore_vals,
+                     hashmap_of_device_columns const& columns,
+                     rmm::cuda_stream_view stream);
 
 std::map<std::string, schema_element> unified_schema(cudf::io::json_reader_options const& options)
 {
@@ -829,19 +273,6 @@ std::map<std::string, schema_element> unified_schema(cudf::io::json_reader_optio
     options.get_dtypes());
 }
 
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT const> d_unique_col_ids,
-  device_span<size_type const> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
 /**
  * @brief Constructs `d_json_column` from node tree representation
  * Newly constructed columns are inserted into `root`'s children.
@@ -1033,7 +464,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
   std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns());
 
   // prune all children of a column, but not self.
-  auto ignore_all_children = [&](auto parent_col_id) {
+  auto ignore_all_children = [&adj, &is_pruned](auto parent_col_id) {
     std::deque<NodeIndexT> offspring;
     if (adj.count(parent_col_id)) {
       for (auto const& child : adj[parent_col_id]) {
@@ -1384,14 +815,149 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
                  column_categories.cbegin(),
                  expected_types.begin(),
                  [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; });
-  cudaMemcpyAsync(d_column_tree.node_categories.begin(),
-                  expected_types.data(),
-                  expected_types.size() * sizeof(column_categories[0]),
-                  cudaMemcpyDefault,
-                  stream.value());
+  cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, expected_types, stream);
 
   return {is_pruned, columns};
 }
-}  // namespace experimental
+
+void scatter_offsets(tree_meta_t const& tree,
+                     device_span<NodeIndexT const> col_ids,
+                     device_span<size_type const> row_offsets,
+                     device_span<size_type> node_ids,
+                     device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
+                     tree_meta_t const& d_column_tree,
+                     host_span<const bool> ignore_vals,
+                     hashmap_of_device_columns const& columns,
+                     rmm::cuda_stream_view stream)
+{
+  auto const num_nodes   = col_ids.size();
+  auto const num_columns = d_column_tree.node_categories.size();
+  // move columns data to device.
+  auto columns_data = cudf::detail::make_host_vector<json_column_data>(num_columns, stream);
+  for (auto& [col_id, col_ref] : columns) {
+    if (col_id == parent_node_sentinel) continue;
+    auto& col            = col_ref.get();
+    columns_data[col_id] = json_column_data{col.string_offsets.data(),
+                                            col.string_lengths.data(),
+                                            col.child_offsets.data(),
+                                            static_cast<bitmask_type*>(col.validity.data())};
+  }
+
+  auto d_ignore_vals = cudf::detail::make_device_uvector_async(
+    ignore_vals, stream, cudf::get_current_device_resource_ref());
+  auto d_columns_data = cudf::detail::make_device_uvector_async(
+    columns_data, stream, cudf::get_current_device_resource_ref());
+
+  // 3. scatter string offsets to respective columns, set validity bits
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::counting_iterator<size_type>(0),
+    num_nodes,
+    [column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin(),
+     row_offsets       = row_offsets.begin(),
+     range_begin       = tree.node_range_begin.begin(),
+     range_end         = tree.node_range_end.begin(),
+     d_ignore_vals     = d_ignore_vals.begin(),
+     d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
+      if (d_ignore_vals[col_ids[i]]) return;
+      auto const node_category = column_categories[col_ids[i]];
+      switch (node_category) {
+        case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
+        case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
+        case NC_STR: [[fallthrough]];
+        case NC_VAL:
+          if (d_ignore_vals[col_ids[i]]) break;
+          set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]);
+          d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i];
+          d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i];
+          break;
+        default: break;
+      }
+    });
+
+  // 4. scatter List offset
+  // copy_if only nodes whose parent is a list, (node_id, parent_col_id)
+  // stable_sort by parent_col_id of {node_id}.
+  // For all unique parent_node_id of (i==0, i-1!=i), write start offset.
+  //                                  (i==last, i+1!=i), write end offset.
+  //    unique_copy_by_key {parent_node_id} {row_offset} to
+  //    col[parent_col_id].child_offsets[row_offset[parent_node_id]]
+
+  auto& parent_col_ids = sorted_col_ids;  // reuse sorted_col_ids
+  auto parent_col_id   = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<size_type>(0),
+    cuda::proclaim_return_type<NodeIndexT>(
+      [col_ids         = col_ids.begin(),
+       parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) {
+        return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel
+                                                                  : col_ids[parent_node_ids[node_id]];
+      }));
+  auto const list_children_end = thrust::copy_if(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id),
+    thrust::make_zip_iterator(thrust::make_counting_iterator<size_type>(0), parent_col_id) +
+      num_nodes,
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
+    [d_ignore_vals     = d_ignore_vals.begin(),
+     parent_node_ids   = tree.parent_node_ids.begin(),
+     column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin()] __device__(size_type node_id) {
+      auto parent_node_id = parent_node_ids[node_id];
+      return parent_node_id != parent_node_sentinel and
+             column_categories[col_ids[parent_node_id]] == NC_LIST and
+             (!d_ignore_vals[col_ids[parent_node_id]]);
+    });
+
+  auto const num_list_children =
+    list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
+  thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
+                             parent_col_ids.begin(),
+                             parent_col_ids.begin() + num_list_children,
+                             node_ids.begin());
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    num_list_children,
+    [node_ids        = node_ids.begin(),
+     parent_node_ids = tree.parent_node_ids.begin(),
+     parent_col_ids  = parent_col_ids.begin(),
+     row_offsets     = row_offsets.begin(),
+     d_columns_data  = d_columns_data.begin(),
+     num_list_children] __device__(size_type i) {
+      auto const node_id        = node_ids[i];
+      auto const parent_node_id = parent_node_ids[node_id];
+      // scatter to list_offset
+      if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) {
+        d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] =
+          row_offsets[node_id];
+      }
+      // last value of list child_offset is its size.
+      if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) {
+        d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] =
+          row_offsets[node_id] + 1;
+      }
+    });
+
+  // 5. scan on offsets.
+  for (auto& [id, col_ref] : columns) {
+    auto& col = col_ref.get();
+    if (col.type == json_col_t::StringColumn) {
+      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
+                             col.string_offsets.begin(),
+                             col.string_offsets.end(),
+                             col.string_offsets.begin(),
+                             thrust::maximum<json_column::row_offset_t>{});
+    } else if (col.type == json_col_t::ListColumn) {
+      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
+                             col.child_offsets.begin(),
+                             col.child_offsets.end(),
+                             col.child_offsets.begin(),
+                             thrust::maximum<json_column::row_offset_t>{});
+    }
+  }
+  stream.synchronize();
+}
 
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 912e93d52ae..7e4d975e431 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -485,16 +485,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
   }
 }
 
-template <typename... Args>
-auto make_device_json_column_dispatch(bool experimental, Args&&... args)
-{
-  if (experimental) {
-    return experimental::make_device_json_column(std::forward<Args>(args)...);
-  } else {
-    return make_device_json_column(std::forward<Args>(args)...);
-  }
-}
-
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
@@ -523,16 +513,14 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
 #endif
 
   bool const is_array_of_arrays = [&]() {
-    std::array<node_t, 2> h_node_categories = {NC_ERR, NC_ERR};
-    auto const size_to_copy                 = std::min(size_t{2}, gpu_tree.node_categories.size());
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(),
-                                  gpu_tree.node_categories.data(),
-                                  sizeof(node_t) * size_to_copy,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    stream.synchronize();
+    auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size());
+    if (size_to_copy == 0) return false;
+    auto const h_node_categories = cudf::detail::make_host_vector_sync(
+      device_span<NodeT const>{gpu_tree.node_categories.data(), size_to_copy}, stream);
+
     if (options.is_enabled_lines()) return h_node_categories[0] == NC_LIST;
-    return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST;
+    return h_node_categories.size() >= 2 and h_node_categories[0] == NC_LIST and
+           h_node_categories[1] == NC_LIST;
   }();
 
   auto [gpu_col_id, gpu_row_offsets] =
@@ -553,16 +541,15 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                0);
 
   // Get internal JSON column
-  make_device_json_column_dispatch(options.is_enabled_experimental(),
-                                   d_input,
-                                   gpu_tree,
-                                   gpu_col_id,
-                                   gpu_row_offsets,
-                                   root_column,
-                                   is_array_of_arrays,
-                                   options,
-                                   stream,
-                                   mr);
+  make_device_json_column(d_input,
+                          gpu_tree,
+                          gpu_col_id,
+                          gpu_row_offsets,
+                          root_column,
+                          is_array_of_arrays,
+                          options,
+                          stream,
+                          mr);
 
   // data_root refers to the root column of the data represented by the given JSON string
   auto& data_root =
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 2d435dc8e1a..34a87918e57 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -16,6 +16,7 @@
 
 #include "io/fst/lookup_tables.cuh"
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
@@ -24,7 +25,6 @@
 
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -316,7 +316,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
                           stream);
 
   rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  cudf::detail::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
   parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
                    static_cast<SymbolOffsetT>(indata.size()),
                    static_cast<SymbolT*>(outbuf.data()),
@@ -401,7 +401,7 @@ std::
                           stream);
 
   rmm::device_uvector<size_type> outbuf_indices(inbuf.size(), stream, mr);
-  rmm::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
+  cudf::detail::device_scalar<SymbolOffsetT> outbuf_indices_size(stream, mr);
   parser.Transduce(inbuf.data(),
                    static_cast<SymbolOffsetT>(inbuf.size()),
                    thrust::make_discard_iterator(),
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index d949635c1cc..e2fe926ea19 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -264,16 +264,13 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
       error_count > 0) {
     auto const error_location =
       thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin);
-    SymbolOffsetT error_index;
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(&error_index,
-                      token_indices.data() + thrust::distance(tokens.begin(), error_location),
-                      sizeof(SymbolOffsetT),
-                      cudaMemcpyDefault,
-                      stream.value()));
-    stream.synchronize();
+    auto error_index = cudf::detail::make_host_vector_sync<SymbolOffsetT>(
+      device_span<SymbolOffsetT const>{
+        token_indices.data() + thrust::distance(tokens.begin(), error_location), 1},
+      stream);
+
     CUDF_FAIL("JSON Parser encountered an invalid format at location " +
-              std::to_string(error_index));
+              std::to_string(error_index[0]));
   }
 
   auto const num_tokens = tokens.size();
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 3d9a51833e0..7b3b04dea16 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -19,10 +19,7 @@
 #include <cudf/io/detail/tokenize_json.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/bit.hpp>
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <map>
 #include <vector>
@@ -405,21 +402,6 @@ void make_device_json_column(device_span<SymbolT const> input,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 
-namespace experimental {
-/**
- * @copydoc cudf::io::json::detail::make_device_json_column
- */
-void make_device_json_column(device_span<SymbolT const> input,
-                             tree_meta_t const& tree,
-                             device_span<NodeIndexT const> col_ids,
-                             device_span<size_type const> row_offsets,
-                             device_json_column& root,
-                             bool is_array_of_arrays,
-                             cudf::io::json_reader_options const& options,
-                             rmm::cuda_stream_view stream,
-                             rmm::device_async_resource_ref mr);
-}  // namespace experimental
-
 /**
  * @brief Retrieves the parse_options to be used for type inference and type casting
  *
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 69a51fab5dc..60e78f4763d 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -21,6 +21,7 @@
 #include "nested_json.hpp"
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
@@ -34,7 +35,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -1446,11 +1446,7 @@ void get_stack_context(device_span<SymbolT const> json_in,
   constexpr StackSymbolT read_symbol = 'x';
 
   // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes)
-  rmm::device_scalar<SymbolOffsetT> d_num_stack_ops(stream);
-
-  // Sequence of stack symbols and their position in the original input (sparse representation)
-  rmm::device_uvector<StackSymbolT> stack_ops{json_in.size(), stream};
-  rmm::device_uvector<SymbolOffsetT> stack_op_indices{json_in.size(), stream};
+  cudf::detail::device_scalar<SymbolOffsetT> d_num_stack_ops(stream);
 
   // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes
   constexpr auto max_translation_table_size =
@@ -1468,11 +1464,26 @@ void get_stack_context(device_span<SymbolT const> json_in,
 
   // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end
   // of structs/lists
+  // Run FST with discarded outputs to count the stack operations and size the buffers exactly
+  json_to_stack_ops_fst.Transduce(json_in.begin(),
+                                  static_cast<SymbolOffsetT>(json_in.size()),
+                                  thrust::make_discard_iterator(),
+                                  thrust::make_discard_iterator(),
+                                  d_num_stack_ops.data(),
+                                  to_stack_op::start_state,
+                                  stream);
+
+  auto stack_ops_bufsize = d_num_stack_ops.value(stream);
+  // Sequence of stack symbols and their position in the original input (sparse representation)
+  rmm::device_uvector<StackSymbolT> stack_ops{stack_ops_bufsize, stream};
+  rmm::device_uvector<SymbolOffsetT> stack_op_indices{stack_ops_bufsize, stream};
+
+  // Run bracket-brace FST to retrieve starting positions of structs and lists
   json_to_stack_ops_fst.Transduce(json_in.begin(),
                                   static_cast<SymbolOffsetT>(json_in.size()),
                                   stack_ops.data(),
                                   stack_op_indices.data(),
-                                  d_num_stack_ops.data(),
+                                  thrust::make_discard_iterator(),
                                   to_stack_op::start_state,
                                   stream);
 
@@ -1508,6 +1519,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
   device_span<SymbolOffsetT const> token_indices,
   rmm::cuda_stream_view stream)
 {
+  CUDF_FUNC_RANGE();
   // Instantiate FST for post-processing the token stream to remove all tokens that belong to an
   // invalid JSON line
   token_filter::UnwrapTokenFromSymbolOp sgid_op{};
@@ -1519,7 +1531,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
     stream);
 
   auto const mr = cudf::get_current_device_resource_ref();
-  rmm::device_scalar<SymbolOffsetT> d_num_selected_tokens(stream, mr);
+  cudf::detail::device_scalar<SymbolOffsetT> d_num_selected_tokens(stream, mr);
   rmm::device_uvector<PdaTokenT> filtered_tokens_out{tokens.size(), stream, mr};
   rmm::device_uvector<SymbolOffsetT> filtered_token_indices_out{tokens.size(), stream, mr};
 
@@ -1638,26 +1650,33 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   std::size_t constexpr max_tokens_per_struct = 6;
   auto const max_token_out_count =
     cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct;
-  rmm::device_scalar<std::size_t> num_written_tokens{stream};
+  cudf::detail::device_scalar<std::size_t> num_written_tokens{stream};
   // In case we're recovering on invalid JSON lines, post-processing the token stream requires to
   // see a JSON-line delimiter as the very first item
   SymbolOffsetT const delimiter_offset =
     (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0);
-  rmm::device_uvector<PdaTokenT> tokens{max_token_out_count + delimiter_offset, stream, mr};
-  rmm::device_uvector<SymbolOffsetT> tokens_indices{
-    max_token_out_count + delimiter_offset, stream, mr};
 
+  // Run FST with discarded outputs to count the tokens and size the output buffers exactly
   json_to_tokens_fst.Transduce(zip_in,
                                static_cast<SymbolOffsetT>(json_in.size()),
-                               tokens.data() + delimiter_offset,
-                               tokens_indices.data() + delimiter_offset,
+                               thrust::make_discard_iterator(),
+                               thrust::make_discard_iterator(),
                                num_written_tokens.data(),
                                tokenizer_pda::start_state,
                                stream);
 
   auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset;
-  tokens.resize(num_total_tokens, stream);
-  tokens_indices.resize(num_total_tokens, stream);
+  rmm::device_uvector<PdaTokenT> tokens{num_total_tokens, stream, mr};
+  rmm::device_uvector<SymbolOffsetT> tokens_indices{num_total_tokens, stream, mr};
+
+  // Run FST to translate the input JSON string into tokens and indices at which they occur
+  json_to_tokens_fst.Transduce(zip_in,
+                               static_cast<SymbolOffsetT>(json_in.size()),
+                               tokens.data() + delimiter_offset,
+                               tokens_indices.data() + delimiter_offset,
+                               thrust::make_discard_iterator(),
+                               tokenizer_pda::start_state,
+                               stream);
 
   if (delimiter_offset == 1) {
     tokens.set_element(0, token_t::LineEnd, stream);
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index c424d2b3b62..2bc15ea19cb 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -20,6 +20,7 @@
 
 #include <cudf/concatenate.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
@@ -127,7 +128,8 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
 
   std::size_t const total_source_size       = sources_size(sources, 0, 0);
   auto constexpr num_delimiter_chars        = 1;
-  auto const num_extra_delimiters           = num_delimiter_chars * (sources.size() - 1);
+  auto const delimiter                      = reader_opts.get_delimiter();
+  auto const num_extra_delimiters           = num_delimiter_chars * sources.size();
   compression_type const reader_compression = reader_opts.get_compression();
   std::size_t const chunk_offset            = reader_opts.get_byte_range_offset();
   std::size_t chunk_size                    = reader_opts.get_byte_range_size();
@@ -135,10 +137,10 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
                "Invalid offsetting",
                std::invalid_argument);
-  auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
-  chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
+  auto should_load_till_last_source = !chunk_size || chunk_size >= total_source_size - chunk_offset;
+  chunk_size = should_load_till_last_source ? total_source_size - chunk_offset : chunk_size;
 
-  int num_subchunks_prealloced        = should_load_all_sources ? 0 : max_subchunks_prealloced;
+  int num_subchunks_prealloced        = should_load_till_last_source ? 0 : max_subchunks_prealloced;
   std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
 
   // The allocation for single source compressed input is estimated by assuming a ~4:1
@@ -155,17 +157,17 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
 
   // Offset within buffer indicating first read position
   std::int64_t buffer_offset = 0;
-  auto readbufspan =
-    ingest_raw_input(bufspan, sources, reader_compression, chunk_offset, chunk_size, stream);
+  auto readbufspan           = ingest_raw_input(
+    bufspan, sources, reader_compression, chunk_offset, chunk_size, delimiter, stream);
 
   auto const shift_for_nonzero_offset = std::min<std::int64_t>(chunk_offset, 1);
   auto const first_delim_pos =
-    chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream);
+    chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, delimiter, stream);
   if (first_delim_pos == -1) {
     // return empty owning datasource buffer
     auto empty_buf = rmm::device_buffer(0, stream);
     return datasource::owning_buffer<rmm::device_buffer>(std::move(empty_buf));
-  } else if (!should_load_all_sources) {
+  } else if (!should_load_till_last_source) {
     // Find next delimiter
     std::int64_t next_delim_pos     = -1;
     std::size_t next_subchunk_start = chunk_offset + chunk_size;
@@ -180,14 +182,15 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
                                        reader_compression,
                                        next_subchunk_start,
                                        size_per_subchunk,
+                                       delimiter,
                                        stream);
-        next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+        next_delim_pos = find_first_delimiter(readbufspan, delimiter, stream) + buffer_offset;
         next_subchunk_start += size_per_subchunk;
       }
       if (next_delim_pos < buffer_offset) {
         if (next_subchunk_start >= total_source_size) {
           // If we have reached the end of source list but the source does not terminate with a
-          // newline character
+          // delimiter character
           next_delim_pos = buffer_offset + readbufspan.size();
         } else {
           // Our buffer_size estimate is insufficient to read until the end of the line! We need to
@@ -209,10 +212,26 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
       reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
       next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
+
+  // Add delimiter to end of buffer - possibly adding an empty line to the input buffer - iff we are
+  // reading till the end of the last source, i.e. should_load_till_last_source is true. Note that
+  // the table generated from the JSONL input remains unchanged since empty lines are ignored by
+  // the parser.
+  size_t num_chars = readbufspan.size() - first_delim_pos - shift_for_nonzero_offset;
+  if (num_chars) {
+    auto last_char = delimiter;
+    cudf::detail::cuda_memcpy_async<char>(
+      device_span<char>(reinterpret_cast<char*>(buffer.data()), buffer.size())
+        .subspan(readbufspan.size(), 1),
+      host_span<char const>(&last_char, 1, false),
+      stream);
+    num_chars++;
+  }
+
   return datasource::owning_buffer<rmm::device_buffer>(
     std::move(buffer),
     reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
-    readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
+    num_chars);
 }
 
 // Helper function to read the current batch using byte range offsets and size
@@ -245,6 +264,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                                    compression_type compression,
                                    std::size_t range_offset,
                                    std::size_t range_size,
+                                   char delimiter,
                                    rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
@@ -296,7 +316,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
     if (sources.size() > 1) {
       static_assert(num_delimiter_chars == 1,
                     "Currently only single-character delimiters are supported");
-      auto const delimiter_source = thrust::make_constant_iterator('\n');
+      auto const delimiter_source = thrust::make_constant_iterator(delimiter);
       auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
         delimiter_map, stream, cudf::get_current_device_resource_ref());
       thrust::scatter(rmm::exec_policy_nosync(stream),
@@ -315,13 +335,12 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   // Reading to host because decompression of a single block is much faster on the CPU
   sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data());
   auto uncomp_data = decompress(compression, hbuffer);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(),
-                                reinterpret_cast<char*>(uncomp_data.data()),
-                                uncomp_data.size() * sizeof(char),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
-  stream.synchronize();
-  return buffer.first(uncomp_data.size());
+  auto ret_buffer  = buffer.first(uncomp_data.size());
+  cudf::detail::cuda_memcpy<char>(
+    ret_buffer,
+    host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
+    stream);
+  return ret_buffer;
 }
 
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 982190eecb5..4def69cc629 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -56,6 +56,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                                    compression_type compression,
                                    size_t range_offset,
                                    size_t range_size,
+                                   char delimiter,
                                    rmm::cuda_stream_view stream);
 
 /**
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 9cc77e8e738..fcaee9c548e 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -516,10 +516,10 @@ void reader_impl::load_next_stripe_data(read_mode mode)
         _stream.synchronize();
         stream_synchronized = true;
       }
-      device_read_tasks.push_back(
-        std::pair(source_ptr->device_read_async(
-                    read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream),
-                  read_info.length));
+      device_read_tasks.emplace_back(
+        source_ptr->device_read_async(
+          read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream),
+        read_info.length);
 
     } else {
       auto buffer = source_ptr->host_read(read_info.offset, read_info.length);
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index a1e4aa65dcf..0081ed30d17 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -22,6 +22,8 @@
 #include "io/utilities/hostdevice_span.hpp"
 
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
@@ -32,7 +34,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -451,7 +452,7 @@ void decode_stream_data(int64_t num_dicts,
     update_null_mask(chunks, out_buffers, stream, mr);
   }
 
-  rmm::device_scalar<size_type> error_count(0, stream);
+  cudf::detail::device_scalar<size_type> error_count(0, stream);
   gpu::DecodeOrcColumnData(chunks.base_device_ptr(),
                            global_dict.data(),
                            row_groups,
diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp
index 4c1079cffe8..7e5db4b7617 100644
--- a/cpp/src/io/orc/reader_impl_helpers.cpp
+++ b/cpp/src/io/orc/reader_impl_helpers.cpp
@@ -16,8 +16,6 @@
 
 #include "reader_impl_helpers.hpp"
 
-#include <cudf/utilities/memory_resource.hpp>
-
 namespace cudf::io::orc::detail {
 
 std::unique_ptr<column> create_empty_column(size_type orc_col_id,
diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp
index 5528b2ee763..4cded30d89b 100644
--- a/cpp/src/io/orc/reader_impl_helpers.hpp
+++ b/cpp/src/io/orc/reader_impl_helpers.hpp
@@ -20,9 +20,6 @@
 #include "io/orc/orc.hpp"
 #include "io/utilities/column_buffer.hpp"
 
-#include <cudf/io/orc.hpp>
-#include <cudf/utilities/memory_resource.hpp>
-
 #include <rmm/cuda_stream_view.hpp>
 
 #include <memory>
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 03020eb649f..d432deb8e79 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -19,6 +19,7 @@
  * @brief cuDF-IO ORC writer class implementation
  */
 
+#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/orc_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
@@ -1408,7 +1409,8 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
     num_entries_seen += stripes_per_col;
   }
 
-  std::vector<statistics_merge_group> file_stats_merge(num_file_blobs);
+  auto file_stats_merge =
+    cudf::detail::make_host_vector<statistics_merge_group>(num_file_blobs, stream);
   for (auto i = 0u; i < num_file_blobs; ++i) {
     auto col_stats         = &file_stats_merge[i];
     col_stats->col_dtype   = per_chunk_stats.col_types[i];
@@ -1418,11 +1420,10 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
   }
 
   auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge,
-                                file_stats_merge.data(),
-                                num_file_blobs * sizeof(statistics_merge_group),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<statistics_merge_group>(
+    device_span<statistics_merge_group>{stats_merge.device_ptr(num_stripe_blobs), num_file_blobs},
+    file_stats_merge,
+    stream);
 
   auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs;
   detail::merge_group_statistics<detail::io_file_format::ORC>(
@@ -1573,7 +1574,7 @@ void write_index_stream(int32_t stripe_id,
  * @param[in] strm_desc Stream's descriptor
  * @param[in] enc_stream Chunk's streams
  * @param[in] compressed_data Compressed stream data
- * @param[in,out] stream_out Temporary host output buffer
+ * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer
  * @param[in,out] stripe Stream's parent stripe
  * @param[in,out] streams List of all streams
  * @param[in] compression_kind The compression kind
@@ -1584,7 +1585,7 @@ void write_index_stream(int32_t stripe_id,
 std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
                                     gpu::encoder_chunk_streams const& enc_stream,
                                     uint8_t const* compressed_data,
-                                    uint8_t* stream_out,
+                                    host_span<uint8_t> bounce_buffer,
                                     StripeInformation* stripe,
                                     orc_streams* streams,
                                     CompressionKind compression_kind,
@@ -1604,11 +1605,10 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
     if (out_sink->is_device_write_preferred(length)) {
       return out_sink->device_write_async(stream_in, length, stream);
     } else {
-      CUDF_CUDA_TRY(
-        cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value()));
-      stream.synchronize();
+      cudf::detail::cuda_memcpy(
+        bounce_buffer.subspan(0, length), device_span<uint8_t const>{stream_in, length}, stream);
 
-      out_sink->host_write(stream_out, length);
+      out_sink->host_write(bounce_buffer.data(), length);
       return std::async(std::launch::deferred, [] {});
     }
   }();
@@ -2616,7 +2616,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
         strm_desc,
         enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first],
         compressed_data.data(),
-        bounce_buffer.data(),
+        bounce_buffer,
         &stripe,
         &streams,
         _compression_kind,
diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp
index ddf65e9020f..d15435b2553 100644
--- a/cpp/src/io/parquet/arrow_schema_writer.cpp
+++ b/cpp/src/io/parquet/arrow_schema_writer.cpp
@@ -27,7 +27,6 @@
 #include "ipc/Schema_generated.h"
 #include "writer_impl_helpers.hpp"
 
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp
index 9bc435bf6c8..66810ee163a 100644
--- a/cpp/src/io/parquet/arrow_schema_writer.hpp
+++ b/cpp/src/io/parquet/arrow_schema_writer.hpp
@@ -22,10 +22,9 @@
 #pragma once
 
 #include <cudf/detail/utilities/linked_column.hpp>
-#include <cudf/io/data_sink.hpp>
-#include <cudf/io/detail/parquet.hpp>
+#include <cudf/io/detail/utils.hpp>
+#include <cudf/io/types.hpp>
 #include <cudf/strings/detail/utilities.hpp>
-#include <cudf/types.hpp>
 
 namespace cudf::io::parquet::detail {
 
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 12c24e2b848..b87f2e9c692 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -22,10 +22,7 @@
 
 #include <algorithm>
 #include <cstddef>
-#include <optional>
-#include <string>
 #include <utility>
-#include <vector>
 
 namespace CUDF_EXPORT cudf {
 namespace io::parquet::detail {
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index d4778b1ea15..05859d60c03 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include "parquet.hpp"
-#include "parquet_common.hpp"
 
 #include <algorithm>
 #include <cstddef>
diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index 4522ea7fe56..45380e6ea20 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -37,7 +37,14 @@ struct block_scan_results {
 };
 
 template <int decode_block_size>
-static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results)
+using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size];
+
+// Similar to CUB, must __syncthreads() after calling if reusing temp_storage
+template <int decode_block_size>
+__device__ inline static void scan_block_exclusive_sum(
+  int thread_bit,
+  block_scan_results& results,
+  block_scan_temp_storage<decode_block_size>& temp_storage)
 {
   int const t              = threadIdx.x;
   int const warp_index     = t / cudf::detail::warp_size;
@@ -45,15 +52,19 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul
   uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1;
 
   uint32_t warp_bits = ballot(thread_bit);
-  scan_block_exclusive_sum<decode_block_size>(warp_bits, warp_lane, warp_index, lane_mask, results);
+  scan_block_exclusive_sum<decode_block_size>(
+    warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage);
 }
 
+// Similar to CUB, must __syncthreads() after calling if reusing temp_storage
 template <int decode_block_size>
-__device__ static void scan_block_exclusive_sum(uint32_t warp_bits,
-                                                int warp_lane,
-                                                int warp_index,
-                                                uint32_t lane_mask,
-                                                block_scan_results& results)
+__device__ static void scan_block_exclusive_sum(
+  uint32_t warp_bits,
+  int warp_lane,
+  int warp_index,
+  uint32_t lane_mask,
+  block_scan_results& results,
+  block_scan_temp_storage<decode_block_size>& temp_storage)
 {
   // Compute # warps
   constexpr int num_warps = decode_block_size / cudf::detail::warp_size;
@@ -64,49 +75,64 @@ __device__ static void scan_block_exclusive_sum(uint32_t warp_bits,
   results.thread_count_within_warp = __popc(results.warp_bits & lane_mask);
 
   // Share the warp counts amongst the block threads
-  __shared__ int warp_counts[num_warps];
-  if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; }
-  __syncthreads();
+  if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; }
+  __syncthreads();  // Sync to share counts between threads/warps
 
   // Compute block-wide results
   results.block_count               = 0;
   results.thread_count_within_block = results.thread_count_within_warp;
   for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) {
-    results.block_count += warp_counts[warp_idx];
-    if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; }
+    results.block_count += temp_storage[warp_idx];
+    if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; }
   }
 }
 
-template <int block_size, typename state_buf>
-__device__ inline void gpuDecodeFixedWidthValues(
+template <int block_size, bool has_lists_t, typename state_buf>
+__device__ void gpuDecodeFixedWidthValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
   constexpr int num_warps      = block_size / cudf::detail::warp_size;
   constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-  int const dtype                          = s->col.physical_type;
+  // nesting level that is storing actual leaf values
+  int const leaf_level_index = s->col.max_nesting_depth - 1;
+  auto const data_out        = s->nesting_info[leaf_level_index].data_out;
+
+  int const dtype          = s->col.physical_type;
+  uint32_t const dtype_len = s->dtype_len;
+
+  int const skipped_leaf_values = s->page.skipped_leaf_values;
 
   // decode values
   int pos = start;
   while (pos < end) {
     int const batch_size = min(max_batch_size, end - pos);
-
     int const target_pos = pos + batch_size;
-    int const src_pos    = pos + t;
+    int const thread_pos = pos + t;
 
-    // the position in the output column/buffer
-    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+    // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls)
+    int const dst_pos = [&]() {
+      int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(thread_pos)];
+      if constexpr (!has_lists_t) { dst_pos -= s->first_row; }
+      return dst_pos;
+    }();
 
     // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
     // before first_row) in the flat hierarchy case.
-    if (src_pos < target_pos && dst_pos >= 0) {
+    if (thread_pos < target_pos && dst_pos >= 0) {
       // nesting level that is storing actual leaf values
-      int const leaf_level_index = s->col.max_nesting_depth - 1;
 
-      uint32_t dtype_len = s->dtype_len;
-      void* dst =
-        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      // src_pos represents the logical row position we want to read from. But in the case of
+      // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos
+      // has to take into account the # of values we have to skip in the page to get to the
+      // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+      int const src_pos = [&]() {
+        if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; }
+        return thread_pos;
+      }();
+
+      void* const dst = data_out + (static_cast<size_t>(dst_pos) * dtype_len);
+
       if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) {
         switch (dtype) {
           case INT32: gpuOutputFast(s, sb, src_pos, static_cast<uint32_t*>(dst)); break;
@@ -145,15 +171,15 @@ __device__ inline void gpuDecodeFixedWidthValues(
   }
 }
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 struct decode_fixed_width_values_func {
   __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
   {
-    gpuDecodeFixedWidthValues<block_size, state_buf>(s, sb, start, end, t);
+    gpuDecodeFixedWidthValues<block_size, has_lists_t, state_buf>(s, sb, start, end, t);
   }
 };
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 __device__ inline void gpuDecodeFixedWidthSplitValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
@@ -161,10 +187,15 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
   constexpr int num_warps      = block_size / warp_size;
   constexpr int max_batch_size = num_warps * warp_size;
 
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-  int const dtype                          = s->col.physical_type;
-  auto const data_len                      = thrust::distance(s->data_start, s->data_end);
-  auto const num_values                    = data_len / s->dtype_len_in;
+  // nesting level that is storing actual leaf values
+  int const leaf_level_index = s->col.max_nesting_depth - 1;
+  auto const data_out        = s->nesting_info[leaf_level_index].data_out;
+
+  int const dtype       = s->col.physical_type;
+  auto const data_len   = thrust::distance(s->data_start, s->data_end);
+  auto const num_values = data_len / s->dtype_len_in;
+
+  int const skipped_leaf_values = s->page.skipped_leaf_values;
 
   // decode values
   int pos = start;
@@ -172,21 +203,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
     int const batch_size = min(max_batch_size, end - pos);
 
     int const target_pos = pos + batch_size;
-    int const src_pos    = pos + t;
+    int const thread_pos = pos + t;
 
     // the position in the output column/buffer
-    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+    // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls)
+    int const dst_pos = [&]() {
+      int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(thread_pos)];
+      if constexpr (!has_lists_t) { dst_pos -= s->first_row; }
+      return dst_pos;
+    }();
 
     // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
     // before first_row) in the flat hierarchy case.
-    if (src_pos < target_pos && dst_pos >= 0) {
-      // nesting level that is storing actual leaf values
-      int const leaf_level_index = s->col.max_nesting_depth - 1;
+    if (thread_pos < target_pos && dst_pos >= 0) {
+      // src_pos represents the logical row position we want to read from. But in the case of
+      // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos
+      // has to take into account the # of values we have to skip in the page to get to the
+      // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+      int const src_pos = [&]() {
+        if constexpr (has_lists_t) {
+          return thread_pos + skipped_leaf_values;
+        } else {
+          return thread_pos;
+        }
+      }();
 
-      uint32_t dtype_len = s->dtype_len;
-      uint8_t const* src = s->data_start + src_pos;
-      uint8_t* dst =
-        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      uint32_t const dtype_len = s->dtype_len;
+      uint8_t const* const src = s->data_start + src_pos;
+      uint8_t* const dst       = data_out + static_cast<size_t>(dst_pos) * dtype_len;
       auto const is_decimal =
         s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
 
@@ -239,11 +283,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
   }
 }
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 struct decode_fixed_width_split_values_func {
   __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
   {
-    gpuDecodeFixedWidthSplitValues<block_size, state_buf>(s, sb, start, end, t);
+    gpuDecodeFixedWidthSplitValues<block_size, has_lists_t, state_buf>(s, sb, start, end, t);
   }
 };
 
@@ -274,12 +318,14 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested(
     int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
     // definition level
-    int d = 1;
-    if (t >= batch_size) {
-      d = -1;
-    } else if (def) {
-      d = static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
-    }
+    int const d = [&]() {
+      if (t >= batch_size) {
+        return -1;
+      } else if (def) {
+        return static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
+      }
+      return 1;
+    }();
 
     int const thread_value_count = t;
     int const block_value_count  = batch_size;
@@ -340,6 +386,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested(
         if (is_valid) {
           int const dst_pos = value_count + thread_value_count;
           int const src_pos = max_depth_valid_count + thread_valid_count;
+
           sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
         }
         // update stuff
@@ -396,16 +443,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
     int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
 
     // use definition level & row bounds to determine if is valid
-    int is_valid;
-    if (t >= batch_size) {
-      is_valid = 0;
-    } else if (def) {
-      int const def_level =
-        static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
-      is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0;
-    } else {
-      is_valid = in_row_bounds;
-    }
+    int const is_valid = [&]() {
+      if (t >= batch_size) {
+        return 0;
+      } else if (def) {
+        int const def_level =
+          static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
+        return ((def_level > 0) && in_row_bounds) ? 1 : 0;
+      }
+      return in_row_bounds;
+    }();
 
     // thread and block validity count
     using block_scan = cub::BlockScan<int, decode_block_size>;
@@ -447,8 +494,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
 
     // output offset
     if (is_valid) {
-      int const dst_pos                                          = value_count + thread_value_count;
-      int const src_pos                                          = valid_count + thread_valid_count;
+      int const dst_pos = value_count + thread_value_count;
+      int const src_pos = valid_count + thread_valid_count;
+
       sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
     }
 
@@ -460,7 +508,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
   if (t == 0) {
     // update valid value count for decoding and total # of values we've processed
     ni.valid_count       = valid_count;
-    ni.value_count       = value_count;  // TODO: remove? this is unused in the non-list path
+    ni.value_count       = value_count;
     s->nz_count          = valid_count;
     s->input_value_count = value_count;
     s->input_row_count   = value_count;
@@ -533,6 +581,239 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v
   return valid_count;
 }
 
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count,
+                                                          page_state_s* s,
+                                                          state_buf* sb,
+                                                          level_t const* const def,
+                                                          level_t const* const rep,
+                                                          int t)
+{
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
+
+  // how many (input) values we've processed in the page so far, prior to this loop iteration
+  int value_count = s->input_value_count;
+
+  // how many rows we've processed in the page so far
+  int input_row_count = s->input_row_count;
+
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row = s->first_row;
+  int const last_row  = first_row + s->num_rows;
+
+  int const row_index_lower_bound = s->row_index_lower_bound;
+  int const max_depth             = s->col.max_nesting_depth - 1;
+  int max_depth_valid_count       = s->nesting_info[max_depth].valid_count;
+
+  int const warp_index     = t / cudf::detail::warp_size;
+  int const warp_lane      = t % cudf::detail::warp_size;
+  bool const is_first_lane = (warp_lane == 0);
+
+  __syncthreads();
+  __shared__ block_scan_temp_storage<decode_block_size> temp_storage;
+
+  while (value_count < target_value_count) {
+    bool const within_batch = value_count + t < target_value_count;
+
+    // Per-thread levels: start_depth comes from the repetition level, end_depth from the
+    // definition level (max_depth if there is none); out-of-batch threads get (-1, -1, -1).
+    auto const [def_level, start_depth, end_depth] = [&]() {
+      if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); }
+
+      int const level_index = rolling_index<state_buf::nz_buf_size>(value_count + t);
+      int const rep_level   = static_cast<int>(rep[level_index]);
+      int const start_depth = s->nesting_info[rep_level].start_depth;
+
+      if constexpr (!nullable) {
+        return cuda::std::make_tuple(-1, start_depth, max_depth);
+      } else {
+        if (def != nullptr) {
+          int const def_level = static_cast<int>(def[level_index]);
+          return cuda::std::make_tuple(
+            def_level, start_depth, s->nesting_info[def_level].end_depth);
+        } else {
+          return cuda::std::make_tuple(1, start_depth, max_depth);
+        }
+      }
+    }();
+
+    // Determine value count & row index
+    //  track (page-relative) row index for the thread so we can compare against input bounds
+    //  keep track of overall # of rows we've read.
+    int const is_new_row = start_depth == 0 ? 1 : 0;
+    int num_prior_new_rows, total_num_new_rows;
+    {
+      block_scan_results new_row_scan_results;
+      scan_block_exclusive_sum<decode_block_size>(is_new_row, new_row_scan_results, temp_storage);
+      __syncthreads();  // temp_storage is reused by the scans below
+      num_prior_new_rows = new_row_scan_results.thread_count_within_block;
+      total_num_new_rows = new_row_scan_results.block_count;
+    }
+
+    int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1);
+    input_row_count += total_num_new_rows;
+    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
+
+    // VALUE COUNT:
+    // in_nesting_bounds: if at a nesting level where we need to add value indices
+    // the bounds: from current rep to the rep AT the def depth
+    int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0;
+    int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count;
+    {
+      block_scan_results value_count_scan_results;
+      scan_block_exclusive_sum<decode_block_size>(
+        in_nesting_bounds, value_count_scan_results, temp_storage);
+      __syncthreads();
+
+      thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp;
+      warp_value_count               = value_count_scan_results.warp_count;
+      thread_value_count             = value_count_scan_results.thread_count_within_block;
+      block_value_count              = value_count_scan_results.block_count;
+    }
+
+    // iterate by depth
+    for (int d_idx = 0; d_idx <= max_depth; d_idx++) {
+      auto& ni = s->nesting_info[d_idx];
+
+      // everything up to the max_def_level is a non-null value
+      int const is_valid = [&](int input_def_level) {
+        if constexpr (nullable) {
+          return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0;
+        } else {
+          return in_nesting_bounds;
+        }
+      }(def_level);
+
+      // VALID COUNT:
+      // Not all values visited by this block will represent a value at this nesting level.
+      // the validity bit for thread t might actually represent output value t-6.
+      // the correct position for thread t's bit is thread_value_count.
+      uint32_t const warp_valid_mask =
+        WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp);
+      int thread_valid_count, block_valid_count;
+      {
+        auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1;
+
+        block_scan_results valid_count_scan_results;
+        scan_block_exclusive_sum<decode_block_size>(warp_valid_mask,
+                                                    warp_lane,
+                                                    warp_index,
+                                                    thread_mask,
+                                                    valid_count_scan_results,
+                                                    temp_storage);
+        __syncthreads();
+        thread_valid_count = valid_count_scan_results.thread_count_within_block;
+        block_valid_count  = valid_count_scan_results.block_count;
+      }
+
+      // compute warp and thread value counts for the -next- nesting level. we need to
+      // do this for lists so that we can emit an offset for the -current- nesting level.
+      // the offset for the current nesting level == current length of the next nesting level
+      int next_thread_value_count_within_warp = 0, next_warp_value_count = 0;
+      int next_thread_value_count = 0, next_block_value_count = 0;
+      int next_in_nesting_bounds = 0;
+      if (d_idx < max_depth) {
+        // NEXT DEPTH VALUE COUNT:
+        next_in_nesting_bounds =
+          ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0;
+        {
+          block_scan_results next_value_count_scan_results;
+          scan_block_exclusive_sum<decode_block_size>(
+            next_in_nesting_bounds, next_value_count_scan_results, temp_storage);
+          __syncthreads();
+
+          next_thread_value_count_within_warp =
+            next_value_count_scan_results.thread_count_within_warp;
+          next_warp_value_count   = next_value_count_scan_results.warp_count;
+          next_thread_value_count = next_value_count_scan_results.thread_count_within_block;
+          next_block_value_count  = next_value_count_scan_results.block_count;
+        }
+
+        // STORE OFFSET TO THE LIST LOCATION
+        // if we're -not- at a leaf column and we're within nesting/row bounds
+        // and we have a valid data_out pointer, it implies this is a list column, so
+        // emit an offset.
+        if (in_nesting_bounds && ni.data_out != nullptr) {
+          const auto& next_ni = s->nesting_info[d_idx + 1];
+          int const idx       = ni.value_count + thread_value_count;
+          cudf::size_type const ofs =
+            next_ni.value_count + next_thread_value_count + next_ni.page_start_value;
+
+          (reinterpret_cast<cudf::size_type*>(ni.data_out))[idx] = ofs;
+        }
+      }
+
+      // validity is processed per-warp (on lane 0's)
+      // this is because when atomic writes are needed, they are 32-bit operations
+      //
+      // lists always read and write to the same bounds
+      // (that is, read and write positions are already pre-bounded by first_row/num_rows).
+      // since we are about to write the validity vector
+      // here we need to adjust our computed mask to take into account the write row bounds.
+      if constexpr (nullable) {
+        if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) {
+          // absolute bit offset into the output validity map
+          // is cumulative sum of warp_value_count at the given nesting depth
+          // DON'T subtract by first_row: since it's lists it's not 1-row-per-value
+          int const bit_offset = ni.valid_map_offset + thread_value_count;
+
+          store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count);
+        }
+
+        if (t == 0) { ni.null_count += block_value_count - block_valid_count; }
+      }
+
+      // if this is valid and we're at the leaf, output dst_pos
+      // Read value_count before the sync, so that when thread 0 modifies it we've already read its
+      // value
+      int const current_value_count = ni.value_count;
+      __syncthreads();  // guard against modification of ni.value_count below
+      if (d_idx == max_depth) {
+        if (is_valid) {
+          int const dst_pos      = current_value_count + thread_value_count;
+          int const src_pos      = max_depth_valid_count + thread_valid_count;
+          int const output_index = rolling_index<state_buf::nz_buf_size>(src_pos);
+
+          // Index from rolling buffer of values (which doesn't include nulls) to final array (which
+          // includes gaps for nulls)
+          sb->nz_idx[output_index] = dst_pos;
+        }
+        max_depth_valid_count += block_valid_count;
+      }
+
+      // thread 0 advances this level's running counts by the block totals
+      if (t == 0) {
+        ni.value_count += block_value_count;
+        ni.valid_map_offset += block_value_count;
+      }
+      __syncthreads();  // sync modification of ni.value_count
+
+      // propagate value counts for the next depth level
+      block_value_count              = next_block_value_count;
+      thread_value_count             = next_thread_value_count;
+      in_nesting_bounds              = next_in_nesting_bounds;
+      warp_value_count               = next_warp_value_count;
+      thread_value_count_within_warp = next_thread_value_count_within_warp;
+    }  // END OF DEPTH LOOP
+
+    int const batch_size = min(max_batch_size, target_value_count - value_count);
+    value_count += batch_size;
+  }
+
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    s->nesting_info[max_depth].valid_count = max_depth_valid_count;
+    s->nz_count                            = max_depth_valid_count;
+    s->input_value_count                   = value_count;
+
+    // If we have lists # rows != # values
+    s->input_row_count = input_row_count;
+  }
+
+  return max_depth_valid_count;
+}
+
 // is the page marked nullable or not
 __device__ inline bool is_nullable(page_state_s* s)
 {
@@ -560,6 +841,23 @@ __device__ inline bool maybe_has_nulls(page_state_s* s)
   return run_val != s->col.max_level[lvl];
 }
 
+template <int rolling_buf_size, typename stream_type>
+__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t)
+{
+  // The stream's skip_decode() can stop short of num_to_skip: e.g. when skipping 5000 values with
+  // a run of length 2000 starting at 4000, it skips only 4000. The remainder is decoded by the
+  // loop below, at most rolling_buf_size values per decode_next() call.
+  int num_skipped = parquet_stream.skip_decode(t, num_to_skip);
+  while (num_skipped < num_to_skip) {
+    // TODO: Instead of decoding, skip within the run to the appropriate location
+    auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped);
+    num_skipped += parquet_stream.decode_next(t, to_decode);
+    __syncthreads();
+  }
+
+  return num_skipped;
+}
+
 /**
  * @brief Kernel for computing fixed width non dictionary column data stored in the pages
  *
@@ -579,9 +877,10 @@ template <typename level_t,
           decode_kernel_mask kernel_mask_t,
           bool has_dict_t,
           bool has_nesting_t,
-          template <int block_size, typename state_buf>
+          bool has_lists_t,
+          template <int block_size, bool decode_has_lists_t, typename state_buf>
           typename DecodeValuesFunc>
-CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
+CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
   gpuDecodePageDataGeneric(PageInfo* pages,
                            device_span<ColumnChunkDesc const> chunks,
                            size_t min_row,
@@ -621,31 +920,29 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
   // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
   if (s->num_rows == 0) { return; }
 
-  DecodeValuesFunc<decode_block_size_t, state_buf_t> decode_values;
+  DecodeValuesFunc<decode_block_size_t, has_lists_t, state_buf_t> decode_values;
 
-  bool const nullable             = is_nullable(s);
-  bool const should_process_nulls = nullable && maybe_has_nulls(s);
+  bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s);
 
   // shared buffer. all shared memory is suballocated out of here
-  // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size *
-  // sizeof(rle_run<level_t>), size_t{16}) : 0;
+  constexpr int shared_rep_size =
+    has_lists_t
+      ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16})
+      : 0;
   constexpr int shared_dict_size =
     has_dict_t
       ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<uint32_t>), size_t{16})
       : 0;
   constexpr int shared_def_size =
     cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16});
-  constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size;
+  constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size;
   __shared__ __align__(16) uint8_t shared_buf[shared_buf_size];
 
   // setup all shared memory buffers
-  int shared_offset = 0;
-  /*
-  rle_run<level_t> *rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
-  if constexpr (has_lists_t){
-    shared_offset += shared_rep_size;
-  }
-  */
+  int shared_offset          = 0;
+  rle_run<level_t>* rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
+  if constexpr (has_lists_t) { shared_offset += shared_rep_size; }
+
   rle_run<uint32_t>* dict_runs = reinterpret_cast<rle_run<uint32_t>*>(shared_buf + shared_offset);
   if constexpr (has_dict_t) { shared_offset += shared_dict_size; }
   rle_run<level_t>* def_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
@@ -660,38 +957,51 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
                      def,
                      s->page.num_input_values);
   }
-  /*
+
   rle_stream<level_t, decode_block_size_t, rolling_buf_size> rep_decoder{rep_runs};
   level_t* const rep = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::REPETITION]);
-  if constexpr(has_lists_t){
+  if constexpr (has_lists_t) {
     rep_decoder.init(s->col.level_bits[level_type::REPETITION],
                      s->abs_lvl_start[level_type::REPETITION],
                      s->abs_lvl_end[level_type::REPETITION],
                      rep,
                      s->page.num_input_values);
   }
-  */
 
   rle_stream<uint32_t, decode_block_size_t, rolling_buf_size> dict_stream{dict_runs};
   if constexpr (has_dict_t) {
     dict_stream.init(
       s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
   }
-  __syncthreads();
 
   // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
+  // - processed_count: number of values out of num_input_values that we have decoded so far.
   //   the definition stream returns the number of total rows it has processed in each call
   //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
+  // - valid_count: number of non-null values we have decoded so far. In each iteration of the
   //   loop below, we look at the number of valid items (which could be all for non-nullable),
   //   and valid_count is that running count.
   int processed_count = 0;
   int valid_count     = 0;
+
+  // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists)
+  if constexpr (has_lists_t) {
+    auto const skipped_leaf_values = s->page.skipped_leaf_values;
+    if (skipped_leaf_values > 0) {
+      if (should_process_nulls) {
+        skip_decode<rolling_buf_size>(def_decoder, skipped_leaf_values, t);
+      }
+      processed_count = skip_decode<rolling_buf_size>(rep_decoder, skipped_leaf_values, t);
+      if constexpr (has_dict_t) {
+        skip_decode<rolling_buf_size>(dict_stream, skipped_leaf_values, t);
+      }
+    }
+  }
+
   // the core loop. decode batches of level stream data using rle_stream objects
   // and pass the results to gpuDecodeValues
   // For chunked reads we may not process all of the rows on the page; if not stop early
-  int last_row = s->first_row + s->num_rows;
+  int const last_row = s->first_row + s->num_rows;
   while ((s->error == 0) && (processed_count < s->page.num_input_values) &&
          (s->input_row_count <= last_row)) {
     int next_valid_count;
@@ -701,7 +1011,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
       processed_count += def_decoder.decode_next(t);
       __syncthreads();
 
-      if constexpr (has_nesting_t) {
+      if constexpr (has_lists_t) {
+        rep_decoder.decode_next(t);
+        __syncthreads();
+        next_valid_count = gpuUpdateValidityAndRowIndicesLists<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, rep, t);
+      } else if constexpr (has_nesting_t) {
         next_valid_count = gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, level_t>(
           processed_count, s, sb, def, t);
       } else {
@@ -713,9 +1028,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
     // nz_idx.  gpuDecodeFixedWidthValues would be the only work that happens.
     else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count =
-        gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
+      if constexpr (has_lists_t) {
+        processed_count += rep_decoder.decode_next(t);
+        __syncthreads();
+        next_valid_count = gpuUpdateValidityAndRowIndicesLists<decode_block_size_t, false, level_t>(
+          processed_count, s, sb, nullptr, rep, t);
+      } else {
+        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
+        next_valid_count =
+          gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
+      }
     }
     __syncthreads();
 
@@ -745,6 +1067,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                   size_t min_row,
                                   int level_type_size,
                                   bool has_nesting,
+                                  bool is_list,
                                   kernel_error::pointer error_code,
                                   rmm::cuda_stream_view stream)
 {
@@ -754,12 +1077,23 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
   dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -769,17 +1103,29 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -789,6 +1135,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -802,6 +1149,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                       size_t min_row,
                                       int level_type_size,
                                       bool has_nesting,
+                                      bool is_list,
                                       kernel_error::pointer error_code,
                                       rmm::cuda_stream_view stream)
 {
@@ -811,12 +1159,23 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
                                true,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -826,17 +1185,29 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                decode_kernel_mask::FIXED_WIDTH_DICT,
                                true,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
                                true,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -846,6 +1217,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                decode_kernel_mask::FIXED_WIDTH_DICT,
                                true,
                                false,
+                               false,  // has_lists_t: flat (non-list) dictionary path
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -860,6 +1232,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                               size_t min_row,
                               int level_type_size,
                               bool has_nesting,
+                              bool is_list,
                               kernel_error::pointer error_code,
                               rmm::cuda_stream_view stream)
 {
@@ -869,12 +1242,23 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST,
+                               false,  // has_dict_t: BYTE_STREAM_SPLIT is not dictionary encoded
+                               true,   // has_nesting_t
+                               true,   // has_lists_t
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -884,17 +1268,29 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST,
+                               false,  // has_dict_t: BYTE_STREAM_SPLIT is not dictionary encoded
+                               true,   // has_nesting_t
+                               true,   // has_lists_t
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -904,6 +1300,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp
index f0fc9fab3ab..8b3d1d7a6c3 100644
--- a/cpp/src/io/parquet/error.hpp
+++ b/cpp/src/io/parquet/error.hpp
@@ -26,7 +26,7 @@
 namespace cudf::io::parquet {
 
 /**
- * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in
+ * @brief Specialized device scalar for use in reporting errors that occur in
  * kernel calls.
  *
  * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index b3276c81c1f..0d24fa4236f 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -21,6 +21,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 
 namespace cudf::io::parquet::detail {
@@ -476,9 +477,9 @@ void WriteFinalOffsets(host_span<size_type const> offsets,
   auto d_src_data = cudf::detail::make_device_uvector_async(
     offsets, stream, cudf::get_current_device_resource_ref());
   // Iterator for the source (scalar) data
-  auto src_iter = cudf::detail::make_counting_transform_iterator(
-    static_cast<std::size_t>(0),
-    cuda::proclaim_return_type<size_type*>(
+  auto src_iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<std::size_t>(0),
+    cuda::proclaim_return_type<cudf::size_type*>(
       [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; }));
 
   // Copy buffer addresses to device and create an iterator
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index d604642be54..52d53cb8225 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -183,17 +183,20 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
     return decode_kernel_mask::STRING;
   }
 
-  if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
+  if (!is_byte_array(chunk) && !is_boolean(chunk)) {
     if (page.encoding == Encoding::PLAIN) {
-      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
-                              : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
+      return is_list(chunk)     ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST
+             : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
+                                : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
     } else if (page.encoding == Encoding::PLAIN_DICTIONARY ||
                page.encoding == Encoding::RLE_DICTIONARY) {
-      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
-                              : decode_kernel_mask::FIXED_WIDTH_DICT;
+      return is_list(chunk)     ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST
+             : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
+                                : decode_kernel_mask::FIXED_WIDTH_DICT;
     } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
-      return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
-                              : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
+      return is_list(chunk)     ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST
+             : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
+                                : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
     }
   }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 4f6d41a97da..dba24b553e6 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -22,14 +22,13 @@
 #include "io/parquet/parquet_common.hpp"
 #include "io/statistics/statistics.cuh"
 #include "io/utilities/column_buffer.hpp"
-#include "io/utilities/hostdevice_vector.hpp"
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <cuda/atomic>
@@ -221,6 +220,10 @@ enum class decode_kernel_mask {
     (1 << 9),                              // Same as above but for nested, fixed-width data
   FIXED_WIDTH_NO_DICT_NESTED = (1 << 10),  // Run decode kernel for fixed width non-dictionary pages
   FIXED_WIDTH_DICT_NESTED    = (1 << 11),  // Run decode kernel for fixed width dictionary pages
+  FIXED_WIDTH_DICT_LIST      = (1 << 12),  // Run decode kernel for fixed width dictionary lists
+  FIXED_WIDTH_NO_DICT_LIST   = (1 << 13),  // Run decode kernel for fixed width non-dictionary lists
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST =
+    (1 << 14),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists
 };
 
 // mask representing all the ways in which a string can be encoded
@@ -909,6 +912,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -918,6 +922,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                          size_t min_row,
                          int level_type_size,
                          bool has_nesting,
+                         bool is_list,
                          kernel_error::pointer error_code,
                          rmm::cuda_stream_view stream);
 
@@ -933,6 +938,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -942,6 +948,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
                              size_t min_row,
                              int level_type_size,
                              bool has_nesting,
+                             bool is_list,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream);
 
@@ -957,6 +964,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -966,6 +974,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages
                                    size_t min_row,
                                    int level_type_size,
                                    bool has_nesting,
+                                   bool is_list,
                                    kernel_error::pointer error_code,
                                    rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index f0a0bc0b51b..a965f3325d5 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -23,7 +23,6 @@
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -454,15 +453,18 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
   CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8,
                "Filter expression must return a boolean column");
 
-  auto num_bitmasks = num_bitmask_words(predicate.size());
-  std::vector<bitmask_type> host_bitmask(num_bitmasks, ~bitmask_type{0});
-  if (predicate.nullable()) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(host_bitmask.data(),
-                                  predicate.null_mask(),
-                                  num_bitmasks * sizeof(bitmask_type),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-  }
+  auto const host_bitmask = [&] {
+    auto const num_bitmasks = num_bitmask_words(predicate.size());
+    if (predicate.nullable()) {
+      return cudf::detail::make_host_vector_sync(
+        device_span<bitmask_type const>(predicate.null_mask(), num_bitmasks), stream);
+    } else {
+      auto bitmask = cudf::detail::make_host_vector<bitmask_type>(num_bitmasks, stream);
+      std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0});
+      return bitmask;
+    }
+  }();
+
   auto validity_it = cudf::detail::make_counting_transform_iterator(
     0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });
 
diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index dd354b905f3..170c6e8857f 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -16,8 +16,6 @@
 
 #include "reader_impl.hpp"
 
-#include <cudf/utilities/memory_resource.hpp>
-
 namespace cudf::io::parquet::detail {
 
 reader::reader() = default;
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index f0865c715bc..689386b8957 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -21,11 +21,9 @@
 #include <cudf/detail/stream_compaction.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <bitset>
@@ -78,7 +76,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
   // TODO: This step is somewhat redundant if size info has already been calculated (nested schema,
   // chunked reader).
   auto const has_strings = (kernel_mask & STRINGS_MASK) != 0;
-  std::vector<size_t> col_string_sizes(_input_columns.size(), 0L);
+  auto col_string_sizes  = cudf::detail::make_host_vector<size_t>(_input_columns.size(), _stream);
   if (has_strings) {
     // need to compute pages bounds/sizes if we lack page indexes or are using custom bounds
     // TODO: we could probably dummy up size stats for FLBA data since we know the width
@@ -274,6 +272,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                   skip_rows,
                                   level_type_size,
                                   false,
+                                  false,
                                   error_code.data(),
                                   streams[s_idx++]);
   }
@@ -286,6 +285,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                   skip_rows,
                                   level_type_size,
                                   true,
+                                  false,
+                                  error_code.data(),
+                                  streams[s_idx++]);
+  }
+
+  // launch byte stream split decoder, for list columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  true,
+                                  true,
                                   error_code.data(),
                                   streams[s_idx++]);
   }
@@ -309,6 +322,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         skip_rows,
                         level_type_size,
                         false,
+                        false,
+                        error_code.data(),
+                        streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder for lists
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) {
+    DecodePageDataFixed(subpass.pages,
+                        pass.chunks,
+                        num_rows,
+                        skip_rows,
+                        level_type_size,
+                        true,
+                        true,
                         error_code.data(),
                         streams[s_idx++]);
   }
@@ -321,6 +348,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         skip_rows,
                         level_type_size,
                         true,
+                        false,
                         error_code.data(),
                         streams[s_idx++]);
   }
@@ -333,6 +361,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                             skip_rows,
                             level_type_size,
                             false,
+                            false,
+                            error_code.data(),
+                            streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder with dictionaries for lists
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) {
+    DecodePageDataFixedDict(subpass.pages,
+                            pass.chunks,
+                            num_rows,
+                            skip_rows,
+                            level_type_size,
+                            true,
+                            true,
                             error_code.data(),
                             streams[s_idx++]);
   }
@@ -345,6 +387,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                             skip_rows,
                             level_type_size,
                             true,
+                            false,
                             error_code.data(),
                             streams[s_idx++]);
   }
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 62ffc4d3077..3aa9b94ed6b 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -284,7 +284,7 @@ class reader::impl {
    *
    * @return Vector of total string data sizes for each column
    */
-  std::vector<size_t> calculate_page_string_offsets();
+  cudf::detail::host_vector<size_t> calculate_page_string_offsets();
 
   /**
    * @brief Converts the page data and outputs to columns.
diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp
index 3a3cdd34a58..a0c2dbd3e44 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.hpp
+++ b/cpp/src/io/parquet/reader_impl_chunking.hpp
@@ -107,7 +107,7 @@ struct subpass_intermediate_data {
  * rowgroups may represent less than all of the rowgroups to be read for the file.
  */
 struct pass_intermediate_data {
-  std::vector<std::unique_ptr<datasource::buffer>> raw_page_data;
+  std::vector<rmm::device_buffer> raw_page_data;
 
   // rowgroup, chunk and page information for the current pass.
   bool has_compressed_data{false};
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 5138a92ac14..f03f1214b9a 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -218,7 +218,7 @@ void generate_depth_remappings(
  */
 [[nodiscard]] std::future<void> read_column_chunks_async(
   std::vector<std::unique_ptr<datasource>> const& sources,
-  std::vector<std::unique_ptr<datasource::buffer>>& page_data,
+  cudf::host_span<rmm::device_buffer> page_data,
   cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
   size_t begin_chunk,
   size_t end_chunk,
@@ -251,23 +251,24 @@ void generate_depth_remappings(
       if (source->is_device_read_preferred(io_size)) {
         // Buffer needs to be padded.
         // Required by `gpuDecodePageData`.
-        auto buffer =
+        page_data[chunk] =
           rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream);
         auto fut_read_size = source->device_read_async(
-          io_offset, io_size, static_cast<uint8_t*>(buffer.data()), stream);
+          io_offset, io_size, static_cast<uint8_t*>(page_data[chunk].data()), stream);
         read_tasks.emplace_back(std::move(fut_read_size));
-        page_data[chunk] = datasource::buffer::create(std::move(buffer));
       } else {
         auto const read_buffer = source->host_read(io_offset, io_size);
         // Buffer needs to be padded.
         // Required by `gpuDecodePageData`.
-        auto tmp_buffer = rmm::device_buffer(
+        page_data[chunk] = rmm::device_buffer(
           cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(
-          tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream));
-        page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer));
+        CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(),
+                                      read_buffer->data(),
+                                      read_buffer->size(),
+                                      cudaMemcpyDefault,
+                                      stream));
       }
-      auto d_compdata = page_data[chunk]->data();
+      auto d_compdata = static_cast<uint8_t const*>(page_data[chunk].data());
       do {
         chunks[chunk].compressed_data = d_compdata;
         d_compdata += chunks[chunk].compressed_size;
@@ -980,7 +981,7 @@ std::pair<bool, std::future<void>> reader::impl::read_column_chunks()
   std::vector<size_type> chunk_source_map(num_chunks);
 
   // Tracker for eventually deallocating compressed and uncompressed data
-  raw_page_data = std::vector<std::unique_ptr<datasource::buffer>>(num_chunks);
+  raw_page_data = std::vector<rmm::device_buffer>(num_chunks);
 
   // Keep track of column chunk file offsets
   std::vector<size_t> column_chunk_offsets(num_chunks);
@@ -1629,10 +1630,10 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         get_page_nesting_size{
           d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
 
-      // Manually create a int64_t `key_start` compatible counting_transform_iterator to avoid
-      // implicit casting to size_type.
-      auto const reduction_keys = thrust::make_transform_iterator(
-        thrust::make_counting_iterator<size_t>(key_start), get_reduction_key{subpass.pages.size()});
+      // Manually create a size_t `key_start` compatible counting_transform_iterator.
+      auto const reduction_keys =
+        thrust::make_transform_iterator(thrust::make_counting_iterator<std::size_t>(key_start),
+                                        get_reduction_key{subpass.pages.size()});
 
       // Find the size of each column
       thrust::reduce_by_key(rmm::exec_policy_nosync(_stream),
@@ -1695,15 +1696,14 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
     nullmask_bufs, std::numeric_limits<cudf::bitmask_type>::max(), _stream);
 }
 
-std::vector<size_t> reader::impl::calculate_page_string_offsets()
+cudf::detail::host_vector<size_t> reader::impl::calculate_page_string_offsets()
 {
   auto& pass    = *_pass_itm_data;
   auto& subpass = *pass.subpass;
 
   auto page_keys = make_page_key_iterator(subpass.pages);
 
-  std::vector<size_t> col_sizes(_input_columns.size(), 0L);
-  rmm::device_uvector<size_t> d_col_sizes(col_sizes.size(), _stream);
+  rmm::device_uvector<size_t> d_col_sizes(_input_columns.size(), _stream);
 
   // use page_index to fetch page string sizes in the proper order
   auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(),
@@ -1717,7 +1717,7 @@ std::vector<size_t> reader::impl::calculate_page_string_offsets()
                                 page_offset_output_iter{subpass.pages.device_ptr()});
 
   // now sum up page sizes
-  rmm::device_uvector<int> reduce_keys(col_sizes.size(), _stream);
+  rmm::device_uvector<int> reduce_keys(d_col_sizes.size(), _stream);
   thrust::reduce_by_key(rmm::exec_policy_nosync(_stream),
                         page_keys,
                         page_keys + subpass.pages.size(),
@@ -1725,14 +1725,7 @@ std::vector<size_t> reader::impl::calculate_page_string_offsets()
                         reduce_keys.begin(),
                         d_col_sizes.begin());
 
-  cudaMemcpyAsync(col_sizes.data(),
-                  d_col_sizes.data(),
-                  sizeof(size_t) * col_sizes.size(),
-                  cudaMemcpyDeviceToHost,
-                  _stream);
-  _stream.synchronize();
-
-  return col_sizes;
+  return cudf::detail::make_host_vector_sync(d_col_sizes, _stream);
 }
 
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh
index 4a0791d5c54..69e783a89d0 100644
--- a/cpp/src/io/parquet/rle_stream.cuh
+++ b/cpp/src/io/parquet/rle_stream.cuh
@@ -19,6 +19,7 @@
 #include "parquet_gpu.hpp"
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 
 namespace cudf::io::parquet::detail {
 
@@ -216,6 +217,26 @@ struct rle_stream {
     decode_index = -1;  // signals the first iteration. Nothing to decode.
   }
 
+  __device__ inline int get_rle_run_info(rle_run<level_t>& run)
+  {
+    run.start     = cur;
+    run.level_run = get_vlq32(run.start, end);
+
+    // run_bytes starts as the header size: how far get_vlq32 moved run.start beyond cur
+    int run_bytes = run.start - cur;
+    if (is_literal_run(run.level_run)) {
+      // literal run; per the parquet spec these always come in multiples of 8 values.
+      run.size = (run.level_run >> 1) * 8;
+      run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8);
+    } else {
+      // repeated value run: payload is a single value of level_bits, rounded up to whole bytes
+      run.size = (run.level_run >> 1);
+      run_bytes += util::div_rounding_up_unsafe(level_bits, 8);
+    }
+
+    return run_bytes;
+  }
+
   __device__ inline void fill_run_batch()
   {
     // decode_index == -1 means we are on the very first decode iteration for this stream.
@@ -226,31 +247,14 @@ struct rle_stream {
     while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) ||
             fill_index < decode_index + run_buffer_size) &&
            cur < end) {
-      auto& run = runs[rolling_index<run_buffer_size>(fill_index)];
-
       // Encoding::RLE
+      // Pass by reference to fill the runs shared memory with the run data
+      auto& run           = runs[rolling_index<run_buffer_size>(fill_index)];
+      int const run_bytes = get_rle_run_info(run);
 
-      // bytes for the varint header
-      uint8_t const* _cur = cur;
-      int const level_run = get_vlq32(_cur, end);
-      // run_bytes includes the header size
-      int run_bytes = _cur - cur;
-
-      // literal run
-      if (is_literal_run(level_run)) {
-        // from the parquet spec: literal runs always come in multiples of 8 values.
-        run.size = (level_run >> 1) * 8;
-        run_bytes += ((run.size * level_bits) + 7) >> 3;
-      }
-      // repeated value run
-      else {
-        run.size = (level_run >> 1);
-        run_bytes += ((level_bits) + 7) >> 3;
-      }
-      run.output_pos = output_pos;
-      run.start      = _cur;
-      run.level_run  = level_run;
       run.remaining  = run.size;
+      run.output_pos = output_pos;
+
       cur += run_bytes;
       output_pos += run.size;
       fill_index++;
@@ -372,6 +376,39 @@ struct rle_stream {
     return values_processed_shared;
   }
 
+  __device__ inline int skip_runs(int target_count)
+  {
+    // Walk the stream, skipping every run that ends at or before `target_count` values and
+    // stopping at (without consuming) the first run that extends past it. Only `cur` and
+    // `output_pos` advance; each header is parsed into a local `run`, so the `runs` buffer is
+    // left untouched. NOTE(review): this appears intended to leave the stream ready for a first
+    // fill_run_batch() call starting at that run -- confirm against the callers.
+    while (cur < end) {
+      rle_run<level_t> run;
+      int run_bytes = get_rle_run_info(run);
+
+      if ((output_pos + run.size) > target_count) {
+        return output_pos;  // bail! we've reached the starting run
+      }
+
+      // skip this run
+      output_pos += run.size;
+      cur += run_bytes;
+    }
+
+    return output_pos;  // we skipped everything
+  }
+
+  __device__ inline int skip_decode(int t, int count)
+  {
+    int const output_count = min(count, total_values - cur_values);
+
+    // level_bits == 0 means there are no runs to walk, so the clamped count is used directly (a
+    // very common case: columns with no nulls, especially non-nested ones). `t` is unused here.
+    cur_values = (level_bits == 0) ? output_count : skip_runs(output_count);
+    return cur_values;
+  }
+
   __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); }
 };
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 190f13eb688..f865c9a7643 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -183,7 +183,7 @@ struct aggregate_writer_metadata {
     std::vector<RowGroup> row_groups;
     std::vector<KeyValue> key_value_metadata;
     std::vector<OffsetIndex> offset_indexes;
-    std::vector<std::vector<uint8_t>> column_indexes;
+    std::vector<cudf::detail::host_vector<uint8_t>> column_indexes;
   };
   std::vector<per_file_metadata> files;
   std::optional<std::vector<ColumnOrder>> column_orders = std::nullopt;
@@ -1543,12 +1543,7 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
       d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream);
   }
 
-  auto h_chunks = chunks.host_view();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(),
-                                d_chunks.data(),
-                                d_chunks.flat_view().size_bytes(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  chunks.device_to_host_async(stream);
 
   if (comp_stats.has_value()) {
     comp_stats.value() += collect_compression_statistics(comp_in, comp_res, stream);
@@ -2559,12 +2554,11 @@ void writer::impl::write_parquet_data_to_sink(
         } else {
           CUDF_EXPECTS(bounce_buffer.size() >= ck.compressed_size,
                        "Bounce buffer was not properly initialized.");
-          CUDF_CUDA_TRY(cudaMemcpyAsync(bounce_buffer.data(),
-                                        dev_bfr + ck.ck_stat_size,
-                                        ck.compressed_size,
-                                        cudaMemcpyDefault,
-                                        _stream.value()));
-          _stream.synchronize();
+          cudf::detail::cuda_memcpy(
+            host_span{bounce_buffer}.subspan(0, ck.compressed_size),
+            device_span<uint8_t const>{dev_bfr + ck.ck_stat_size, ck.compressed_size},
+            _stream);
+
           _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size);
         }
 
@@ -2600,13 +2594,8 @@ void writer::impl::write_parquet_data_to_sink(
           auto const& column_chunk_meta = row_group.columns[i].meta_data;
 
           // start transfer of the column index
-          std::vector<uint8_t> column_idx;
-          column_idx.resize(ck.column_index_size);
-          CUDF_CUDA_TRY(cudaMemcpyAsync(column_idx.data(),
-                                        ck.column_index_blob,
-                                        ck.column_index_size,
-                                        cudaMemcpyDefault,
-                                        _stream.value()));
+          auto column_idx = cudf::detail::make_host_vector_async(
+            device_span<uint8_t const>{ck.column_index_blob, ck.column_index_size}, _stream);
 
           // calculate offsets while the column index is transferring
           int64_t curr_pg_offset = column_chunk_meta.data_page_offset;
diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu
index badcd3f58f9..06069630685 100644
--- a/cpp/src/io/text/bgzip_data_chunk_source.cu
+++ b/cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -74,8 +74,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
     // Buffer needs to be padded.
     // Required by `inflate_kernel`.
     device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
+    cudf::detail::cuda_memcpy_async<T>(
+      device_span<T>{device}.subspan(0, host.size()), host, stream);
   }
 
   struct decompression_blocks {
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index 58faa0ebfe4..f4a2f29026a 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -22,10 +22,6 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 
-#include <rmm/device_buffer.hpp>
-
-#include <thrust/host_vector.h>
-
 #include <fstream>
 
 namespace cudf::io::text {
@@ -87,8 +83,10 @@ class datasource_chunk_reader : public data_chunk_reader {
       _source->host_read(_offset, read_size, reinterpret_cast<uint8_t*>(h_ticket.buffer.data()));
 
       // copy the host-pinned data on to device
-      CUDF_CUDA_TRY(cudaMemcpyAsync(
-        chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value()));
+      cudf::detail::cuda_memcpy_async<char>(
+        device_span<char>{chunk}.subspan(0, read_size),
+        host_span<char const>{h_ticket.buffer}.subspan(0, read_size),
+        stream);
 
       // record the host-to-device copy.
       CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
@@ -153,8 +151,10 @@ class istream_data_chunk_reader : public data_chunk_reader {
     auto chunk = rmm::device_uvector<char>(read_size, stream);
 
     // copy the host-pinned data on to device
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value()));
+    cudf::detail::cuda_memcpy_async<char>(
+      device_span<char>{chunk}.subspan(0, read_size),
+      host_span<char const>{h_ticket.buffer}.subspan(0, read_size),
+      stream);
 
     // record the host-to-device copy.
     CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
@@ -193,12 +193,10 @@ class host_span_data_chunk_reader : public data_chunk_reader {
     auto chunk = rmm::device_uvector<char>(read_size, stream);
 
     // copy the host data to device
-    CUDF_CUDA_TRY(cudaMemcpyAsync(  //
-      chunk.data(),
-      _data.data() + _position,
-      read_size,
-      cudaMemcpyDefault,
-      stream.value()));
+    cudf::detail::cuda_memcpy_async<char>(
+      cudf::device_span<char>{chunk}.subspan(0, read_size),
+      cudf::host_span<char const>{_data}.subspan(_position, read_size),
+      stream);
 
     _position += read_size;
 
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 249dc3b5875..6d954753af8 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -21,12 +21,12 @@
 
 #include "column_buffer.hpp"
 
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <iomanip>
 #include <sstream>
 
 namespace cudf::io::detail {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index e73b2bc88de..31c8b781e77 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -22,12 +22,9 @@
 #pragma once
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/io/types.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -35,6 +32,8 @@
 
 #include <thrust/pair.h>
 
+#include <optional>
+
 namespace cudf {
 namespace io {
 namespace detail {
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index a3afbd52896..b66742569d9 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -16,11 +16,10 @@
 
 #include "getenv_or.hpp"
 
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <cstdlib>
-#include <sstream>
+#include <kvikio/defaults.hpp>
+
 #include <string>
 
 namespace cudf::io {
@@ -53,6 +52,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_
 
 bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; }
 
+void set_thread_pool_nthreads_from_env()
+{
+  static std::once_flag flag{};
+  std::call_once(flag, [] {
+    auto nthreads = getenv_or<unsigned int>("KVIKIO_NTHREADS", 8U);
+    kvikio::defaults::thread_pool_nthreads_reset(nthreads);
+  });
+}
 }  // namespace cufile_integration
 
 namespace nvcomp_integration {
@@ -81,5 +88,4 @@ bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; }
 bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; }
 
 }  // namespace nvcomp_integration
-
 }  // namespace cudf::io
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index f70171eef68..0c49b2e5d78 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -20,6 +20,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -800,7 +801,7 @@ template <typename string_view_pair_it>
 static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                                             size_type col_size,
                                             rmm::device_buffer&& null_mask,
-                                            rmm::device_scalar<size_type>& d_null_count,
+                                            cudf::detail::device_scalar<size_type>& d_null_count,
                                             cudf::io::parse_options_view const& options,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
@@ -930,7 +931,7 @@ std::unique_ptr<column> parse_data(
   CUDF_FUNC_RANGE();
 
   if (col_size == 0) { return make_empty_column(col_type); }
-  auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
+  auto d_null_count    = cudf::detail::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
   if (null_mask.is_empty()) {
     null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr);
diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index 0b76f3d3e8f..a8a275919d8 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -42,6 +42,7 @@ class file_sink : public data_sink {
     if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); }
 
     if (cufile_integration::is_kvikio_enabled()) {
+      cufile_integration::set_thread_pool_nthreads_from_env();
       _kvikio_file = kvikio::FileHandle(filepath, "w");
       CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 28f7f08521e..9668b30e9a9 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -15,8 +15,10 @@
  */
 
 #include "file_io_utilities.hpp"
+#include "getenv_or.hpp"
 
 #include <cudf/detail/utilities/logger.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/config_utils.hpp>
 #include <cudf/io/datasource.hpp>
@@ -31,7 +33,6 @@
 #include <sys/mman.h>
 #include <unistd.h>
 
-#include <unordered_map>
 #include <vector>
 
 namespace cudf {
@@ -47,6 +48,7 @@ class file_source : public datasource {
   {
     detail::force_init_cuda_context();
     if (cufile_integration::is_kvikio_enabled()) {
+      cufile_integration::set_thread_pool_nthreads_from_env();
       _kvikio_file = kvikio::FileHandle(filepath);
       CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
@@ -245,17 +247,18 @@ class device_buffer_source final : public datasource {
   size_t host_read(size_t offset, size_t size, uint8_t* dst) override
   {
     auto const count  = std::min(size, this->size() - offset);
-    auto const stream = cudf::get_default_stream();
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(dst, _d_buffer.data() + offset, count, cudaMemcpyDefault, stream.value()));
-    stream.synchronize();
+    auto const stream = cudf::detail::global_cuda_stream_pool().get_stream();
+    cudf::detail::cuda_memcpy(host_span<uint8_t>{dst, count},
+                              device_span<uint8_t const>{
+                                reinterpret_cast<uint8_t const*>(_d_buffer.data() + offset), count},
+                              stream);
     return count;
   }
 
   std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
   {
     auto const count  = std::min(size, this->size() - offset);
-    auto const stream = cudf::get_default_stream();
+    auto const stream = cudf::detail::global_cuda_stream_pool().get_stream();
     auto h_data       = cudf::detail::make_host_vector_async(
       cudf::device_span<std::byte const>{_d_buffer.data() + offset, count}, stream);
     stream.synchronize();
@@ -392,14 +395,21 @@ std::unique_ptr<datasource> datasource::create(std::string const& filepath,
                                                size_t offset,
                                                size_t max_size_estimate)
 {
-#ifdef CUFILE_FOUND
-  if (cufile_integration::is_always_enabled()) {
-    // avoid mmap as GDS is expected to be used for most reads
+  auto const use_memory_mapping = [] {
+    auto const policy = getenv_or("LIBCUDF_MMAP_ENABLED", std::string{"ON"});
+
+    if (policy == "ON") { return true; }
+    if (policy == "OFF") { return false; }
+
+    CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy);
+  }();
+
+  if (use_memory_mapping) {
+    return std::make_unique<memory_mapped_source>(filepath.c_str(), offset, max_size_estimate);
+  } else {
+    // `file_source` reads the file directly, without memory mapping
     return std::make_unique<file_source>(filepath.c_str());
   }
-#endif
-  // Use our own memory mapping implementation for direct file reads
-  return std::make_unique<memory_mapped_source>(filepath.c_str(), offset, max_size_estimate);
 }
 
 std::unique_ptr<datasource> datasource::create(host_buffer const& buffer)
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index d7b54399f8d..cf19bc591cc 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -22,8 +22,6 @@
 #include <cudf/detail/utilities/logger.hpp>
 #include <cudf/io/config_utils.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <dlfcn.h>
 
 #include <cerrno>
@@ -110,7 +108,11 @@ class cufile_shim {
 
   ~cufile_shim()
   {
-    if (driver_close != nullptr) driver_close();
+    // Explicit cuFile driver close should not be performed here to avoid segfault. However, in the
+    // absence of driver_close(), cuFile will implicitly do that, which in most cases causes
+    // segfault anyway. TODO: Revisit this conundrum once cuFile is fixed.
+    // https://github.com/rapidsai/cudf/issues/17121
+
     if (cf_lib != nullptr) dlclose(cf_lib);
   }
 
@@ -239,7 +241,7 @@ std::vector<std::future<ResultT>> make_sliced_tasks(
   std::vector<std::future<ResultT>> slice_tasks;
   std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) {
     return pool.submit_task(
-      [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); });
+      [=] { return function(ptr + slice.offset, slice.size, offset + slice.offset); });
   });
   return slice_tasks;
 }
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 7e47b5b3d10..584b6213fa3 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -104,7 +104,7 @@ class cufile_shim;
 /**
  * @brief Class that provides RAII for cuFile file registration.
  */
-struct cufile_registered_file {
+class cufile_registered_file {
   void register_handle();
 
  public:
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index af1ba16a424..f969b45727b 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -176,13 +176,19 @@ class hostdevice_2dvector {
   operator device_2dspan<T const>() const { return {device_span<T const>{_data}, _size.second}; }
 
   device_2dspan<T> device_view() { return static_cast<device_2dspan<T>>(*this); }
-  device_2dspan<T const> device_view() const { return static_cast<device_2dspan<T const>>(*this); }
+  [[nodiscard]] device_2dspan<T const> device_view() const
+  {
+    return static_cast<device_2dspan<T const>>(*this);
+  }
 
   operator host_2dspan<T>() { return {host_span<T>{_data}, _size.second}; }
   operator host_2dspan<T const>() const { return {host_span<T const>{_data}, _size.second}; }
 
   host_2dspan<T> host_view() { return static_cast<host_2dspan<T>>(*this); }
-  host_2dspan<T const> host_view() const { return static_cast<host_2dspan<T const>>(*this); }
+  [[nodiscard]] host_2dspan<T const> host_view() const
+  {
+    return static_cast<host_2dspan<T const>>(*this);
+  }
 
   host_span<T> operator[](size_t row)
   {
@@ -194,16 +200,19 @@ class hostdevice_2dvector {
     return host_span<T const>{_data}.subspan(row * _size.second, _size.second);
   }
 
-  auto size() const noexcept { return _size; }
-  auto count() const noexcept { return _size.first * _size.second; }
-  auto is_empty() const noexcept { return count() == 0; }
+  [[nodiscard]] auto size() const noexcept { return _size; }
+  [[nodiscard]] auto count() const noexcept { return _size.first * _size.second; }
+  [[nodiscard]] auto is_empty() const noexcept { return count() == 0; }
 
   T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); }
   T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); }
 
-  T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); }
+  [[nodiscard]] T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); }
 
-  T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); }
+  [[nodiscard]] T const* base_device_ptr(size_t offset = 0) const
+  {
+    return _data.device_ptr(offset);
+  }
 
   [[nodiscard]] size_t size_bytes() const noexcept { return _data.size_bytes(); }
 
diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp
index c0bbca39167..cf252fe63af 100644
--- a/cpp/src/io/utilities/row_selection.cpp
+++ b/cpp/src/io/utilities/row_selection.cpp
@@ -16,10 +16,7 @@
 
 #include "io/utilities/row_selection.hpp"
 
-#include <cudf/utilities/error.hpp>
-
 #include <algorithm>
-#include <limits>
 
 namespace cudf::io::detail {
 
diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp
index 7c607099cdc..e826feff201 100644
--- a/cpp/src/io/utilities/row_selection.hpp
+++ b/cpp/src/io/utilities/row_selection.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <cstdint>
 #include <optional>
diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu
index 43dc38c4ac6..af32b207d20 100644
--- a/cpp/src/io/utilities/type_inference.cu
+++ b/cpp/src/io/utilities/type_inference.cu
@@ -18,11 +18,10 @@
 #include "io/utilities/string_parsing.hpp"
 #include "io/utilities/trie.cuh"
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <rmm/device_scalar.hpp>
-
 #include <cub/block/block_reduce.cuh>
 
 #include <cstddef>
@@ -242,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
   constexpr int block_size = 128;
 
   auto const grid_size = (size + block_size - 1) / block_size;
-  auto d_column_info   = rmm::device_scalar<cudf::io::column_type_histogram>(stream);
+  auto d_column_info   = cudf::detail::device_scalar<cudf::io::column_type_histogram>(stream);
   CUDF_CUDA_TRY(cudaMemsetAsync(
     d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value()));
 
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index 89c47d246d0..34a0bdce124 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -16,11 +16,8 @@
 
 #include <cudf/utilities/error.hpp>
 
-#include <cuda.h>
-
 #include <jitify2.hpp>
 
-#include <cstddef>
 #include <filesystem>
 
 namespace cudf {
diff --git a/cpp/src/jit/util.cpp b/cpp/src/jit/util.cpp
index 0585e02a031..d9a29203133 100644
--- a/cpp/src/jit/util.cpp
+++ b/cpp/src/jit/util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,8 +19,6 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <string>
-
 namespace cudf {
 namespace jit {
 struct get_data_ptr_functor {
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 2ec23e0dc6d..40d1c925889 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -21,6 +21,7 @@
 
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/expressions.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
@@ -81,7 +82,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = *output_size;
   } else {
     // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar<std::size_t> size(0, stream, mr);
+    cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
     if (has_nulls) {
       compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
         <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
@@ -94,7 +95,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  rmm::device_scalar<std::size_t> write_index(0, stream);
+  cudf::detail::device_scalar<std::size_t> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
 
@@ -197,7 +198,7 @@ conditional_join(table_view const& left,
     join_size = *output_size;
   } else {
     // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar<std::size_t> size(0, stream, mr);
+    cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
     if (has_nulls) {
       compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
         <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
@@ -231,7 +232,7 @@ conditional_join(table_view const& left,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  rmm::device_scalar<std::size_t> write_index(0, stream);
+  cudf::detail::device_scalar<std::size_t> write_index(0, stream);
 
   auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
   auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
@@ -342,7 +343,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
   // Allocate storage for the counter used to get the size of the join output
-  rmm::device_scalar<std::size_t> size(0, stream, mr);
+  cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
 
   // Determine number of output rows without actually building the output to simply
   // find what the size of the output will be.
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index c7294152982..515d28201e8 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -27,7 +27,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <cooperative_groups.h>
diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh
index 84e9be45030..4049ccf35e1 100644
--- a/cpp/src/join/mixed_join_size_kernel.cuh
+++ b/cpp/src/join/mixed_join_size_kernel.cuh
@@ -20,6 +20,7 @@
 
 #include <cudf/ast/detail/expression_evaluator.cuh>
 #include <cudf/ast/detail/expression_parser.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/export.hpp>
@@ -122,7 +123,7 @@ std::size_t launch_compute_mixed_join_output_size(
   rmm::device_async_resource_ref mr)
 {
   // Allocate storage for the counter used to get the size of the join output
-  rmm::device_scalar<std::size_t> size(0, stream, mr);
+  cudf::detail::device_scalar<std::size_t> size(0, stream, mr);
 
   compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, has_nulls>
     <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu
index 59fdbedf089..fb5cf66dd60 100644
--- a/cpp/src/json/json_path.cu
+++ b/cpp/src/json/json_path.cu
@@ -19,6 +19,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
@@ -1031,7 +1032,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
     cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr);
 
   // compute results
-  rmm::device_scalar<size_type> d_valid_count{0, stream};
+  cudf::detail::device_scalar<size_type> d_valid_count{0, stream};
 
   get_json_object_kernel<block_size>
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 17008e80e79..ebab3beb08f 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -834,10 +834,11 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
+  rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr);
+  return detail::partition(t, partition_map, num_partitions, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu
index 5a4c90a67a5..ab6ab393878 100644
--- a/cpp/src/partitioning/round_robin.cu
+++ b/cpp/src/partitioning/round_robin.cu
@@ -273,11 +273,11 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robi
   table_view const& input,
   cudf::size_type num_partitions,
   cudf::size_type start_partition,
+  rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::round_robin_partition(
-    input, num_partitions, start_partition, cudf::get_default_stream(), mr);
+  return detail::round_robin_partition(input, num_partitions, start_partition, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index b0a84a6d50c..d27420658d6 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -1126,12 +1126,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
  * `max` of 0.
  *
  * @param tdv input tdigests. The tdigests within this column are grouped by key.
- * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `group_offsets`.
  * @param group_offsets a device iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `h_group_offsets`.
+ * counted as one even when the cluster is empty in it.
  * @param group_labels a device iterator of the the group label for each tdigest cluster including
  * empty clusters.
  * @param num_group_labels the number of unique group labels.
@@ -1142,9 +1138,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
  *
  * @return A column containing the merged tdigests.
  */
-template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
+template <typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1313,21 +1308,13 @@ std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
 
   if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }
 
-  auto group_offsets_  = group_offsets_fn{input.size()};
-  auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_offsets   = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_labels    = thrust::make_constant_iterator(0);
-  return to_tdigest_scalar(merge_tdigests(tdv,
-                                          h_group_offsets,
-                                          group_offsets,
-                                          group_labels,
-                                          input.size(),
-                                          1,
-                                          max_centroids,
-                                          stream,
-                                          mr),
-                           stream,
-                           mr);
+  auto group_offsets_ = group_offsets_fn{input.size()};
+  auto group_offsets  = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
+  auto group_labels   = thrust::make_constant_iterator(0);
+  return to_tdigest_scalar(
+    merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr),
+    stream,
+    mr);
 }
 
 std::unique_ptr<column> group_tdigest(column_view const& col,
@@ -1376,16 +1363,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
     return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }
 
-  // bring group offsets back to the host
-  std::vector<size_type> h_group_offsets(group_offsets.size());
-  cudaMemcpyAsync(h_group_offsets.data(),
-                  group_offsets.begin(),
-                  sizeof(size_type) * group_offsets.size(),
-                  cudaMemcpyDefault,
-                  stream);
-
   return merge_tdigests(tdv,
-                        h_group_offsets.begin(),
                         group_offsets.data(),
                         group_labels.data(),
                         group_labels.size(),
diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
index a9f86ac1b5f..17844b6bb0a 100644
--- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
+++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu
index 67ea29a2cb1..890625830a5 100644
--- a/cpp/src/reductions/all.cu
+++ b/cpp/src/reductions/all.cu
@@ -16,6 +16,7 @@
 
 #include "simple.cuh"
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -65,7 +66,8 @@ struct all_fn {
         cudf::dictionary::detail::make_dictionary_pair_iterator<T>(*d_dict, input.has_nulls());
       return thrust::make_transform_iterator(pair_iter, null_iter);
     }();
-    auto d_result = rmm::device_scalar<int32_t>(1, stream, cudf::get_current_device_resource_ref());
+    auto d_result =
+      cudf::detail::device_scalar<int32_t>(1, stream, cudf::get_current_device_resource_ref());
     thrust::for_each_n(rmm::exec_policy(stream),
                        thrust::make_counting_iterator<size_type>(0),
                        input.size(),
diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu
index 057f038c622..d70da369d72 100644
--- a/cpp/src/reductions/any.cu
+++ b/cpp/src/reductions/any.cu
@@ -16,6 +16,7 @@
 
 #include "simple.cuh"
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -65,7 +66,8 @@ struct any_fn {
         cudf::dictionary::detail::make_dictionary_pair_iterator<T>(*d_dict, input.has_nulls());
       return thrust::make_transform_iterator(pair_iter, null_iter);
     }();
-    auto d_result = rmm::device_scalar<int32_t>(0, stream, cudf::get_current_device_resource_ref());
+    auto d_result =
+      cudf::detail::device_scalar<int32_t>(0, stream, cudf::get_current_device_resource_ref());
     thrust::for_each_n(rmm::exec_policy(stream),
                        thrust::make_counting_iterator<size_type>(0),
                        input.size(),
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 139de068050..4f6eb23ce5b 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -17,6 +17,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/device_operators.cuh>
@@ -69,18 +70,18 @@ struct minmax_pair {
  * @param num_items number of items to reduce
  * @param binary_op binary operator used to reduce
  * @param stream CUDA stream to run kernels on.
- * @return rmm::device_scalar<OutputType>
+ * @return cudf::detail::device_scalar<OutputType>
  */
 template <typename Op,
           typename InputIterator,
           typename OutputType = typename thrust::iterator_value<InputIterator>::type>
-rmm::device_scalar<OutputType> reduce_device(InputIterator d_in,
-                                             size_type num_items,
-                                             Op binary_op,
-                                             rmm::cuda_stream_view stream)
+auto reduce_device(InputIterator d_in,
+                   size_type num_items,
+                   Op binary_op,
+                   rmm::cuda_stream_view stream)
 {
   OutputType identity{};
-  rmm::device_scalar<OutputType> result{identity, stream};
+  cudf::detail::device_scalar<OutputType> result{identity, stream};
 
   // Allocate temporary storage
   size_t storage_bytes = 0;
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index d187375b69f..75ebc078930 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -26,8 +26,6 @@
 #include <cudf/reduction/detail/histogram.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/structs/structs_column_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp
index d3c0b54f286..b91ae19b51a 100644
--- a/cpp/src/reductions/scan/scan.cpp
+++ b/cpp/src/reductions/scan/scan.cpp
@@ -14,13 +14,10 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/scan.hpp>
 #include <cudf/reduction.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index 40d1d8a0a53..c4f6c135dde 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -13,16 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cudf/column/column.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 1df1549432f..d0e3358cc34 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -20,6 +20,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
@@ -137,7 +138,7 @@ struct replace_nulls_column_kernel_forwarder {
     auto device_out         = cudf::mutable_column_device_view::create(output_view, stream);
     auto device_replacement = cudf::column_device_view::create(replacement, stream);
 
-    rmm::device_scalar<cudf::size_type> valid_counter(0, stream);
+    cudf::detail::device_scalar<cudf::size_type> valid_counter(0, stream);
     cudf::size_type* valid_count = valid_counter.data();
 
     replace<<<grid.num_blocks, BLOCK_SIZE, 0, stream.value()>>>(
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
index 86ec8cfc91e..0cc97ca05e0 100644
--- a/cpp/src/replace/replace.cu
+++ b/cpp/src/replace/replace.cu
@@ -37,6 +37,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/concatenate.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/replace.hpp>
@@ -53,7 +54,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -182,7 +182,7 @@ struct replace_kernel_forwarder {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    rmm::device_scalar<cudf::size_type> valid_counter(0, stream);
+    cudf::detail::device_scalar<cudf::size_type> valid_counter(0, stream);
     cudf::size_type* valid_count = valid_counter.data();
 
     auto replace = [&] {
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
index 72c23395a93..7cad31c0658 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
@@ -18,13 +18,10 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
-#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/groupby.hpp>
 #include <cudf/reduction/detail/reduction.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf::detail {
diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp
index 8a53e937f98..77cb2a8c7f5 100644
--- a/cpp/src/rolling/detail/range_window_bounds.hpp
+++ b/cpp/src/rolling/detail/range_window_bounds.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,7 @@
 #pragma once
 
 #include <cudf/rolling/range_window_bounds.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/wrappers/durations.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh
index 528700137bf..bc0ee2eb519 100644
--- a/cpp/src/rolling/detail/rolling.cuh
+++ b/cpp/src/rolling/detail/rolling.cuh
@@ -33,6 +33,7 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
 #include <cudf/detail/unary.hpp>
@@ -49,7 +50,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/std/climits>
@@ -1105,7 +1105,7 @@ struct rolling_window_launcher {
       auto const d_inp_ptr         = column_device_view::create(input, stream);
       auto const d_default_out_ptr = column_device_view::create(default_outputs, stream);
       auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream);
-      auto d_valid_count   = rmm::device_scalar<size_type>{0, stream};
+      auto d_valid_count   = cudf::detail::device_scalar<size_type>{0, stream};
 
       auto constexpr block_size = 256;
       auto const grid           = cudf::detail::grid_1d(input.size(), block_size);
@@ -1271,7 +1271,7 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
     udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr);
 
   auto output_view = output->mutable_view();
-  rmm::device_scalar<size_type> device_valid_count{0, stream};
+  cudf::detail::device_scalar<size_type> device_valid_count{0, stream};
 
   std::string kernel_name =
     jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new")  //
diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp
index 69792136c64..7f698dfcd6b 100644
--- a/cpp/src/rolling/range_window_bounds.cpp
+++ b/cpp/src/rolling/range_window_bounds.cpp
@@ -19,7 +19,6 @@
 #include <cudf/rolling/range_window_bounds.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/wrappers/durations.hpp>
 
 namespace cudf {
 namespace {
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 8988d73fb02..332c440aea9 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -358,10 +358,11 @@ std::unique_ptr<column> round(column_view const& input,
 std::unique_ptr<column> round(column_view const& input,
                               int32_t decimal_places,
                               rounding_method method,
+                              rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::round(input, decimal_places, method, cudf::get_default_stream(), mr);
+  return detail::round(input, decimal_places, method, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 31535198c58..4ec2174a96f 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -26,8 +26,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 #include <string>
 
 namespace cudf {
diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp
index 656fe61fbbe..9f242bdffe0 100644
--- a/cpp/src/scalar/scalar_factories.cpp
+++ b/cpp/src/scalar/scalar_factories.cpp
@@ -16,10 +16,8 @@
 
 #include <cudf/detail/copy.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 4c015f3cbed..6a7c8ea45e9 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -17,6 +17,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
@@ -348,7 +349,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // This check incurs ~20% performance hit for smaller strings and so we only use it
   // after the threshold check above. The check makes very little impact for long strings
   // but results in a large performance gain when the input contains only single-byte characters.
-  rmm::device_scalar<int64_t> mb_count(0, stream);
+  cudf::detail::device_scalar<int64_t> mb_count(0, stream);
   // cudf::detail::grid_1d is limited to size_type elements
   auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size);
   // we only need to check every other byte since either will contain high bit
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 0db1adf1223..f5d052c6657 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -16,6 +16,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/convert/convert_durations.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
@@ -152,12 +153,8 @@ struct format_compiler {
     }
 
     // create program in device memory
-    d_items.resize(items.size(), stream);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(),
-                                  items.data(),
-                                  items.size() * sizeof(items[0]),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    d_items = cudf::detail::make_device_uvector_sync(
+      items, stream, cudf::get_current_device_resource_ref());
   }
 
   format_item const* compiled_format_items() { return d_items.data(); }
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 1d9d12686eb..9e4ef47ff79 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -27,7 +28,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/advance.h>
@@ -242,7 +242,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   }
 
   {  // Copy offsets columns with single kernel launch
-    rmm::device_scalar<size_type> d_valid_count(0, stream);
+    cudf::detail::device_scalar<size_type> d_valid_count(0, stream);
 
     constexpr size_type block_size{256};
     cudf::detail::grid_1d config(offsets_count, block_size);
diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp
index d1990733e81..60ad714dfec 100644
--- a/cpp/src/strings/regex/regexec.cpp
+++ b/cpp/src/strings/regex/regexec.cpp
@@ -24,7 +24,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-#include <algorithm>
 #include <functional>
 #include <numeric>
 
diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu
index 8a8001dd81a..957075017ba 100644
--- a/cpp/src/strings/replace/find_replace.cu
+++ b/cpp/src/strings/replace/find_replace.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/replace.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -21,7 +22,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 352d883bdc5..88f343926c9 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -334,7 +334,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   // Count the number of targets in the entire column.
   // Note this may over-count in the case where a target spans adjacent strings.
-  rmm::device_scalar<int64_t> d_count(0, stream);
+  cudf::detail::device_scalar<int64_t> d_count(0, stream);
   auto const num_blocks = util::div_rounding_up_safe(
     util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
   count_targets<<<num_blocks, block_size, 0, stream.value()>>>(fn, chars_bytes, d_count.data());
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 16df0dbabdf..52ddef76c1a 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -285,7 +285,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   // Count the number of targets in the entire column.
   // Note this may over-count in the case where a target spans adjacent strings.
-  rmm::device_scalar<int64_t> d_target_count(0, stream);
+  cudf::detail::device_scalar<int64_t> d_target_count(0, stream);
   constexpr int64_t block_size         = 512;
   constexpr size_type bytes_per_thread = 4;
   auto const num_blocks                = util::div_rounding_up_safe(
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 81aca001d53..4b777be9d5b 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -17,6 +17,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/algorithm.cuh>
@@ -361,7 +362,7 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
 
   // count the number of delimiters in the entire column
-  rmm::device_scalar<int64_t> d_count(0, stream);
+  cudf::detail::device_scalar<int64_t> d_count(0, stream);
   if (chars_bytes > 0) {
     constexpr int64_t block_size         = 512;
     constexpr size_type bytes_per_thread = 4;
diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp
index 219d1174d42..1cc405234b2 100644
--- a/cpp/src/strings/strings_scalar_factories.cpp
+++ b/cpp/src/strings/strings_scalar_factories.cpp
@@ -16,7 +16,6 @@
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp
index b0284e9cb96..e14142a9ad1 100644
--- a/cpp/src/structs/structs_column_view.cpp
+++ b/cpp/src/structs/structs_column_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index 5df9943303d..4012ee3d21c 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -21,13 +21,10 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/structs/structs_column_view.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp
index cb707c94288..41c64c6decb 100644
--- a/cpp/src/table/table.cpp
+++ b/cpp/src/table/table.cpp
@@ -18,7 +18,6 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index 8a5340dc20d..659beb749af 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -20,10 +20,7 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 #include <algorithm>
-#include <cassert>
 #include <vector>
 
 namespace cudf {
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index df25950e6d5..89ca8a089d6 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -19,6 +19,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
@@ -221,7 +222,7 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   // To minimize memory, count the number of characters so we can
   // build the output offsets without an intermediate buffer.
   // In the worst case each byte is a character so the output is 4x the input.
-  rmm::device_scalar<int64_t> d_count(0, stream);
+  cudf::detail::device_scalar<int64_t> d_count(0, stream);
   auto const num_blocks = cudf::util::div_rounding_up_safe(
     cudf::util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)),
     block_size);
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 52b96bc9039..b919ac16956 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -23,8 +23,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp
index 53ca0608170..d979bda41d0 100644
--- a/cpp/src/utilities/cuda.cpp
+++ b/cpp/src/utilities/cuda.cpp
@@ -18,8 +18,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <cuda_runtime.h>
-
 namespace cudf::detail {
 
 cudf::size_type num_multiprocessors()
diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp
index 9d8e3cf2fa6..e30806a5011 100644
--- a/cpp/src/utilities/host_memory.cpp
+++ b/cpp/src/utilities/host_memory.cpp
@@ -18,7 +18,6 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/cuda_device.hpp>
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
index 58971552758..000526723c4 100644
--- a/cpp/src/utilities/prefetch.cpp
+++ b/cpp/src/utilities/prefetch.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/prefetch.hpp>
 
diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 8c29182bfb5..7069b59be26 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -23,7 +23,6 @@
 
 #include <algorithm>
 #include <cstddef>
-#include <memory>
 #include <mutex>
 #include <vector>
 
diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp
index a68dc84e340..c1e71f5f8f9 100644
--- a/cpp/src/utilities/traits.cpp
+++ b/cpp/src/utilities/traits.cpp
@@ -19,8 +19,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/dictionary.hpp>
 
-#include <cuda_runtime.h>
-
 namespace cudf {
 
 namespace {
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
index 3095b342748..84c8529641d 100644
--- a/cpp/src/utilities/type_checks.cpp
+++ b/cpp/src/utilities/type_checks.cpp
@@ -21,8 +21,6 @@
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 #include <algorithm>
 
 namespace cudf {
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a4213dcbe94..6d3d1454462 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -611,7 +611,6 @@ ConfigureTest(
   text/bpe_tests.cpp
   text/edit_distance_tests.cpp
   text/jaccard_tests.cpp
-  text/minhash_tests.cpp
   text/ngrams_tests.cpp
   text/ngrams_tokenize_tests.cpp
   text/normalize_tests.cpp
@@ -711,11 +710,13 @@ ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp ST
 ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_PARTITIONING_TEST streams/partitioning_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing)
 ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_ROUND_TEST streams/round_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index a4bde50a21e..7af88d8aa34 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/ast/expressions.hpp>
@@ -26,14 +25,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
-#include <cudf/types.hpp>
-
-#include <rmm/device_uvector.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -41,7 +34,6 @@
 #include <limits>
 #include <list>
 #include <random>
-#include <type_traits>
 #include <vector>
 
 template <typename T>
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index aa5b49567e6..3bd67001c16 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -26,9 +26,7 @@
 #include <cudf/binaryop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
index 03cc87a1968..e9a2761db4a 100644
--- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp
+++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Copyright 2018-2019 BlazingDB, Inc.
  *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
@@ -19,7 +19,6 @@
 
 #include <tests/binaryop/assert-binops.h>
 #include <tests/binaryop/binop-fixture.hpp>
-#include <tests/binaryop/util/operation.h>
 #include <tests/binaryop/util/runtime_support.h>
 
 #include <cudf/binaryop.hpp>
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index fe221fb1c48..799bf646e52 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -16,7 +16,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/testing_main.hpp>
 
diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp
index ab230ab036e..5570a7d498c 100644
--- a/cpp/tests/column/bit_cast_test.cpp
+++ b/cpp/tests/column/bit_cast_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
@@ -26,8 +25,6 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
-#include <random>
-
 template <typename T, typename T2 = void>
 struct rep_type_impl {
   using type = void;
diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp
index 631f5150829..d700adaebd5 100644
--- a/cpp/tests/column/column_test.cpp
+++ b/cpp/tests/column/column_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp
index 6de9121158b..470437f4112 100644
--- a/cpp/tests/column/column_view_device_span_test.cpp
+++ b/cpp/tests/column/column_view_device_span_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp
index 37ab4b8f387..ad344476332 100644
--- a/cpp/tests/column/column_view_shallow_test.cpp
+++ b/cpp/tests/column/column_view_shallow_test.cpp
@@ -15,9 +15,7 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index 603187f0330..aa9d508b6aa 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -26,11 +26,8 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 #include <thrust/iterator/counting_iterator.h>
 
 class ColumnFactoryTest : public cudf::test::BaseFixture {
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 18140c34abd..aedc498964a 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -34,8 +34,6 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <thrust/iterator/constant_iterator.h>
-
 #include <numeric>
 #include <stdexcept>
 #include <string>
diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp
index cfbd181f944..e1cdfe9beed 100644
--- a/cpp/tests/copying/copy_if_else_nested_tests.cpp
+++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index 25d93da277b..e2133a546e4 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp
index 4124f749012..9c00725d5d2 100644
--- a/cpp/tests/copying/copy_tests.cpp
+++ b/cpp/tests/copying/copy_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp
index 247090aac90..93f71345c5c 100644
--- a/cpp/tests/copying/gather_list_tests.cpp
+++ b/cpp/tests/copying/gather_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp
index 28098878086..795e3f30aa1 100644
--- a/cpp/tests/copying/gather_str_tests.cpp
+++ b/cpp/tests/copying/gather_str_tests.cpp
@@ -16,7 +16,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp
index 1598ab2646a..b2c0f7acc3a 100644
--- a/cpp/tests/copying/gather_struct_tests.cpp
+++ b/cpp/tests/copying/gather_struct_tests.cpp
@@ -17,20 +17,15 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/lists/lists_column_view.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/error.hpp>
 
 #include <rmm/device_buffer.hpp>
 
diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp
index 07ce672b14d..908dcd67673 100644
--- a/cpp/tests/copying/gather_tests.cpp
+++ b/cpp/tests/copying/gather_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp
index 90ff97e7355..b2d64dac7c8 100644
--- a/cpp/tests/copying/get_value_tests.cpp
+++ b/cpp/tests/copying/get_value_tests.cpp
@@ -16,10 +16,8 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
index 4f28ff12941..1f76efdc4c3 100644
--- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
+++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
@@ -16,13 +16,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/gather.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp
index e4b2d319ddf..46516436901 100644
--- a/cpp/tests/copying/reverse_tests.cpp
+++ b/cpp/tests/copying/reverse_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,17 +17,13 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
-#include <thrust/execution_policy.h>
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/tabulate.h>
diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp
index 2f76e3f1fcd..8be5d8c1fbb 100644
--- a/cpp/tests/copying/sample_tests.cpp
+++ b/cpp/tests/copying/sample_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,12 +15,9 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <cudf/column/column.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp
index 42d2e004d6b..23faa6e5b86 100644
--- a/cpp/tests/copying/scatter_list_scalar_tests.cpp
+++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 
 using mask_vector = std::vector<cudf::valid_type>;
 using size_column = cudf::test::fixed_width_column_wrapper<cudf::size_type>;
diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp
index a82860a3eec..1f87fcfcc99 100644
--- a/cpp/tests/copying/scatter_list_tests.cpp
+++ b/cpp/tests/copying/scatter_list_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp
index 78572b0bb37..1d1da8a1b1e 100644
--- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp
+++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp
index c92244d047b..7d88e9af85f 100644
--- a/cpp/tests/copying/scatter_struct_tests.cpp
+++ b/cpp/tests/copying/scatter_struct_tests.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 
 using namespace cudf::test::iterators;
diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp
index 41a753cd0ac..74c04446bdd 100644
--- a/cpp/tests/copying/scatter_tests.cpp
+++ b/cpp/tests/copying/scatter_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
@@ -23,7 +22,6 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/stream_compaction.hpp>
 
 #include <stdexcept>
 
diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp
index 8881fb344a2..a133ae43872 100644
--- a/cpp/tests/copying/segmented_gather_list_tests.cpp
+++ b/cpp/tests/copying/segmented_gather_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/gather.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp
index ff6808d9a79..72a8e7357bc 100644
--- a/cpp/tests/copying/shift_tests.cpp
+++ b/cpp/tests/copying/shift_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
@@ -30,7 +29,6 @@
 
 #include <limits>
 #include <memory>
-#include <stdexcept>
 
 using TestTypes = cudf::test::Types<int32_t>;
 
diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp
index aef0d4ad78a..3868a147fa8 100644
--- a/cpp/tests/copying/slice_tests.cpp
+++ b/cpp/tests/copying/slice_tests.cpp
@@ -22,12 +22,8 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-#include <cudf/wrappers/timestamps.hpp>
 
 #include <array>
 #include <stdexcept>
diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp
index 0905f9babdc..90457f8d74c 100644
--- a/cpp/tests/copying/utility_tests.cpp
+++ b/cpp/tests/copying/utility_tests.cpp
@@ -23,7 +23,6 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp
index 603edb27c7c..44f99adc0e9 100644
--- a/cpp/tests/datetime/datetime_ops_test.cpp
+++ b/cpp/tests/datetime/datetime_ops_test.cpp
@@ -23,14 +23,11 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/datetime.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <thrust/transform.h>
-
 #define XXX false  // stub for null values
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp
index 46bf5468922..ebc8c11e86c 100644
--- a/cpp/tests/dictionary/add_keys_test.cpp
+++ b/cpp/tests/dictionary/add_keys_test.cpp
@@ -24,8 +24,6 @@
 #include <cudf/dictionary/update_keys.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <vector>
-
 struct DictionaryAddKeysTest : public cudf::test::BaseFixture {};
 
 TEST_F(DictionaryAddKeysTest, StringsColumn)
diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp
index 5db0e9fa1e4..dfa3ede5d46 100644
--- a/cpp/tests/dictionary/encode_test.cpp
+++ b/cpp/tests/dictionary/encode_test.cpp
@@ -21,8 +21,6 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 
-#include <vector>
-
 struct DictionaryEncodeTest : public cudf::test::BaseFixture {};
 
 TEST_F(DictionaryEncodeTest, EncodeStringColumn)
diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp
index 18696b66e48..bc7d19201aa 100644
--- a/cpp/tests/dictionary/fill_test.cpp
+++ b/cpp/tests/dictionary/fill_test.cpp
@@ -18,13 +18,10 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
 
-#include <vector>
-
 struct DictionaryFillTest : public cudf::test::BaseFixture {};
 
 TEST_F(DictionaryFillTest, StringsColumn)
diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp
index 25501b4fde7..2774173b80a 100644
--- a/cpp/tests/dictionary/search_test.cpp
+++ b/cpp/tests/dictionary/search_test.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/dictionary/detail/search.hpp>
diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp
index d80f8dee079..8c15d6dbecd 100644
--- a/cpp/tests/dictionary/slice_test.cpp
+++ b/cpp/tests/dictionary/slice_test.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp
index 26badefe698..a5e2db6a005 100644
--- a/cpp/tests/filling/fill_tests.cpp
+++ b/cpp/tests/filling/fill_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp
index 6326765c68b..c856984a4a3 100644
--- a/cpp/tests/filling/repeat_tests.cpp
+++ b/cpp/tests/filling/repeat_tests.cpp
@@ -17,14 +17,11 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/filling.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
@@ -33,7 +30,6 @@
 
 #include <algorithm>
 #include <numeric>
-#include <random>
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
 
diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp
index 0783b4e5bbb..53782c90c26 100644
--- a/cpp/tests/filling/sequence_tests.cpp
+++ b/cpp/tests/filling/sequence_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/filling.hpp>
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index a222289216d..b96c6909e55 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -18,17 +18,14 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/binaryop.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/unary.hpp>
 
 #include <algorithm>
 #include <limits>
 #include <numeric>
-#include <type_traits>
 #include <vector>
 
 using namespace numeric;
diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp
index a79b6a32916..ba456084a7c 100644
--- a/cpp/tests/groupby/collect_list_tests.cpp
+++ b/cpp/tests/groupby/collect_list_tests.cpp
@@ -20,8 +20,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
-
 template <typename V>
 struct groupby_collect_list_test : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp
index 61d2838590b..dfd7eb82c4a 100644
--- a/cpp/tests/groupby/collect_set_tests.cpp
+++ b/cpp/tests/groupby/collect_set_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp
index 26f714632dd..f8cc813e877 100644
--- a/cpp/tests/groupby/correlation_tests.cpp
+++ b/cpp/tests/groupby/correlation_tests.cpp
@@ -25,7 +25,6 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 
 #include <limits>
-#include <vector>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp
index e3eb2da201f..81378bb91e8 100644
--- a/cpp/tests/groupby/covariance_tests.cpp
+++ b/cpp/tests/groupby/covariance_tests.cpp
@@ -23,10 +23,8 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <limits>
-#include <vector>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp
index 5d99d15ae77..df0375d6a09 100644
--- a/cpp/tests/groupby/groupby_test_util.cpp
+++ b/cpp/tests/groupby/groupby_test_util.cpp
@@ -17,8 +17,8 @@
 #include "groupby_test_util.hpp"
 
 #include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/default_stream.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
@@ -27,9 +27,6 @@
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-
-#include <random>
 
 void test_single_agg(cudf::column_view const& keys,
                      cudf::column_view const& values,
diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp
index 755b0c20f17..9d2e613be3e 100644
--- a/cpp/tests/groupby/groupby_test_util.hpp
+++ b/cpp/tests/groupby/groupby_test_util.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,8 @@
 
 #pragma once
 
-#include <cudf/column/column_view.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 enum class force_use_sort_impl : bool { NO, YES };
 
diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 2d447025919..783cfb17e49 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index d86de798844..6195e0179ec 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/dictionary/update_keys.hpp>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp
index 279d71560b4..4481e2dc022 100644
--- a/cpp/tests/groupby/merge_lists_tests.cpp
+++ b/cpp/tests/groupby/merge_lists_tests.cpp
@@ -21,7 +21,6 @@
 
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_view.hpp>
 
diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp
index 9736bb84dd6..1bfba265478 100644
--- a/cpp/tests/groupby/merge_sets_tests.cpp
+++ b/cpp/tests/groupby/merge_sets_tests.cpp
@@ -21,7 +21,6 @@
 
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp
index 7f31bc9089f..f2a50248b4a 100644
--- a/cpp/tests/groupby/rank_scan_tests.cpp
+++ b/cpp/tests/groupby/rank_scan_tests.cpp
@@ -22,8 +22,6 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
-
 using namespace cudf::test::iterators;
 
 template <typename T>
diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp
index 14c9ceb4508..49f9d7cb10a 100644
--- a/cpp/tests/groupby/shift_tests.cpp
+++ b/cpp/tests/groupby/shift_tests.cpp
@@ -21,7 +21,6 @@
 
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 
 template <typename T>
diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp
index 69e518cbf8d..b54adb52496 100644
--- a/cpp/tests/hashing/md5_test.cpp
+++ b/cpp/tests/hashing/md5_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
index c1a6e6ff6e1..b4622f5eb81 100644
--- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
+++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
@@ -17,11 +17,9 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/iterator.cuh>
 #include <cudf/hashing.hpp>
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp
index e28e71442a6..3aa0bda6ae8 100644
--- a/cpp/tests/hashing/sha1_test.cpp
+++ b/cpp/tests/hashing/sha1_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
@@ -137,7 +136,7 @@ TEST_F(SHA1HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error);
 }
 
 TEST_F(SHA1HashTest, StructsUnsupported)
@@ -146,7 +145,7 @@ TEST_F(SHA1HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha1(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp
index 61b584f94df..3f6aeb9d5e6 100644
--- a/cpp/tests/hashing/sha224_test.cpp
+++ b/cpp/tests/hashing/sha224_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
@@ -137,7 +136,7 @@ TEST_F(SHA224HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error);
 }
 
 TEST_F(SHA224HashTest, StructsUnsupported)
@@ -146,7 +145,7 @@ TEST_F(SHA224HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha224(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp
index 8bc47c92c6b..9519e96fbae 100644
--- a/cpp/tests/hashing/sha256_test.cpp
+++ b/cpp/tests/hashing/sha256_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
@@ -136,7 +135,7 @@ TEST_F(SHA256HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error);
 }
 
 TEST_F(SHA256HashTest, StructsUnsupported)
@@ -145,7 +144,7 @@ TEST_F(SHA256HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha256(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp
index 4c79934f98d..9de566b9d9b 100644
--- a/cpp/tests/hashing/sha384_test.cpp
+++ b/cpp/tests/hashing/sha384_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
@@ -155,7 +154,7 @@ TEST_F(SHA384HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error);
 }
 
 TEST_F(SHA384HashTest, StructsUnsupported)
@@ -164,7 +163,7 @@ TEST_F(SHA384HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha384(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp
index 0eb1c60b8fc..95e5245f38e 100644
--- a/cpp/tests/hashing/sha512_test.cpp
+++ b/cpp/tests/hashing/sha512_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
@@ -155,7 +154,7 @@ TEST_F(SHA512HashTest, ListsUnsupported)
 
   auto const input = cudf::table_view({strings_list_col});
 
-  EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error);
 }
 
 TEST_F(SHA512HashTest, StructsUnsupported)
@@ -164,7 +163,7 @@ TEST_F(SHA512HashTest, StructsUnsupported)
   auto struct_col  = cudf::test::structs_column_wrapper{{child_col}};
   auto const input = cudf::table_view({struct_col});
 
-  EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error);
+  EXPECT_THROW(cudf::hashing::sha512(input), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/hashing/xxhash_64_test.cpp b/cpp/tests/hashing/xxhash_64_test.cpp
index ab4ed829681..d8694a72d94 100644
--- a/cpp/tests/hashing/xxhash_64_test.cpp
+++ b/cpp/tests/hashing/xxhash_64_test.cpp
@@ -17,11 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/iterator.cuh>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/hashing.hpp>
 
 using NumericTypesNoBools =
diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
index 2151ec6e22f..1ddc33e749a 100644
--- a/cpp/tests/interop/from_arrow_device_test.cpp
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -17,17 +17,13 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp
index ef9936b214c..d93ef28aab8 100644
--- a/cpp/tests/interop/from_arrow_host_test.cpp
+++ b/cpp/tests/interop/from_arrow_host_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
@@ -28,7 +27,6 @@
 #include <cudf/copying.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp
index 80a2e4b2ffd..3916025bf22 100644
--- a/cpp/tests/interop/from_arrow_stream_test.cpp
+++ b/cpp/tests/interop/from_arrow_stream_test.cpp
@@ -17,27 +17,14 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/concatenate.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 struct VectorOfArrays {
   std::vector<nanoarrow::UniqueArray> arrays;
   nanoarrow::UniqueSchema schema;
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 6e742b9e4cf..18efae75cb1 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -25,9 +25,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
@@ -37,8 +35,6 @@
 #include <thrust/iterator/counting_iterator.h>
 
 #include <arrow/c/bridge.h>
-#include <nanoarrow/nanoarrow.h>
-#include <nanoarrow/nanoarrow_device.h>
 
 std::unique_ptr<cudf::table> get_cudf_table()
 {
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 7ba586461dc..29aa928c277 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -17,21 +17,15 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp
index fcb4433b42e..fa3aa82fee2 100644
--- a/cpp/tests/interop/to_arrow_host_test.cpp
+++ b/cpp/tests/interop/to_arrow_host_test.cpp
@@ -17,20 +17,14 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index a6aa4b22eca..86295d8efb1 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -19,14 +19,12 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index b265dcf9273..cc1e367d114 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -17,14 +17,12 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
@@ -32,18 +30,12 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/unary.hpp>
 
-#include <thrust/copy.h>
 #include <thrust/execution_policy.h>
-#include <thrust/find.h>
 #include <thrust/iterator/counting_iterator.h>
 
-#include <arrow/io/api.h>
-
 #include <algorithm>
 #include <fstream>
-#include <iostream>
 #include <iterator>
 #include <limits>
 #include <numeric>
diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp
index 3c41f21b0a4..1b85541687a 100644
--- a/cpp/tests/io/file_io_test.cpp
+++ b/cpp/tests/io/file_io_test.cpp
@@ -15,13 +15,10 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <src/io/utilities/file_io_utilities.hpp>
 
-#include <type_traits>
-
 // Base test fixture for tests
 struct CuFileIOTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
index d23acf3ae00..c8c2d18903f 100644
--- a/cpp/tests/io/json/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
@@ -29,7 +28,6 @@
 
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <string>
 
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index cb6716f4a18..b58ca56e066 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -39,8 +39,6 @@
 
 #include <thrust/iterator/constant_iterator.h>
 
-#include <arrow/io/api.h>
-
 #include <fstream>
 #include <limits>
 #include <memory>
@@ -2975,4 +2973,22 @@ TEST_F(JsonReaderTest, JsonDtypeSchema)
                                  cudf::test::debug_output_level::ALL_ERRORS);
 }
 
+TEST_F(JsonReaderTest, LastRecordInvalid)  // last record has an unterminated string value
+{
+  std::string data = R"({"key": "1"}
+    {"key": "})";
+  std::map<std::string, cudf::io::schema_element> schema{{"key", {dtype<cudf::string_view>()}}};
+  auto opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(schema)
+      .lines(true)  // one JSON record per line
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .build();
+  auto const result = cudf::io::read_json(opts);
+  // Row 0 parses to "1"; the malformed row 1 is expected to be null under RECOVER_WITH_NULL.
+  EXPECT_EQ(result.metadata.schema_info[0].name, "key");
+  cudf::test::strings_column_wrapper expected{{"1", ""}, cudf::test::iterators::nulls_at({1})};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}});
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 15682c6ae6b..887d4fa783f 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -15,12 +15,8 @@
  */
 
 #include "io/json/nested_json.hpp"
-#include "io/utilities/hostdevice_vector.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/random.hpp>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
@@ -29,9 +25,9 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
+#include <numeric>
 #include <stack>
 #include <string>
 #include <unordered_map>
diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh
index 9383797d91b..c31bb2d24e0 100644
--- a/cpp/tests/io/json/json_utils.cuh
+++ b/cpp/tests/io/json/json_utils.cuh
@@ -52,6 +52,7 @@ std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
                                                                 reader_opts.get_compression(),
                                                                 reader_opts.get_byte_range_offset(),
                                                                 reader_opts.get_byte_range_size(),
+                                                                reader_opts.get_delimiter(),
                                                                 stream);
     // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the
     // return type of that function is size_type. However, when the chunk_size is
diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp
index f32aba0e632..e0e955c4f48 100644
--- a/cpp/tests/io/json/nested_json_test.cpp
+++ b/cpp/tests/io/json/nested_json_test.cpp
@@ -21,24 +21,16 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/io_metadata_utilities.hpp>
 #include <cudf_test/random.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/io/parquet.hpp>
 #include <cudf/io/types.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/copy.h>
 #include <thrust/iterator/zip_iterator.h>
 
 #include <string>
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index 8ad1fea649d..5f1aea71f73 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow)
   int64_t constexpr total_rows  = num_rows * num_reps;
   static_assert(total_rows > std::numeric_limits<cudf::size_type>::max());
 
-  auto const it  = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) {
-    return (i % num_rows) % static_cast<int64_t>(std::numeric_limits<data_type>::max() / 2);
-  });
-  auto const col = data_col(it, it + num_rows);
+  auto const it = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<int64_t>(0), [num_rows](int64_t i) {
+      return (i % num_rows) % static_cast<int64_t>(std::numeric_limits<data_type>::max() / 2);
+    });
+  auto const col         = data_col(it, it + num_rows);
   auto const chunk_table = cudf::table_view{{col}};
 
   std::vector<char> data_buffer;
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index cce0adbf317..fce99187516 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -31,7 +31,6 @@
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp
index c90b81ed27a..d66aa3bde9d 100644
--- a/cpp/tests/io/parquet_common.hpp
+++ b/cpp/tests/io/parquet_common.hpp
@@ -22,13 +22,11 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/io/datasource.hpp>
-#include <cudf/io/parquet_metadata.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
 #include <src/io/parquet/compact_protocol_reader.hpp>
 #include <src/io/parquet/parquet.hpp>
-#include <src/io/parquet/parquet_gpu.hpp>
 
 #include <random>
 #include <type_traits>
diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp
index f1286a00d22..d66f685cd9c 100644
--- a/cpp/tests/io/parquet_misc_test.cpp
+++ b/cpp/tests/io/parquet_misc_test.cpp
@@ -20,8 +20,6 @@
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/io/parquet.hpp>
-#include <cudf/stream_compaction.hpp>
-#include <cudf/transform.hpp>
 
 #include <array>
 
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index ab4645c2e25..177e6163d4f 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -29,6 +29,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
 
+#include <src/io/parquet/parquet_gpu.hpp>
+
 #include <array>
 
 TEST_F(ParquetReaderTest, UserBounds)
@@ -2725,7 +2727,9 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table);
 }
 
-TEST_F(ParquetReaderTest, ListsWideTable)
+// The test below requires several minutes to complete with memcheck, thus it is disabled by
+// default.
+TEST_F(ParquetReaderTest, DISABLED_ListsWideTable)
 {
   auto constexpr num_rows = 2;
   auto constexpr num_cols = 26'755;  // for slightly over 2B keys
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index be2ecd56424..5c3c8342cd2 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 
 // NOTE: this file exists to define the parquet test's `main()` function.
diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp
index ebadd870091..c40d3bbd299 100644
--- a/cpp/tests/io/row_selection_test.cpp
+++ b/cpp/tests/io/row_selection_test.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <src/io/utilities/row_selection.hpp>
diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp
index 6f46df20633..79ce908f3e0 100644
--- a/cpp/tests/io/text/data_chunk_source_test.cpp
+++ b/cpp/tests/io/text/data_chunk_source_test.cpp
@@ -15,14 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 #include <fstream>
 #include <random>
 
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 74d08061df9..60244462e2c 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -19,16 +19,12 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/concatenate.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/multibyte_split.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 using cudf::test::strings_column_wrapper;
diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp
index 22bc7475dbe..f7f7c0f2721 100644
--- a/cpp/tests/iterator/value_iterator.cpp
+++ b/cpp/tests/iterator/value_iterator.cpp
@@ -13,7 +13,6 @@
  * the License.
  */
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp
index 6f9dfd06730..c9bb691907a 100644
--- a/cpp/tests/jit/parse_ptx_function.cpp
+++ b/cpp/tests/jit/parse_ptx_function.cpp
@@ -16,7 +16,6 @@
 
 #include "jit/parser.hpp"
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <algorithm>
diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp
index d87f5e54153..971913443e5 100644
--- a/cpp/tests/join/cross_join_tests.cpp
+++ b/cpp/tests/join/cross_join_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp
index 178edc52dd3..9070efa38fe 100644
--- a/cpp/tests/join/distinct_join_tests.cpp
+++ b/cpp/tests/join/distinct_join_tests.cpp
@@ -15,12 +15,8 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/filling.hpp>
@@ -31,7 +27,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <limits>
 #include <vector>
 
 template <typename T>
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index 3431e941359..6a8a54c8465 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -20,17 +20,12 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/structs/utilities.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/join.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp
index 554d5754e39..ddc65c3f379 100644
--- a/cpp/tests/join/semi_anti_join_tests.cpp
+++ b/cpp/tests/join/semi_anti_join_tests.cpp
@@ -22,7 +22,6 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/join.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp
index 42a574ac5c0..53166e04173 100644
--- a/cpp/tests/json/json_tests.cpp
+++ b/cpp/tests/json/json_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/json/json.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index 7b61be113f9..f1404990354 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -16,12 +16,10 @@
 
 #include "large_strings_fixture.hpp"
 
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
index 007c08ce0fb..f47782a2d02 100644
--- a/cpp/tests/large_strings/parquet_tests.cpp
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -16,8 +16,6 @@
 
 #include "large_strings_fixture.hpp"
 
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/io/parquet.hpp>
diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
index 8fb2b403051..7ae7a6a7414 100644
--- a/cpp/tests/lists/contains_tests.cpp
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/lists/contains.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 
diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp
index 92dd5df5ec7..2c24f695c29 100644
--- a/cpp/tests/lists/extract_tests.cpp
+++ b/cpp/tests/lists/extract_tests.cpp
@@ -21,12 +21,8 @@
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/extract.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp
index 74545903eb3..dcb906cd2ef 100644
--- a/cpp/tests/lists/sequences_tests.cpp
+++ b/cpp/tests/lists/sequences_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp
index 5625b47e7ea..18aa118bb81 100644
--- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp
+++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp
@@ -20,8 +20,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/lists/extract.hpp>
 #include <cudf/lists/stream_compaction.hpp>
 
 namespace cudf::test {
diff --git a/cpp/tests/merge/merge_dictionary_test.cpp b/cpp/tests/merge/merge_dictionary_test.cpp
index dd528c19e4e..1d7a31fd797 100644
--- a/cpp/tests/merge/merge_dictionary_test.cpp
+++ b/cpp/tests/merge/merge_dictionary_test.cpp
@@ -17,9 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 
-#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/merge.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index bea044496b3..d9fdb6099f0 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -17,10 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/merge.hpp>
 #include <cudf/table/table.hpp>
@@ -30,10 +28,6 @@
 
 #include <gtest/gtest.h>
 
-#include <algorithm>
-#include <cassert>
-#include <initializer_list>
-#include <limits>
 #include <memory>
 #include <vector>
 
diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp
index 6208d395f0a..fad390105d7 100644
--- a/cpp/tests/merge/merge_test.cpp
+++ b/cpp/tests/merge/merge_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
@@ -34,7 +33,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/merge.h>
 
 #include <vector>
 
diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp
index 89d23c39dca..3693cfbcc72 100644
--- a/cpp/tests/partitioning/round_robin_test.cpp
+++ b/cpp/tests/partitioning/round_robin_test.cpp
@@ -17,10 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/partitioning.hpp>
 #include <cudf/table/table.hpp>
@@ -30,12 +28,7 @@
 
 #include <gtest/gtest.h>
 
-#include <algorithm>
-#include <cassert>
-#include <initializer_list>
-#include <limits>
 #include <memory>
-#include <numeric>
 #include <vector>
 
 using cudf::test::fixed_width_column_wrapper;
diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp
index 6e88365b6e8..23b58618fe1 100644
--- a/cpp/tests/quantiles/quantile_test.cpp
+++ b/cpp/tests/quantiles/quantile_test.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/quantiles.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
 #include <limits>
diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp
index 44d4ec61852..c7e11af8c85 100644
--- a/cpp/tests/quantiles/quantiles_test.cpp
+++ b/cpp/tests/quantiles/quantiles_test.cpp
@@ -16,7 +16,6 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp
index 09cec688509..1117b0d1acf 100644
--- a/cpp/tests/reductions/ewm_tests.cpp
+++ b/cpp/tests/reductions/ewm_tests.cpp
@@ -18,9 +18,7 @@
 
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
 
 template <typename T>
diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp
index f5470f7d881..cb412f1e925 100644
--- a/cpp/tests/reductions/list_rank_test.cpp
+++ b/cpp/tests/reductions/list_rank_test.cpp
@@ -14,14 +14,9 @@
  * limitations under the License.
  */
 
-#include <benchmarks/common/generate_input.hpp>
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/filling.hpp>
 #include <cudf/reduction.hpp>
 
 struct ListRankScanTest : public cudf::test::BaseFixture {
@@ -136,7 +131,7 @@ TEST_F(ListRankScanTest, ListOfStruct)
      false,
      false}};
   auto col2 = cudf::test::strings_column_wrapper{
-    {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"},
+    {"x", "x", "a", "a", "b", "", "a", "b", "a", "b", "a", "c", "a", "c", "", "", "b", "b"},
     {true,
      true,
      true,
diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp
index 3ab1fc01eaa..130458548fc 100644
--- a/cpp/tests/reductions/rank_tests.cpp
+++ b/cpp/tests/reductions/rank_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
@@ -126,7 +125,7 @@ auto make_input_column()
 {
   if constexpr (std::is_same_v<TypeParam, cudf::string_view>) {
     return cudf::test::strings_column_wrapper{
-      {"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"},
+      {"0", "0", "4", "4", "4", "", "7", "7", "7", "9", "9", "9"},
       cudf::test::iterators::null_at(5)};
   } else {
     using fw_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam>;
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index bdb98372836..67083f19b3a 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -22,9 +22,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar.hpp>
@@ -33,11 +31,9 @@
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <thrust/copy.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
-#include <iostream>
 #include <iterator>
 #include <vector>
 
@@ -1259,6 +1255,12 @@ TEST_P(StringReductionTest, MinMax)
   // data and valid arrays
   std::vector<std::string> host_strings(GetParam());
   std::vector<bool> host_bools({true, false, true, true, true, true, false, false, true});
+  std::transform(thrust::counting_iterator<std::size_t>(0),
+                 thrust::counting_iterator<std::size_t>(host_strings.size()),
+                 host_strings.begin(),
+                 [host_strings, host_bools](auto idx) {
+                   return host_bools[idx] ? host_strings[idx] : std::string{};
+                 });
   bool succeed(true);
   std::string initial_value = "init";
 
@@ -1385,7 +1387,7 @@ TEST_F(StringReductionTest, AllNull)
   std::vector<std::string> host_strings(
     {"one", "two", "three", "four", "five", "six", "seven", "eight", "nine"});
   std::vector<bool> host_bools(host_strings.size(), false);
-  auto initial_value = cudf::make_string_scalar("init");
+  auto initial_value = cudf::make_string_scalar("");
   initial_value->set_valid_async(false);
 
   // string column with nulls
@@ -3086,21 +3088,28 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls)
   using cudf::test::iterators::null_at;
   using cudf::test::iterators::nulls_at;
 
-  // `null` means null at child column.
-  // `NULL` means null at parent column.
   auto const input = [] {
     auto child1 = STRINGS_CW{{"año",
                               "bit",
-                              "₹1" /*null*/,
-                              "aaa" /*NULL*/,
+                              "",     // child null
+                              "aaa",  // parent null
                               "zit",
                               "bat",
                               "aab",
-                              "$1" /*null*/,
-                              "€1" /*NULL*/,
+                              "",    // child null
+                              "€1",  // parent null
                               "wut"},
                              nulls_at({2, 7})};
-    auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10},
+    auto child2 = INTS_CW{{1,
+                           2,
+                           0,  // child null
+                           4,  // parent null
+                           5,
+                           6,
+                           7,
+                           0,  // child null
+                           9,  // parent null
+                           10},
                           nulls_at({2, 7})};
     return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
   }();
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index c4463d68a68..5f911597b02 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -20,13 +20,11 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/reduction.hpp>
 
 #include <thrust/host_vector.h>
-#include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
 
 #include <algorithm>
@@ -414,12 +412,13 @@ TEST_F(ScanStringsTest, MoreStringsMinMax)
 {
   int row_count = 512;
 
-  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [](auto idx) {
+  auto validity = cudf::detail::make_counting_transform_iterator(
+    0, [](auto idx) -> bool { return (idx % 23) != 22; });
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [validity](auto idx) {
+    if (validity[idx] == 0) return std::string{};
     char const s = static_cast<char>('a' + (idx % 26));
     return std::string{1, s};
   });
-  auto validity   = cudf::detail::make_counting_transform_iterator(
-    0, [](auto idx) -> bool { return (idx % 23) != 22; });
   cudf::test::strings_column_wrapper col(data_begin, data_begin + row_count, validity);
 
   thrust::host_vector<std::string> v(data_begin, data_begin + row_count);
@@ -622,21 +621,28 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls)
   using cudf::test::iterators::null_at;
   using cudf::test::iterators::nulls_at;
 
-  // `null` means null at child column.
-  // `NULL` means null at parent column.
   auto const input = [] {
     auto child1 = STRINGS_CW{{"año",
                               "bit",
-                              "₹1" /*null*/,
-                              "aaa" /*NULL*/,
+                              "",     // child null
+                              "aaa",  // parent null
                               "zit",
                               "bat",
                               "aab",
-                              "$1" /*null*/,
-                              "€1" /*NULL*/,
+                              "",    // child null
+                              "€1",  // parent null
                               "wut"},
                              nulls_at({2, 7})};
-    auto child2 = INTS_CW{{1, 2, 3 /*null*/, 4 /*NULL*/, 5, 6, 7, 8 /*null*/, 9 /*NULL*/, 10},
+    auto child2 = INTS_CW{{1,
+                           2,
+                           0,  // child null
+                           4,  // parent null
+                           5,
+                           6,
+                           7,
+                           0,  // child null
+                           9,  // parent null
+                           10},
                           nulls_at({2, 7})};
     return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
   }();
@@ -694,25 +700,25 @@ TEST_F(StructScanTest, StructScanMinMaxWithNulls)
     auto const expected = [] {
       auto child1 = STRINGS_CW{{"año",
                                 "año",
-                                "" /*null*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/,
-                                "" /*NULL*/},
+                                "",   // child null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                "",   // parent null
+                                ""},  // parent null
                                null_at(2)};
       auto child2 = INTS_CW{{1,
                              1,
-                             0 /*null*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/,
-                             0 /*NULL*/},
+                             0,   // child null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0,   // parent null
+                             0},  // parent null
                             null_at(2)};
       return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})};
     }();
diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp
index 858697d8ef5..c2cce4bbbfa 100644
--- a/cpp/tests/reductions/scan_tests.hpp
+++ b/cpp/tests/reductions/scan_tests.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,9 +20,7 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/strings/string_view.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
@@ -30,7 +28,6 @@
 
 #include <initializer_list>
 #include <type_traits>
-#include <vector>
 
 template <typename T>
 struct TypeParam_to_host_type {
diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp
index 239c9ce6ddd..e972ea35ed0 100644
--- a/cpp/tests/replace/clamp_test.cpp
+++ b/cpp/tests/replace/clamp_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp
index 2de17388ee8..c35f385329a 100644
--- a/cpp/tests/replace/normalize_replace_tests.cpp
+++ b/cpp/tests/replace/normalize_replace_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/replace.hpp>
 
 // This is the main test fixture
diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp
index 35232204db7..1b9fe92066a 100644
--- a/cpp/tests/replace/replace_nans_tests.cpp
+++ b/cpp/tests/replace/replace_nans_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index fcee27305f2..0c8ccea52a6 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -20,13 +20,11 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/dictionary/encode.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp
index b12bf08520f..ae4041bcfaf 100644
--- a/cpp/tests/replace/replace_tests.cpp
+++ b/cpp/tests/replace/replace_tests.cpp
@@ -20,20 +20,16 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/encode.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <thrust/host_vector.h>
-#include <thrust/iterator/transform_iterator.h>
 
 #include <gtest/gtest.h>
 
diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp
index b3d9b2e2f5f..59585c0e947 100644
--- a/cpp/tests/reshape/byte_cast_tests.cpp
+++ b/cpp/tests/reshape/byte_cast_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/reshape.hpp>
diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp
index ed76b9d2ea5..25cfc5c5108 100644
--- a/cpp/tests/reshape/tile_tests.cpp
+++ b/cpp/tests/reshape/tile_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp
index 165e0347785..e8a36d9ab48 100644
--- a/cpp/tests/rolling/collect_ops_test.cpp
+++ b/cpp/tests/rolling/collect_ops_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp
index e7d1e3f0b10..2e1815671a9 100644
--- a/cpp/tests/rolling/empty_input_test.cpp
+++ b/cpp/tests/rolling/empty_input_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,7 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp
index fcfbd0eee78..2cb9b60000b 100644
--- a/cpp/tests/rolling/grouped_rolling_range_test.cpp
+++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,21 +17,16 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/utilities/bit.hpp>
 
-#include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp
index 78d5daf7e83..78b444bcd93 100644
--- a/cpp/tests/rolling/grouped_rolling_test.cpp
+++ b/cpp/tests/rolling/grouped_rolling_test.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp
index de057e96320..6519b0ed4ee 100644
--- a/cpp/tests/rolling/lead_lag_test.cpp
+++ b/cpp/tests/rolling/lead_lag_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -26,7 +25,6 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp
index 2444992e68f..5f2b383ed55 100644
--- a/cpp/tests/rolling/nth_element_test.cpp
+++ b/cpp/tests/rolling/nth_element_test.cpp
@@ -17,22 +17,15 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/rolling.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include <gtest/gtest-typed-test.h>
-
 #include <memory>
 #include <optional>
 
diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp
index 0eaab0c9f7a..dcaa47e722b 100644
--- a/cpp/tests/rolling/offset_row_window_test.cpp
+++ b/cpp/tests/rolling/offset_row_window_test.cpp
@@ -17,14 +17,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/lists/explode.hpp>
 #include <cudf/rolling.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 template <typename T>
 using fwcw = cudf::test::fixed_width_column_wrapper<T>;
diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp
index 461c41025e9..daf5fcc1d96 100644
--- a/cpp/tests/rolling/range_rolling_window_test.cpp
+++ b/cpp/tests/rolling/range_rolling_window_test.cpp
@@ -17,22 +17,17 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/bit.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
 #include <src/rolling/detail/range_window_bounds.hpp>
-#include <src/rolling/detail/rolling.hpp>
 
 #include <utility>
 #include <vector>
diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp
index b77451bf0bc..a67555280f4 100644
--- a/cpp/tests/rolling/range_window_bounds_test.cpp
+++ b/cpp/tests/rolling/range_window_bounds_test.cpp
@@ -15,9 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/rolling/range_window_bounds.hpp>
@@ -25,8 +22,6 @@
 
 #include <src/rolling/detail/range_window_bounds.hpp>
 
-#include <vector>
-
 struct RangeWindowBoundsTest : public cudf::test::BaseFixture {};
 
 template <typename Timestamp>
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index 6e0dc16dca9..72a511fd5f1 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/testing_main.hpp>
@@ -30,7 +29,6 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/rolling.hpp>
-#include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/traits.hpp>
 
diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp
index 5f132f3ace9..26987ea1b7b 100644
--- a/cpp/tests/scalar/factories_test.cpp
+++ b/cpp/tests/scalar/factories_test.cpp
@@ -22,11 +22,8 @@
 
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 class ScalarFactoryTest : public cudf::test::BaseFixture {};
 
 template <typename T>
diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp
index 78f79ccc648..a3bb1dfda10 100644
--- a/cpp/tests/search/search_dictionary_test.cpp
+++ b/cpp/tests/search/search_dictionary_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/search.hpp>
 
diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp
index 7584003e800..fb5d0fcc889 100644
--- a/cpp/tests/search/search_list_test.cpp
+++ b/cpp/tests/search/search_list_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/search.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp
index c35d359e75c..05b9deb3463 100644
--- a/cpp/tests/search/search_struct_test.cpp
+++ b/cpp/tests/search/search_struct_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/search.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp
index 7550cc27161..8d750be5677 100644
--- a/cpp/tests/search/search_test.cpp
+++ b/cpp/tests/search/search_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/search.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp
index 109095192f9..e3c9f8d349e 100644
--- a/cpp/tests/sort/is_sorted_tests.cpp
+++ b/cpp/tests/sort/is_sorted_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp
index e08a2105aea..ded46cb1f31 100644
--- a/cpp/tests/sort/rank_test.cpp
+++ b/cpp/tests/sort/rank_test.cpp
@@ -18,10 +18,8 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp
index 8ab23936ceb..ce4148a941e 100644
--- a/cpp/tests/sort/sort_nested_types_tests.cpp
+++ b/cpp/tests/sort/sort_nested_types_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index 6a35e977b46..e1505c7a474 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -28,7 +28,6 @@
 #include <cudf/types.hpp>
 
 #include <thrust/host_vector.h>
-#include <thrust/sort.h>
 
 #include <type_traits>
 #include <vector>
diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp
index 655166e0d62..88de9d51523 100644
--- a/cpp/tests/sort/stable_sort_tests.cpp
+++ b/cpp/tests/sort/stable_sort_tests.cpp
@@ -25,9 +25,6 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <thrust/host_vector.h>
-#include <thrust/sort.h>
-
 #include <type_traits>
 #include <vector>
 
diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
index 6c0582fb846..1204b019739 100644
--- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
+++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
@@ -20,9 +20,7 @@
 #include <cudf_test/random.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/stream_compaction.hpp>
@@ -31,8 +29,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <thrust/copy.h>
-#include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 
 struct ApplyBooleanMask : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp
index a2dab649961..ee1bb3ead92 100644
--- a/cpp/tests/stream_compaction/distinct_count_tests.cpp
+++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp
@@ -15,16 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp
index 14d7d8789ac..c618ff68cbb 100644
--- a/cpp/tests/stream_compaction/distinct_tests.cpp
+++ b/cpp/tests/stream_compaction/distinct_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
@@ -27,8 +26,6 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cmath>
-
 auto constexpr null{0};  // null at current level
 auto constexpr XXX{0};   // null pushed down from parent level
 auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp
index bf72da5c840..71321361564 100644
--- a/cpp/tests/stream_compaction/drop_nans_tests.cpp
+++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp
@@ -15,12 +15,9 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp
index dbac1d58195..d3b45c2323e 100644
--- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp
+++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp
@@ -15,12 +15,10 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp
index 6c6c53331d4..cc847da6340 100644
--- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp
+++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp
@@ -15,20 +15,16 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cmath>
-
 auto constexpr null{0};  // null at current level
 auto constexpr XXX{0};   // null pushed down from parent level
 auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp
index 640d159fc4f..bad93e92712 100644
--- a/cpp/tests/stream_compaction/unique_count_tests.cpp
+++ b/cpp/tests/stream_compaction/unique_count_tests.cpp
@@ -15,16 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp
index d5b6915b520..e2b32b898b3 100644
--- a/cpp/tests/stream_compaction/unique_tests.cpp
+++ b/cpp/tests/stream_compaction/unique_tests.cpp
@@ -15,22 +15,16 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <algorithm>
-#include <cmath>
-
 using cudf::nan_policy;
 using cudf::null_equality;
 using cudf::null_policy;
diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp
index 2a7b52b1b6b..3dcc6f9e632 100644
--- a/cpp/tests/streams/binaryop_test.cpp
+++ b/cpp/tests/streams/binaryop_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/binaryop.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 
 class BinaryopTest : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp
index 42894a0ebcb..a74ee64f8de 100644
--- a/cpp/tests/streams/io/csv_test.cpp
+++ b/cpp/tests/streams/io/csv_test.cpp
@@ -17,13 +17,9 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/io/csv.hpp>
-#include <cudf/io/detail/csv.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 
 #include <string>
 #include <vector>
diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp
index f98e685ed0c..d352c6c3b2a 100644
--- a/cpp/tests/streams/io/json_test.cpp
+++ b/cpp/tests/streams/io/json_test.cpp
@@ -19,9 +19,7 @@
 #include <cudf_test/default_stream.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp
index b0eff1d3340..5bb17226029 100644
--- a/cpp/tests/streams/io/multibyte_split_test.cpp
+++ b/cpp/tests/streams/io/multibyte_split_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/default_stream.hpp>
 
-#include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/multibyte_split.hpp>
 
diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp
index cc43bf15b5d..10722557e6a 100644
--- a/cpp/tests/streams/io/orc_test.cpp
+++ b/cpp/tests/streams/io/orc_test.cpp
@@ -17,19 +17,11 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/io/detail/orc.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
-#include <cudf/io/orc_types.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 
-#include <iostream>
-#include <random>
-#include <sstream>
 #include <string>
 #include <vector>
 
diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp
index 9d2dec2d697..18bb80e64af 100644
--- a/cpp/tests/streams/io/parquet_test.cpp
+++ b/cpp/tests/streams/io/parquet_test.cpp
@@ -17,13 +17,9 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/io/detail/parquet.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 
 #include <string>
 #include <vector>
diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp
index 2811bb676fa..27bd7e080c9 100644
--- a/cpp/tests/streams/join_test.cpp
+++ b/cpp/tests/streams/join_test.cpp
@@ -19,11 +19,9 @@
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <cstdint>
 #include <memory>
diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp
index e96224003f4..ed37a72545f 100644
--- a/cpp/tests/streams/null_mask_test.cpp
+++ b/cpp/tests/streams/null_mask_test.cpp
@@ -14,15 +14,12 @@
  * limitations under the License.
  */
 
-#include <tests/binaryop/util/runtime_support.h>
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/scalar/scalar.hpp>
 
 class NullMaskTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/partitioning_test.cpp b/cpp/tests/streams/partitioning_test.cpp
new file mode 100644
index 00000000000..636c5c1f1f9
--- /dev/null
+++ b/cpp/tests/streams/partitioning_test.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/partitioning.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+
+#include <cstdint>
+#include <vector>
+
+using cudf::test::fixed_width_column_wrapper;
+using cudf::test::strings_column_wrapper;
+
+// Fixture for the partitioning tests below.
+class PartitionTest : public cudf::test::BaseFixture {};
+
+// Calls cudf::partition on a one-column table holding a struct column, passing the test
+// default stream explicitly. The returned value is not inspected.
+TEST_F(PartitionTest, Struct)
+{
+  fixed_width_column_wrapper<numeric::decimal32, int32_t> A({1, 2}, {0, 1});
+  auto struct_col         = cudf::test::structs_column_wrapper({A}, {0, 1}).release();
+  auto table_to_partition = cudf::table_view{{*struct_col}};
+  fixed_width_column_wrapper<int32_t> map{9, 2};
+
+  auto num_partitions = 12;
+  auto result =
+    cudf::partition(table_to_partition, map, num_partitions, cudf::test::get_default_stream());
+}
+
+// Calls cudf::round_robin_partition on a table whose only column is empty, passing the test
+// default stream explicitly. The returned table and offsets are not inspected.
+TEST_F(PartitionTest, EmptyInput)
+{
+  auto const empty_column    = fixed_width_column_wrapper<int32_t>{};
+  auto const num_partitions  = 5;
+  auto const start_partition = 0;
+  auto const [out_table, out_offsets] =
+    cudf::round_robin_partition(cudf::table_view{{empty_column}},
+                                num_partitions,
+                                start_partition,
+                                cudf::test::get_default_stream());
+}
+
+// Calls cudf::hash_partition with num_partitions == 0, hashing column index 2 (the strings
+// column), on the test default stream. The returned table and offsets are not inspected.
+TEST_F(PartitionTest, ZeroPartitions)
+{
+  fixed_width_column_wrapper<float> floats({1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f});
+  fixed_width_column_wrapper<int16_t> integers({1, 2, 3, 4, 5, 6, 7, 8});
+  strings_column_wrapper strings({"a", "bb", "ccc", "d", "ee", "fff", "gg", "h"});
+  auto input = cudf::table_view({floats, integers, strings});
+
+  auto columns_to_hash = std::vector<cudf::size_type>({2});
+
+  cudf::size_type const num_partitions = 0;
+  auto [output, offsets]               = cudf::hash_partition(input,
+                                                columns_to_hash,
+                                                num_partitions,
+                                                cudf::hash_id::HASH_MURMUR3,
+                                                cudf::DEFAULT_HASH_SEED,
+                                                cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp
index b4f013fc960..9ab972302e4 100644
--- a/cpp/tests/streams/reduction_test.cpp
+++ b/cpp/tests/streams/reduction_test.cpp
@@ -17,11 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp
index b352ad2c0d2..4d9899870b4 100644
--- a/cpp/tests/streams/rolling_test.cpp
+++ b/cpp/tests/streams/rolling_test.cpp
@@ -17,12 +17,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 
 class RollingTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/round_test.cpp b/cpp/tests/streams/round_test.cpp
new file mode 100644
index 00000000000..b8fda022db8
--- /dev/null
+++ b/cpp/tests/streams/round_test.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/round.hpp>
+
+#include <vector>
+
+// Fixture for the cudf::round tests below.
+class RoundTest : public cudf::test::BaseFixture {};
+
+// Calls cudf::round(input, 0, HALF_UP) on the test default stream; the result is discarded.
+// The test name reflects the rounding method actually passed.
+TEST_F(RoundTest, RoundHalfUp)
+{
+  std::vector<double> vals = {1.729, 17.29, 172.9, 1729};
+  cudf::test::fixed_width_column_wrapper<double> input(vals.begin(), vals.end());
+  cudf::round(input, 0, cudf::rounding_method::HALF_UP, cudf::test::get_default_stream());
+}
+
+// Calls cudf::round(input, -1, HALF_EVEN) on the test default stream; the result is discarded.
+// The test name reflects the rounding method actually passed.
+TEST_F(RoundTest, RoundHalfEven)
+{
+  std::vector<double> vals = {1.5, 2.5, 1.35, 1.45, 15, 25};
+  cudf::test::fixed_width_column_wrapper<double> input(vals.begin(), vals.end());
+  cudf::round(input, -1, cudf::rounding_method::HALF_EVEN, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp
index 07b2d77cc04..e7b282601e1 100644
--- a/cpp/tests/streams/stream_compaction_test.cpp
+++ b/cpp/tests/streams/stream_compaction_test.cpp
@@ -15,20 +15,16 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cmath>
-
 auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
 auto constexpr KEEP_ANY     = cudf::duplicate_keep_option::KEEP_ANY;
 auto constexpr KEEP_FIRST   = cudf::duplicate_keep_option::KEEP_FIRST;
diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp
index 36e595ab9fa..449e0830b0c 100644
--- a/cpp/tests/streams/strings/factory_test.cpp
+++ b/cpp/tests/streams/strings/factory_test.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column_factories.hpp>
diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp
index 4b4d0a7aff5..154e1c1b715 100644
--- a/cpp/tests/streams/strings/reverse_test.cpp
+++ b/cpp/tests/streams/strings/reverse_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf/strings/reverse.hpp>
 
 #include <string>
-#include <vector>
 
 class StringsReverseTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp
index cf81dc6fb42..9f168abcb31 100644
--- a/cpp/tests/streams/transform_test.cpp
+++ b/cpp/tests/streams/transform_test.cpp
@@ -15,17 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/ast/expressions.hpp>
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index 9c0ecaa52c0..06b9c2fa3c1 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,10 +23,8 @@
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/sorting.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp
index bb57d6f5e8a..e53adcf373a 100644
--- a/cpp/tests/strings/combine/concatenate_tests.cpp
+++ b/cpp/tests/strings/combine/concatenate_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp
index 00317146088..c92f1cfc8f8 100644
--- a/cpp/tests/strings/combine/join_list_elements_tests.cpp
+++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/types.hpp>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp
index 5cf4015b9e9..51dcc60d95e 100644
--- a/cpp/tests/strings/concatenate_tests.cpp
+++ b/cpp/tests/strings/concatenate_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/concatenate.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 
 #include <vector>
 
diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp
index b3dc3010c67..da0db0fc056 100644
--- a/cpp/tests/strings/datetime_tests.cpp
+++ b/cpp/tests/strings/datetime_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/strings/convert/convert_datetime.hpp>
-#include <cudf/strings/convert/convert_durations.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/wrappers/durations.hpp>
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index 7e0338f1bf4..37b25d9b287 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <cudf/detail/iterator.cuh>
 #include <cudf/strings/extract.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
 #include <cudf/strings/strings_column_view.hpp>
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 4821a7fa999..7eb4b32d078 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 
 #include <cudf/strings/findall.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
@@ -28,8 +27,6 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <vector>
-
 struct StringsFindallTests : public cudf::test::BaseFixture {};
 
 TEST_F(StringsFindallTests, FindallTest)
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 79054551498..b788c05c152 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -23,8 +23,6 @@
 #include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
-#include <limits>
-
 struct StringsConvertTest : public cudf::test::BaseFixture {};
 
 template <typename T>
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 26bcfe8028d..c08effdb969 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -24,9 +24,6 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-
 #include <thrust/host_vector.h>
 #include <thrust/iterator/transform_iterator.h>
 
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index 219bd6d8b01..a34ff25cb69 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -17,28 +17,18 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/structs/structs_column_view.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/device_buffer.hpp>
 
-#include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/sequence.h>
 
 #include <algorithm>
 #include <functional>
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
index c33eedf9bd9..c0df2f01a63 100644
--- a/cpp/tests/structs/utilities_tests.cpp
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -14,21 +14,15 @@
  * limitations under the License.
  */
 
-#include "cudf_test/default_stream.hpp"
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/structs/utilities.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp
index 5fa63c47cf0..216c4d7b6bb 100644
--- a/cpp/tests/table/row_operators_tests.cpp
+++ b/cpp/tests/table/row_operators_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp
index 1637ba7d7d3..363f1a0ba5d 100644
--- a/cpp/tests/table/table_tests.cpp
+++ b/cpp/tests/table/table_tests.cpp
@@ -17,17 +17,14 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
 #include <memory>
-#include <random>
 
 template <typename T>
 using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index e23f3f6e7d8..ef35a4472cf 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -21,13 +21,9 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/span.hpp>
 
 #include <nvtext/minhash.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-
 #include <vector>
 
 struct MinHashTest : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index 1acb4fc4265..c72c7cfc80e 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -28,8 +28,6 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <vector>
-
 struct TextGenerateNgramsTest : public cudf::test::BaseFixture {};
 
 TEST_F(TextGenerateNgramsTest, Ngrams)
diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp
index b0d41004e7e..2515cc917fa 100644
--- a/cpp/tests/text/normalize_tests.cpp
+++ b/cpp/tests/text/normalize_tests.cpp
@@ -20,7 +20,6 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/normalize.hpp>
diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp
index a343913411c..82c4bf53cfc 100644
--- a/cpp/tests/text/stemmer_tests.cpp
+++ b/cpp/tests/text/stemmer_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/stemmer.hpp>
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index a615780c02a..782551ad66e 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -19,13 +19,11 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/subword_tokenize.hpp>
 
 #include <fstream>
-#include <iostream>
 #include <vector>
 
 // Global environment for temporary files
diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp
index 2684123c08a..9437440f34d 100644
--- a/cpp/tests/transform/bools_to_mask_test.cpp
+++ b/cpp/tests/transform/bools_to_mask_test.cpp
@@ -20,10 +20,8 @@
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/transform.hpp>
-#include <cudf/types.hpp>
 
 #include <thrust/host_vector.h>
 
diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp
index ba16c100e7a..42ca872a936 100644
--- a/cpp/tests/transform/nans_to_null_test.cpp
+++ b/cpp/tests/transform/nans_to_null_test.cpp
@@ -17,12 +17,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/transform.hpp>
-#include <cudf/types.hpp>
 
 template <typename T>
 struct NaNsToNullTest : public cudf::test::BaseFixture {
diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp
index 5a88c402b8c..7797b2b2cf8 100644
--- a/cpp/tests/transpose/transpose_test.cpp
+++ b/cpp/tests/transpose/transpose_test.cpp
@@ -22,7 +22,6 @@
 #include <cudf/transpose.hpp>
 
 #include <algorithm>
-#include <limits>
 #include <random>
 #include <string>
 
diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp
index 0d9092c33da..46468af515d 100644
--- a/cpp/tests/types/traits_test.cpp
+++ b/cpp/tests/types/traits_test.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index 45b89b76070..ed4c1340dbb 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -20,18 +20,15 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <cuda/std/limits>
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 
-#include <type_traits>
 #include <vector>
 
 static auto const test_timestamps_D = std::vector<int32_t>{
diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp
index 5bfbf70d5f9..663a919f3f4 100644
--- a/cpp/tests/unary/math_ops_test.cpp
+++ b/cpp/tests/unary/math_ops_test.cpp
@@ -22,10 +22,6 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/unary.hpp>
-#include <cudf/utilities/bit.hpp>
-#include <cudf/wrappers/timestamps.hpp>
-
-#include <cuda/std/climits>
 
 #include <vector>
 
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index e7477c34642..3c616461c74 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -23,7 +23,6 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/unary.hpp>
 
-#include <cuda/std/limits>
 #include <thrust/iterator/counting_iterator.h>
 
 template <typename T>
diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp
index ab5a31ce161..555d89b7dc5 100644
--- a/cpp/tests/utilities/random_seed.cpp
+++ b/cpp/tests/utilities/random_seed.cpp
@@ -13,8 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <cudf/utilities/export.hpp>
 
-#include <cudf/types.hpp>
+#include <cstdint>
 
 namespace cudf {
 namespace test {
diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp
index 7aa05af4591..2a57d678d07 100644
--- a/cpp/tests/utilities_tests/column_debug_tests.cpp
+++ b/cpp/tests/utilities_tests/column_debug_tests.cpp
@@ -16,12 +16,9 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/strings/strings_column_view.hpp>
-
 #include <thrust/iterator/transform_iterator.h>
 
 #include <type_traits>
diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp
index 9d6d5ccb9b5..a13ce825d0b 100644
--- a/cpp/tests/utilities_tests/column_utilities_tests.cpp
+++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp
@@ -17,20 +17,16 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/strings/strings_column_view.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include <type_traits>
-
 template <typename T>
 struct ColumnUtilitiesTest : public cudf::test::BaseFixture {
   cudf::test::UniformRandomGenerator<cudf::size_type> random;
diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp
index 479c6687e75..339678f3be8 100644
--- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp
+++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp
index 5e3fda5e6f7..ff50dc39979 100644
--- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp
+++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp
index fecb896f95a..c1c5776be74 100644
--- a/cpp/tests/utilities_tests/type_check_tests.cpp
+++ b/cpp/tests/utilities_tests/type_check_tests.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/wrappers/durations.hpp>
diff --git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp
index 849457056e4..6c3a84763a0 100644
--- a/cpp/tests/utilities_tests/type_list_tests.cpp
+++ b/cpp/tests/utilities_tests/type_list_tests.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_list_utilities.hpp>
 
 using namespace cudf::test;  // this will make reading code way easier
@@ -23,6 +22,7 @@ namespace {
 // Work around to remove parentheses surrounding a type
 template <typename T>
 struct argument_type;
+
 template <typename T, typename U>
 struct argument_type<T(U)> {
   using type = U;
diff --git a/dependencies.yaml b/dependencies.yaml
index ff97b67f0ce..90255ca674c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -232,7 +232,7 @@ files:
       key: cudf-pandas-tests
     includes:
       - test_python_cudf_pandas
-  py_rapids_build_cudf_polars:
+  py_build_cudf_polars:
     output: pyproject
     pyproject_dir: python/cudf_polars
     extras:
@@ -399,21 +399,21 @@ dependencies:
       - output_types: conda
         packages:
           # Align nvcomp version with rapids-cmake
-          - nvcomp==4.0.1
+          - nvcomp==4.1.0.6
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
             packages:
-              - nvidia-nvcomp-cu12==4.0.1
+              - nvidia-nvcomp-cu12==4.1.0.6
           - matrix:
               cuda: "11.*"
             packages:
-              - nvidia-nvcomp-cu11==4.0.1
+              - nvidia-nvcomp-cu11==4.1.0.6
           - matrix:
             packages:
-              - nvidia-nvcomp==4.0.1
+              - nvidia-nvcomp==4.1.0.6
   rapids_build_skbuild:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -727,7 +727,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.8,<1.9
+          - polars>=1.11,<1.13
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -828,6 +828,7 @@ dependencies:
           - pytest-benchmark
           - pytest-cases>=3.8.2
           - scipy
+          - mmh3
       - output_types: conda
         packages:
           - aiobotocore>=2.2.0
@@ -836,12 +837,14 @@ dependencies:
           - msgpack-python
           - moto>=4.0.8
           - s3fs>=2022.3.0
-      - output_types: pyproject
+          - python-xxhash
+      - output_types: [pyproject, requirements]
         packages:
           - msgpack
           - &tokenizers tokenizers==0.15.2
           - &transformers transformers==4.39.3
           - tzdata
+          - xxhash
     specific:
       - output_types: [conda, requirements]
         matrices:
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index ecf619ddc44..5942cc16850 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -342,10 +342,7 @@ def clean_all_xml_files(path):
     "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
     "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
     "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
-    # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer
-    # when the RMM objects inventory is generated from branch-24.12. The RMM objects inventory
-    # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv
-    "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"),
+    "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
 }
 
 
diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md
index 6fce268f309..f4d2c7319b3 100644
--- a/docs/cudf/source/developer_guide/contributing_guide.md
+++ b/docs/cudf/source/developer_guide/contributing_guide.md
@@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm
 The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth linting.
 Specifically, cuDF uses the following tools:
 
-- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance.
-- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently.
+- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance.
 - [`mypy`](http://mypy-lang.org/) performs static type checking.
   In conjunction with [type hints](https://docs.python.org/3/library/typing.html),
   `mypy` can help catch various bugs that are otherwise difficult to find.
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index 95f5f9734dd..46221b6015b 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -38,10 +38,10 @@
     "import os\n",
     "\n",
     "import cupy as cp\n",
+    "import dask_cudf\n",
     "import pandas as pd\n",
     "\n",
     "import cudf\n",
-    "import dask_cudf\n",
     "\n",
     "cp.random.seed(12)\n",
     "\n",
diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index d05501f4a4a..f711327f9ed 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -19,7 +19,7 @@ This page provides a list of all publicly accessible modules, methods and classe
     general_utilities
     window
     io
-    subword_tokenize
+    tokenize_vocabulary
     string_handling
     list_handling
     struct_handling
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst
new file mode 100644
index 00000000000..6bd1fbd821b
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/hashing.rst
@@ -0,0 +1,6 @@
+=======
+hashing
+=======
+
+.. automodule:: pylibcudf.hashing
+    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 62e14a67ee5..997ece6d29c 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
+    hashing
     interop
     join
     json
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst
new file mode 100644
index 00000000000..908fcc4fde6
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/byte_pair_encode.rst
@@ -0,0 +1,6 @@
+================
+byte_pair_encode
+================
+
+.. automodule:: pylibcudf.nvtext.byte_pair_encode
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 58303356336..9ba47fd8d70 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -8,4 +8,10 @@ nvtext
     generate_ngrams
     jaccard
     minhash
+    byte_pair_encode
     ngrams_tokenize
+    normalize
+    replace
+    stemmer
+    subword_tokenize
+    tokenize
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst
new file mode 100644
index 00000000000..e496f6a45da
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst
@@ -0,0 +1,6 @@
+=========
+normalize
+=========
+
+.. automodule:: pylibcudf.nvtext.normalize
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst
new file mode 100644
index 00000000000..04cee972dc1
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst
@@ -0,0 +1,6 @@
+=======
+replace
+=======
+
+.. automodule:: pylibcudf.nvtext.replace
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
new file mode 100644
index 00000000000..b407ff8451a
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
@@ -0,0 +1,6 @@
+=======
+stemmer
+=======
+
+.. automodule:: pylibcudf.nvtext.stemmer
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst
new file mode 100644
index 00000000000..818714bec6a
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/subword_tokenize.rst
@@ -0,0 +1,6 @@
+================
+subword_tokenize
+================
+
+.. automodule:: pylibcudf.nvtext.subword_tokenize
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst
new file mode 100644
index 00000000000..85c5a27b09d
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/tokenize.rst
@@ -0,0 +1,6 @@
+========
+tokenize
+========
+
+.. automodule:: pylibcudf.nvtext.tokenize
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
index 9850ee10098..699e38ebbe5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst
@@ -1,6 +1,6 @@
-====
-find
-====
+=======
+findall
+=======
 
 .. automodule:: pylibcudf.strings.findall
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index c8c0016126d..ae670b5bd8a 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -16,6 +16,7 @@ strings
     regex_flags
     regex_program
     repeat
+    replace_re
     replace
     side_type
     slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst
new file mode 100644
index 00000000000..5bf715ef657
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst
@@ -0,0 +1,6 @@
+==========
+replace_re
+==========
+
+.. automodule:: pylibcudf.strings.replace_re
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
index e39ca18a12b..4de9bced86f 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
@@ -1,5 +1,5 @@
 =====
-Table
+table
 =====
 
 .. automodule:: pylibcudf.table
diff --git a/docs/cudf/source/user_guide/api_docs/string_handling.rst b/docs/cudf/source/user_guide/api_docs/string_handling.rst
index ab0f085e1a6..91d3e33960b 100644
--- a/docs/cudf/source/user_guide/api_docs/string_handling.rst
+++ b/docs/cudf/source/user_guide/api_docs/string_handling.rst
@@ -60,6 +60,7 @@ strings and apply several methods to it. These can be accessed like
    isupper
    istimestamp
    istitle
+   jaccard_index
    join
    len
    like
@@ -67,6 +68,7 @@ strings and apply several methods to it. These can be accessed like
    lower
    lstrip
    match
+   minhash
    ngrams
    ngrams_tokenize
    normalize_characters
@@ -90,7 +92,6 @@ strings and apply several methods to it. These can be accessed like
    slice_from
    slice_replace
    split
-   rsplit
    startswith
    strip
    swapcase
diff --git a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst b/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst
deleted file mode 100644
index cd240fe4db4..00000000000
--- a/docs/cudf/source/user_guide/api_docs/subword_tokenize.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-================
-SubwordTokenizer
-================
-.. currentmodule:: cudf.core.subword_tokenizer
-
-Constructor
-~~~~~~~~~~~
-.. autosummary::
-   :toctree: api/
-
-   SubwordTokenizer
-   SubwordTokenizer.__call__
diff --git a/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst
new file mode 100644
index 00000000000..1b5c965f3c9
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/tokenize_vocabulary.rst
@@ -0,0 +1,12 @@
+==================
+TokenizeVocabulary
+==================
+.. currentmodule:: cudf.core.tokenize_vocabulary
+
+Constructor
+~~~~~~~~~~~
+.. autosummary::
+   :toctree: api/
+
+   TokenizeVocabulary
+   TokenizeVocabulary.tokenize
diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb
index 75eafcc5387..abfe5a1b178 100644
--- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb
+++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb
@@ -101,6 +101,8 @@
    "outputs": [],
    "source": [
     "# define a scalar function\n",
+    "\n",
+    "\n",
     "def f(x):\n",
     "    return x + 1"
    ]
@@ -247,6 +249,8 @@
    "outputs": [],
    "source": [
     "# redefine the same function from above\n",
+    "\n",
+    "\n",
     "def f(x):\n",
     "    return x + 1"
    ]
@@ -1622,6 +1626,8 @@
    "outputs": [],
    "source": [
     "# a user defined aggregation function.\n",
+    "\n",
+    "\n",
     "def udaf(df):\n",
     "    return df[\"b\"].max() - df[\"b\"].min() / 2"
    ]
diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
index e4106574a19..bfb959b12c1 100644
--- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
+++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -155,6 +155,16 @@ public static HostMemoryBuffer allocate(long bytes) {
     return allocate(bytes, defaultPreferPinned);
   }
 
+  /**
+   * Allocate host memory bypassing the default allocator. This is intended to only be used by other allocators.
+   * Pinned memory will not be used for these allocations.
+   * @param bytes size in bytes to allocate
+   * @return the newly created buffer
+   */
+  public static HostMemoryBuffer allocateRaw(long bytes) {
+    return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes);
+  }
+
   /**
    * Create a host buffer that is memory-mapped to a file.
    * @param path path to the file to map into host memory
@@ -245,8 +255,10 @@ public final void copyFromHostBuffer(long destOffset, HostMemoryBuffer srcData,
    * @param destOffset  offset in bytes in this buffer to start copying to
    * @param in input stream to copy bytes from
    * @param byteLength number of bytes to copy
+   * @throws EOFException If there are not enough bytes in the stream to copy.
+   * @throws IOException If there is an error reading from the stream.
    */
-  final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException {
+  public final void copyFromStream(long destOffset, InputStream in, long byteLength) throws IOException {
     addressOutOfBoundsCheck(address + destOffset, byteLength, "copy from stream");
     byte[] arrayBuffer = new byte[(int) Math.min(1024 * 128, byteLength)];
     long left = byteLength;
@@ -254,7 +266,7 @@ final void copyFromStream(long destOffset, InputStream in, long byteLength) thro
       int amountToCopy = (int) Math.min(arrayBuffer.length, left);
       int amountRead = in.read(arrayBuffer, 0, amountToCopy);
       if (amountRead < 0) {
-        throw new EOFException();
+        throw new EOFException("Unexpected end of stream, expected " + left + " more bytes");
       }
       setBytes(destOffset, arrayBuffer, 0, amountRead);
       destOffset += amountRead;
diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java
deleted file mode 100644
index 72c2e659372..00000000000
--- a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *
- *  Copyright (c) 2023, NVIDIA CORPORATION.
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- */
-
-package ai.rapids.cudf;
-
-/**
- * Represents some amount of host memory that has been reserved. A reservation guarantees that one
- * or more allocations up to the reserved amount, minus padding for alignment will succeed. A
- * reservation typically guarantees the amount can be allocated one, meaning when a buffer
- * allocated from a reservation is freed it is not returned to the reservation, but to the pool of
- * memory the reservation originally came from. If more memory is allocated from the reservation
- * an OutOfMemoryError may be thrown, but it is not guaranteed to happen.
- *
- * When the reservation is closed any unused reservation will be returned to the pool of memory
- * the reservation came from.
- */
-public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {}
diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java
index 7ed8e0354c9..68a3856f37d 100644
--- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java
+++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java
@@ -28,7 +28,16 @@ public enum RegexFlag {
   DEFAULT(0),   // default
   MULTILINE(8), // the '^' and '$' honor new-line characters
   DOTALL(16),   // the '.' matching includes new-line characters
-  ASCII(256);   // use only ASCII when matching built-in character classes
+  ASCII(256),   // use only ASCII when matching built-in character classes
+  /**
+   * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters
+   * - NEXT_LINE ('\u0085')
+   * - LINE_SEPARATOR ('\u2028')
+   * - PARAGRAPH_SEPARATOR ('\u2029')
+   * - CARRIAGE_RETURN ('\r')
+   * - NEW_LINE ('\n')
+   */
+  EXT_NEWLINE(512);
 
   final int nativeId; // Native id, for use with libcudf.
   private RegexFlag(int nativeId) { // Only constant values should be used
diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index 76b2799aad6..ae8a0e17f9d 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -29,26 +29,52 @@ public class Schema {
   public static final Schema INFERRED = new Schema();
 
   private final DType topLevelType;
+
+  /**
+   * Sentinel precision used when none is specified or the column type is not decimal.
+   */
+  private static final int UNKNOWN_PRECISION = -1;
+
+  /**
+   * Store precision for the top level column, only applicable if the column is a decimal type.
+   * <p>
+   * This variable is not designed to be used by any libcudf's APIs since libcudf does not support
+   * precisions for fixed point numbers.
+   * Instead, it is used only to pass down the precision values from Spark's DecimalType to the
+   * JNI level, where some JNI functions require these values to perform their operations.
+   */
+  private final int topLevelPrecision;
+
   private final List<String> childNames;
   private final List<Schema> childSchemas;
   private boolean flattened = false;
   private String[] flattenedNames;
   private DType[] flattenedTypes;
+  private int[] flattenedPrecisions;
   private int[] flattenedCounts;
 
   private Schema(DType topLevelType,
+                 int topLevelPrecision,
                  List<String> childNames,
                  List<Schema> childSchemas) {
     this.topLevelType = topLevelType;
+    this.topLevelPrecision = topLevelPrecision;
     this.childNames = childNames;
     this.childSchemas = childSchemas;
   }
 
+  private Schema(DType topLevelType,
+                 List<String> childNames,
+                 List<Schema> childSchemas) {
+    this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas);
+  }
+
   /**
    * Inferred schema.
    */
   private Schema() {
     topLevelType = null;
+    topLevelPrecision = UNKNOWN_PRECISION;
     childNames = null;
     childSchemas = null;
   }
@@ -104,14 +130,17 @@ private void flattenIfNeeded() {
       if (flatLen == 0) {
         flattenedNames = null;
         flattenedTypes = null;
+        flattenedPrecisions = null;
         flattenedCounts = null;
       } else {
         String[] names = new String[flatLen];
         DType[] types = new DType[flatLen];
+        int[] precisions = new int[flatLen];
         int[] counts = new int[flatLen];
-        collectFlattened(names, types, counts, 0);
+        collectFlattened(names, types, precisions, counts, 0);
         flattenedNames = names;
         flattenedTypes = types;
+        flattenedPrecisions = precisions;
         flattenedCounts = counts;
       }
       flattened = true;
@@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) {
     return startingLength;
   }
 
-  private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) {
+  private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) {
     if (childSchemas != null) {
       for (int i = 0; i < childSchemas.size(); i++) {
         Schema child = childSchemas.get(i);
         names[offset] = childNames.get(i);
         types[offset] = child.topLevelType;
+        precisions[offset] = child.topLevelPrecision;
         if (child.childNames != null) {
           counts[offset] = child.childNames.size();
         } else {
           counts[offset] = 0;
         }
         offset++;
-        offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset);
+        offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset);
       }
     }
     return offset;
@@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() {
     return ret;
   }
 
+  /**
+   * Get decimal precisions of the columns' types flattened from all levels in schema by
+   * depth-first traversal.
+   * <p>
+   * This is used to pass down the decimal precisions from Spark to only the JNI layer, where
+   * some JNI functions require precision values to perform their operations.
+   * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not
+   * support precisions for fixed point numbers.
+   *
+   * @return the precision of each flattened column (-1 if unset), or null for an empty schema.
+   */
+  public int[] getFlattenedDecimalPrecisions() {
+    flattenIfNeeded();
+    return flattenedPrecisions;
+  }
+
   /**
    * Get the types of the columns in schema flattened from all levels by depth-first traversal.
    * @return An array containing types of all columns in schema.
@@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() {
 
   public static class Builder {
     private final DType topLevelType;
+    private final int topLevelPrecision;
     private final List<String> names;
     private final List<Builder> types;
 
-    private Builder(DType topLevelType) {
+    private Builder(DType topLevelType, int topLevelPrecision) {
       this.topLevelType = topLevelType;
+      this.topLevelPrecision = topLevelPrecision;
       if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) {
         // There can be children
         names = new ArrayList<>();
@@ -322,14 +370,19 @@ private Builder(DType topLevelType) {
       }
     }
 
+    private Builder(DType topLevelType) {
+      this(topLevelType, UNKNOWN_PRECISION);
+    }
+
     /**
      * Add a new column
      * @param type the type of column to add
      * @param name the name of the column to add (Ignored for list types)
+     * @param precision the decimal precision, only applicable for decimal types
      * @return the builder for the new column. This should really only be used when the type
      * passed in is a LIST or a STRUCT.
      */
-    public Builder addColumn(DType type, String name) {
+    public Builder addColumn(DType type, String name, int precision) {
       if (names == null) {
         throw new IllegalStateException("A column of type " + topLevelType +
             " cannot have children");
@@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) {
       if (names.contains(name)) {
         throw new IllegalStateException("Cannot add duplicate names to a schema");
       }
-      Builder ret = new Builder(type);
+      Builder ret = new Builder(type, precision);
       types.add(ret);
       names.add(name);
       return ret;
     }
 
+    public Builder addColumn(DType type, String name) {
+      return addColumn(type, name, UNKNOWN_PRECISION);
+    }
+
     /**
      * Adds a single column to the current schema. addColumn is preferred as it can be used
      * to support nested types.
      * @param type the type of the column.
      * @param name the name of the column.
+     * @param precision the decimal precision, only applicable for decimal types.
      * @return this for chaining.
      */
+    public Builder column(DType type, String name, int precision) {
+      addColumn(type, name, precision);
+      return this;
+    }
+
     public Builder column(DType type, String name) {
-      addColumn(type, name);
+      addColumn(type, name, UNKNOWN_PRECISION);
       return this;
     }
 
@@ -366,7 +429,7 @@ public Schema build() {
           children.add(b.build());
         }
       }
-      return new Schema(topLevelType, names, children);
+      return new Schema(topLevelType, topLevelPrecision, names, children);
     }
   }
 }
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 708744569df..14c290b300a 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -31,6 +31,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.EnumSet;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -3877,6 +3878,43 @@ void testExtractRe() {
     }
   }
 
+  // Checks that '$' treats CR, NEL, LS and PS as line ends only when EXT_NEWLINE is set.
+  @Test
+  void testExtractReWithMultiLineDelimiters() {
+    String NEXT_LINE = "\u0085";
+    String LINE_SEPARATOR = "\u2028";
+    String PARAGRAPH_SEPARATOR = "\u2029";
+    String CARRIAGE_RETURN = "\r";
+    String NEW_LINE = "\n";
+
+    try (ColumnVector input = ColumnVector.fromStrings(
+            "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::",
+            "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll",
+            "boo::",
+            "",
+            "boo::" + NEW_LINE,
+            "boo::" + CARRIAGE_RETURN,
+            "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR,
+            "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR,
+            "boo:" + NEXT_LINE + "boo::" + NEXT_LINE);
+         Table expected_ext_newline = new Table.TestBuilder()
+             .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::")
+             .build();
+         Table expected_default = new Table.TestBuilder()
+             .column("boo:::", null, "boo::", null, "boo::", null, null, null, null)
+             .build()) {
+
+      // Regex pattern to match 'boo' followed by one or more colons at the end of the string
+      try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) {
+        assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]);
+      }
+
+      try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) {
+        assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]);
+      }
+    }
+  }
+
   @Test
   void testExtractAllRecord() {
     String pattern = "([ab])(\\d)";
diff --git a/pyproject.toml b/pyproject.toml
index 661c68ee62e..6933484f4e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,8 @@ select = [
     "F",
     # pycodestyle Warning
     "W",
+    # isort
+    "I",
     # no-blank-line-before-function
     "D201",
     # one-blank-line-after-class
diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py
index 7b2b71cf216..0e4afadccf5 100644
--- a/python/cudf/benchmarks/conftest.py
+++ b/python/cudf/benchmarks/conftest.py
@@ -56,27 +56,23 @@
 # into the main repo.
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
-from config import cudf  # noqa: W0611, E402, F401
-from utils import (  # noqa: E402
-    OrderedSet,
-    collapse_fixtures,
-    column_generators,
-    make_fixture,
-)
-
 # Turn off isort until we upgrade to 5.8.0
 # https://github.com/pycqa/isort/issues/1594
-# isort: off
 from config import (  # noqa: W0611, E402, F401
     NUM_COLS,
     NUM_ROWS,
     collect_ignore,
+    cudf,  # noqa: W0611, E402, F401
     pytest_collection_modifyitems,
     pytest_sessionfinish,
     pytest_sessionstart,
 )
-
-# isort: on
+from utils import (  # noqa: E402
+    OrderedSet,
+    collapse_fixtures,
+    column_generators,
+    make_fixture,
+)
 
 
 @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"])
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 065655505b8..94dbdf5534d 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -688,15 +688,18 @@ cdef class Column:
         # special case for string column
         is_string_column = (cv.type().id() == libcudf_types.type_id.STRING)
         if is_string_column:
-            # get the size from offset child column (device to host copy)
-            offsets_column_index = 0
-            offset_child_column = cv.child(offsets_column_index)
-            if offset_child_column.size() == 0:
+            if cv.num_children() == 0:
                 base_nbytes = 0
             else:
-                chars_size = get_element(
-                    offset_child_column, offset_child_column.size()-1).value
-                base_nbytes = chars_size
+                # get the size from offset child column (device to host copy)
+                offsets_column_index = 0
+                offset_child_column = cv.child(offsets_column_index)
+                if offset_child_column.size() == 0:
+                    base_nbytes = 0
+                else:
+                    chars_size = get_element(
+                        offset_child_column, offset_child_column.size()-1).value
+                    base_nbytes = chars_size
 
         if data_ptr:
             if data_owner is None:
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 265b92ff645..c26e1de23a1 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -4,7 +4,7 @@ import pickle
 
 from libc.stdint cimport uint8_t, uintptr_t
 from libcpp cimport bool
-from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
+from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique
 cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.lists.gather cimport (
-    segmented_gather as cpp_segmented_gather,
-)
-from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 from pylibcudf.libcudf.scalar.scalar cimport scalar
 from pylibcudf.libcudf.types cimport size_type
 
@@ -340,26 +336,6 @@ def get_element(Column input_column, size_type index):
     )
 
 
-@acquire_spill_lock()
-def segmented_gather(Column source_column, Column gather_map):
-    cdef shared_ptr[lists_column_view] source_LCV = (
-        make_shared[lists_column_view](source_column.view())
-    )
-    cdef shared_ptr[lists_column_view] gather_map_LCV = (
-        make_shared[lists_column_view](gather_map.view())
-    )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_segmented_gather(
-                source_LCV.get()[0], gather_map_LCV.get()[0])
-        )
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
-
-
 cdef class _CPackedColumns:
 
     @staticmethod
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 9b7ab0888d2..89309b36371 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -1,27 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf.core.buffer import acquire_spill_lock
+import pylibcudf as plc
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
+from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.hash cimport (
-    md5,
-    murmurhash3_x86_32,
-    sha1,
-    sha224,
-    sha256,
-    sha384,
-    sha512,
-    xxhash_64,
-)
-from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.table cimport Table
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport table_view_from_columns
-
-import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -37,32 +22,26 @@ def hash_partition(list source_columns, list columns_to_hash,
 
 @acquire_spill_lock()
 def hash(list source_columns, str method, int seed=0):
-    cdef table_view c_source_view = table_view_from_columns(source_columns)
-    cdef unique_ptr[column] c_result
+    cdef Table ctbl = Table(
+        [c.to_pylibcudf(mode="read") for c in source_columns]
+    )
     if method == "murmur3":
-        with nogil:
-            c_result = move(murmurhash3_x86_32(c_source_view, seed))
+        return Column.from_pylibcudf(plc.hashing.murmurhash3_x86_32(ctbl, seed))
+    elif method == "xxhash64":
+        return Column.from_pylibcudf(plc.hashing.xxhash_64(ctbl, seed))
     elif method == "md5":
-        with nogil:
-            c_result = move(md5(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.md5(ctbl))
     elif method == "sha1":
-        with nogil:
-            c_result = move(sha1(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha1(ctbl))
     elif method == "sha224":
-        with nogil:
-            c_result = move(sha224(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha224(ctbl))
     elif method == "sha256":
-        with nogil:
-            c_result = move(sha256(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha256(ctbl))
     elif method == "sha384":
-        with nogil:
-            c_result = move(sha384(c_source_view))
+        return Column.from_pylibcudf(plc.hashing.sha384(ctbl))
     elif method == "sha512":
-        with nogil:
-            c_result = move(sha512(c_source_view))
-    elif method == "xxhash64":
-        with nogil:
-            c_result = move(xxhash_64(c_source_view, seed))
+        return Column.from_pylibcudf(plc.hashing.sha512(ctbl))
     else:
-        raise ValueError(f"Unsupported hash function: {method}")
-    return Column.from_unique_ptr(move(c_result))
+        raise ValueError(
+            f"Unsupported hashing algorithm {method}."
+        )
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 1dc586bb257..1c9d3a01b80 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -1,49 +1,22 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cpython cimport pycapsule
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 import pylibcudf
 
-from pylibcudf.libcudf.interop cimport (
-    DLManagedTensor,
-    from_dlpack as cpp_from_dlpack,
-    to_dlpack as cpp_to_dlpack,
-)
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-
-from cudf._lib.utils cimport (
-    columns_from_pylibcudf_table,
-    columns_from_unique_ptr,
-    table_view_from_columns,
-)
+from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.dtypes import ListDtype, StructDtype
 
 
-def from_dlpack(dlpack_capsule):
+def from_dlpack(object dlpack_capsule):
     """
     Converts a DLPack Tensor PyCapsule into a list of columns.
 
     DLPack Tensor PyCapsule is expected to have the name "dltensor".
     """
-    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>pycapsule.\
-        PyCapsule_GetPointer(dlpack_capsule, 'dltensor')
-    pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor')
-
-    cdef unique_ptr[table] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_from_dlpack(dlpack_tensor)
-        )
-
-    res = columns_from_unique_ptr(move(c_result))
-    dlpack_tensor.deleter(dlpack_tensor)
-    return res
+    return columns_from_pylibcudf_table(
+        pylibcudf.interop.from_dlpack(dlpack_capsule)
+    )
 
 
 def to_dlpack(list source_columns):
@@ -52,39 +25,13 @@ def to_dlpack(list source_columns):
 
     DLPack Tensor PyCapsule will have the name "dltensor".
     """
-    if any(column.null_count for column in source_columns):
-        raise ValueError(
-            "Cannot create a DLPack tensor with null values. \
-                Input is required to have null count as zero."
-        )
-
-    cdef DLManagedTensor *dlpack_tensor
-    cdef table_view source_table_view = table_view_from_columns(source_columns)
-
-    with nogil:
-        dlpack_tensor = cpp_to_dlpack(
-            source_table_view
+    return pylibcudf.interop.to_dlpack(
+        pylibcudf.Table(
+            [col.to_pylibcudf(mode="read") for col in source_columns]
         )
-
-    return pycapsule.PyCapsule_New(
-        dlpack_tensor,
-        'dltensor',
-        dlmanaged_tensor_pycapsule_deleter
     )
 
 
-cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
-    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>0
-    try:
-        dlpack_tensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(
-            pycap_obj, 'used_dltensor')
-        return  # we do not call a used capsule's deleter
-    except Exception:
-        dlpack_tensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(
-            pycap_obj, 'dltensor')
-    dlpack_tensor.deleter(dlpack_tensor)
-
-
 def gather_metadata(object cols_dtypes):
     """
     Generates a ColumnMetadata vector for each column.
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 7e8710bedb6..12432ac6d5d 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-import pylibcudf
+import pylibcudf as plc
 
 from pylibcudf cimport Scalar
 
@@ -17,7 +17,7 @@ from pylibcudf cimport Scalar
 @acquire_spill_lock()
 def count_elements(Column col):
     return Column.from_pylibcudf(
-        pylibcudf.lists.count_elements(
+        plc.lists.count_elements(
             col.to_pylibcudf(mode="read"))
     )
 
@@ -25,8 +25,8 @@ def count_elements(Column col):
 @acquire_spill_lock()
 def explode_outer(list source_columns, int explode_column_idx):
     return columns_from_pylibcudf_table(
-        pylibcudf.lists.explode_outer(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
+        plc.lists.explode_outer(
+            plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
             explode_column_idx,
         )
     )
@@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx):
 @acquire_spill_lock()
 def distinct(Column col, bool nulls_equal, bool nans_all_equal):
     return Column.from_pylibcudf(
-        pylibcudf.lists.distinct(
+        plc.lists.distinct(
             col.to_pylibcudf(mode="read"),
             nulls_equal,
             nans_all_equal,
@@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
 @acquire_spill_lock()
 def sort_lists(Column col, bool ascending, str na_position):
     return Column.from_pylibcudf(
-        pylibcudf.lists.sort_lists(
+        plc.lists.sort_lists(
             col.to_pylibcudf(mode="read"),
             ascending,
             null_order.BEFORE if na_position == "first" else null_order.AFTER,
@@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position):
 @acquire_spill_lock()
 def extract_element_scalar(Column col, size_type index):
     return Column.from_pylibcudf(
-        pylibcudf.lists.extract_list_element(
+        plc.lists.extract_list_element(
             col.to_pylibcudf(mode="read"),
             index,
         )
@@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index):
 @acquire_spill_lock()
 def extract_element_column(Column col, Column index):
     return Column.from_pylibcudf(
-        pylibcudf.lists.extract_list_element(
+        plc.lists.extract_list_element(
             col.to_pylibcudf(mode="read"),
             index.to_pylibcudf(mode="read"),
         )
@@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index):
 @acquire_spill_lock()
 def contains_scalar(Column col, py_search_key):
     return Column.from_pylibcudf(
-        pylibcudf.lists.contains(
+        plc.lists.contains(
             col.to_pylibcudf(mode="read"),
             <Scalar> py_search_key.device_value.c_value,
         )
@@ -88,7 +88,7 @@ def contains_scalar(Column col, py_search_key):
 @acquire_spill_lock()
 def index_of_scalar(Column col, object py_search_key):
     return Column.from_pylibcudf(
-        pylibcudf.lists.index_of(
+        plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
             <Scalar> py_search_key.device_value.c_value,
             True,
@@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key):
 @acquire_spill_lock()
 def index_of_column(Column col, Column search_keys):
     return Column.from_pylibcudf(
-        pylibcudf.lists.index_of(
+        plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
             search_keys.to_pylibcudf(mode="read"),
             True,
@@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys):
 @acquire_spill_lock()
 def concatenate_rows(list source_columns):
     return Column.from_pylibcudf(
-        pylibcudf.lists.concatenate_rows(
-            pylibcudf.Table([
+        plc.lists.concatenate_rows(
+            plc.Table([
                 c.to_pylibcudf(mode="read") for c in source_columns
             ])
         )
@@ -121,8 +121,18 @@ def concatenate_rows(list source_columns):
 @acquire_spill_lock()
 def concatenate_list_elements(Column input_column, dropna=False):
     return Column.from_pylibcudf(
-        pylibcudf.lists.concatenate_list_elements(
+        plc.lists.concatenate_list_elements(
             input_column.to_pylibcudf(mode="read"),
             dropna,
         )
     )
+
+
+@acquire_spill_lock()
+def segmented_gather(Column source_column, Column gather_map):
+    return Column.from_pylibcudf(
+        plc.lists.segmented_gather(
+            source_column.to_pylibcudf(mode="read"),
+            gather_map.to_pylibcudf(mode="read"),
+        )
+    )
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
index 0d768e24f39..2b2762eead2 100644
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -3,49 +3,22 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
-    bpe_merge_pairs as cpp_bpe_merge_pairs,
-    byte_pair_encoding as cpp_byte_pair_encoding,
-    load_merge_pairs as cpp_load_merge_pairs,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-
 
-cdef class BPEMergePairs:
-    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
-
-    def __cinit__(self, Column merge_pairs):
-        cdef column_view c_pairs = merge_pairs.view()
-        with nogil:
-            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+from pylibcudf import nvtext
+from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs  # no-cython-lint
 
 
 @acquire_spill_lock()
 def byte_pair_encoding(
     Column strings,
-    BPEMergePairs merge_pairs,
+    object merge_pairs,
     object separator
 ):
-    cdef column_view c_strings = strings.view()
-    cdef DeviceScalar d_separator = separator.device_value
-    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_byte_pair_encoding(
-                c_strings,
-                merge_pairs.c_obj.get()[0],
-                c_separator[0]
-            )
+    return Column.from_pylibcudf(
+        nvtext.byte_pair_encode.byte_pair_encoding(
+            strings.to_pylibcudf(mode="read"),
+            merge_pairs,
+            separator.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
index 6521116eafe..c125d92a24e 100644
--- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
@@ -14,10 +14,11 @@ def ngrams_tokenize(
     object py_delimiter,
     object py_separator
 ):
-    result = nvtext.ngrams_tokenize.ngrams_tokenize(
-        input.to_pylibcudf(mode="read"),
-        ngrams,
-        py_delimiter.device_value.c_value,
-        py_separator.device_value.c_value
+    return Column.from_pylibcudf(
+        nvtext.ngrams_tokenize.ngrams_tokenize(
+            input.to_pylibcudf(mode="read"),
+            ngrams,
+            py_delimiter.device_value.c_value,
+            py_separator.device_value.c_value
+        )
     )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx
index 5e86a9ce959..cc45123dd0a 100644
--- a/python/cudf/cudf/_lib/nvtext/normalize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx
@@ -3,36 +3,26 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.normalize cimport (
-    normalize_characters as cpp_normalize_characters,
-    normalize_spaces as cpp_normalize_spaces,
-)
 
 from cudf._lib.column cimport Column
 
-
-@acquire_spill_lock()
-def normalize_spaces(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_normalize_spaces(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
-def normalize_characters(Column strings, bool do_lower=True):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
+def normalize_spaces(Column input):
+    return Column.from_pylibcudf(
+        nvtext.normalize.normalize_spaces(
+            input.to_pylibcudf(mode="read")
+        )
+    )
 
-    with nogil:
-        c_result = move(cpp_normalize_characters(c_strings, do_lower))
 
-    return Column.from_unique_ptr(move(c_result))
+@acquire_spill_lock()
+def normalize_characters(Column input, bool do_lower=True):
+    return Column.from_pylibcudf(
+        nvtext.normalize.normalize_characters(
+            input.to_pylibcudf(mode="read"),
+            do_lower,
+        )
+    )
diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx
index 61ae3da5782..bec56ade83c 100644
--- a/python/cudf/cudf/_lib/nvtext/replace.pyx
+++ b/python/cudf/cudf/_lib/nvtext/replace.pyx
@@ -2,20 +2,10 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.replace cimport (
-    filter_tokens as cpp_filter_tokens,
-    replace_tokens as cpp_replace_tokens,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
@@ -30,27 +20,14 @@ def replace_tokens(Column strings,
     provided.
     """
 
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_targets = targets.view()
-    cdef column_view c_replacements = replacements.view()
-
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_replace_tokens(
-                c_strings,
-                c_targets,
-                c_replacements,
-                c_delimiter[0],
-            )
+    return Column.from_pylibcudf(
+        nvtext.replace.replace_tokens(
+            strings.to_pylibcudf(mode="read"),
+            targets.to_pylibcudf(mode="read"),
+            replacements.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value,
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
@@ -65,24 +42,11 @@ def filter_tokens(Column strings,
     character provided.
     """
 
-    cdef DeviceScalar replacement = py_replacement.device_value
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_repl = <const string_scalar*>replacement\
-        .get_raw_ptr()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_filter_tokens(
-                c_strings,
-                min_token_length,
-                c_repl[0],
-                c_delimiter[0],
-            )
+    return Column.from_pylibcudf(
+        nvtext.replace.filter_tokens(
+            strings.to_pylibcudf(mode="read"),
+            min_token_length,
+            py_replacement.device_value.c_value,
+            py_delimiter.device_value.c_value,
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
index 5bf25562fed..63a389b64d5 100644
--- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx
+++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
@@ -1,24 +1,19 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from enum import IntEnum
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
+from cudf.core.buffer import acquire_spill_lock
+
 from pylibcudf.libcudf.nvtext.stemmer cimport (
-    is_letter as cpp_is_letter,
     letter_type,
-    porter_stemmer_measure as cpp_porter_stemmer_measure,
     underlying_type_t_letter_type,
 )
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 class LetterType(IntEnum):
     CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
@@ -27,43 +22,34 @@ class LetterType(IntEnum):
 
 @acquire_spill_lock()
 def porter_stemmer_measure(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_porter_stemmer_measure(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.stemmer.porter_stemmer_measure(
+            strings.to_pylibcudf(mode="read"),
+        )
+    )
 
 
 @acquire_spill_lock()
 def is_letter(Column strings,
               object ltype,
               size_type index):
-    cdef column_view c_strings = strings.view()
-    cdef letter_type c_ltype = <letter_type>(
-        <underlying_type_t_letter_type> ltype
+    return Column.from_pylibcudf(
+        nvtext.stemmer.is_letter(
+            strings.to_pylibcudf(mode="read"),
+            ltype==LetterType.VOWEL,
+            index,
+        )
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_is_letter(c_strings, c_ltype, index))
-
-    return Column.from_unique_ptr(move(c_result))
 
 
 @acquire_spill_lock()
 def is_letter_multi(Column strings,
                     object ltype,
                     Column indices):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_indices = indices.view()
-    cdef letter_type c_ltype = <letter_type>(
-        <underlying_type_t_letter_type> ltype
+    return Column.from_pylibcudf(
+        nvtext.stemmer.is_letter(
+            strings.to_pylibcudf(mode="read"),
+            ltype==LetterType.VOWEL,
+            indices.to_pylibcudf(mode="read"),
+        )
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices))
-
-    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
index ee442ece5c6..5e0bfb74705 100644
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -5,35 +5,16 @@ from libc.stdint cimport uint32_t
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
-    hashed_vocabulary as cpp_hashed_vocabulary,
-    load_vocabulary_file as cpp_load_vocabulary_file,
-    move as tr_move,
-    subword_tokenize as cpp_subword_tokenize,
-    tokenizer_result as cpp_tokenizer_result,
-)
 
 from cudf._lib.column cimport Column
 
-
-cdef class Hashed_Vocabulary:
-    cdef unique_ptr[cpp_hashed_vocabulary] c_obj
-
-    def __cinit__(self, hash_file):
-        cdef string c_hash_file = <string>str(hash_file).encode()
-        with nogil:
-            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def subword_tokenize_inmem_hash(
     Column strings,
-    Hashed_Vocabulary hashed_vocabulary,
+    object hashed_vocabulary,
     uint32_t max_sequence_length=64,
     uint32_t stride=48,
     bool do_lower=True,
@@ -42,21 +23,16 @@ def subword_tokenize_inmem_hash(
     """
     Subword tokenizes text series by using the pre-loaded hashed vocabulary
     """
-    cdef column_view c_strings = strings.view()
-    cdef cpp_tokenizer_result c_result
-    with nogil:
-        c_result = tr_move(
-            cpp_subword_tokenize(
-                c_strings,
-                hashed_vocabulary.c_obj.get()[0],
-                max_sequence_length,
-                stride,
-                do_lower,
-                do_truncate,
-            )
-        )
+    result = nvtext.subword_tokenize.subword_tokenize(
+        strings.to_pylibcudf(mode="read"),
+        hashed_vocabulary,
+        max_sequence_length,
+        stride,
+        do_lower,
+        do_truncate,
+    )
     # return the 3 tensor components
-    tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids))
-    masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask))
-    metadata = Column.from_unique_ptr(move(c_result.tensor_metadata))
+    tokens = Column.from_pylibcudf(result[0])
+    masks = Column.from_pylibcudf(result[1])
+    metadata = Column.from_pylibcudf(result[2])
     return tokens, masks, metadata
diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
index a7e63f1e9ae..f473c48e2f7 100644
--- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -2,162 +2,85 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.tokenize cimport (
-    character_tokenize as cpp_character_tokenize,
-    count_tokens as cpp_count_tokens,
-    detokenize as cpp_detokenize,
-    load_vocabulary as cpp_load_vocabulary,
-    tokenize as cpp_tokenize,
-    tokenize_vocabulary as cpp_tokenize_vocabulary,
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from pylibcudf.libcudf.types cimport size_type
 
+from pylibcudf.nvtext.tokenize import TokenizeVocabulary  # no-cython-lint
+
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def _tokenize_scalar(Column strings, object py_delimiter):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_delimiter[0],
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_scalar(
+            strings.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def _tokenize_column(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiters = delimiters.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_delimiters
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_column(
+            strings.to_pylibcudf(mode="read"),
+            delimiters.to_pylibcudf(mode="read"),
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def _count_tokens_scalar(Column strings, object py_delimiter):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_delimiter[0]
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.count_tokens_scalar(
+            strings.to_pylibcudf(mode="read"),
+            py_delimiter.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def _count_tokens_column(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiters = delimiters.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_delimiters
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.count_tokens_column(
+            strings.to_pylibcudf(mode="read"),
+            delimiters.to_pylibcudf(mode="read")
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def character_tokenize(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_character_tokenize(c_strings)
+    return Column.from_pylibcudf(
+        nvtext.tokenize.character_tokenize(
+            strings.to_pylibcudf(mode="read")
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
 def detokenize(Column strings, Column indices, object py_separator):
-
-    cdef DeviceScalar separator = py_separator.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_indices = indices.view()
-    cdef const string_scalar* c_separator = <const string_scalar*>separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-    with nogil:
-        c_result = move(
-            cpp_detokenize(c_strings, c_indices, c_separator[0])
+    return Column.from_pylibcudf(
+        nvtext.tokenize.detokenize(
+            strings.to_pylibcudf(mode="read"),
+            indices.to_pylibcudf(mode="read"),
+            py_separator.device_value.c_value
         )
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-cdef class TokenizeVocabulary:
-    cdef unique_ptr[cpp_tokenize_vocabulary] c_obj
-
-    def __cinit__(self, Column vocab):
-        cdef column_view c_vocab = vocab.view()
-        with nogil:
-            self.c_obj = move(cpp_load_vocabulary(c_vocab))
+    )
 
 
 @acquire_spill_lock()
 def tokenize_with_vocabulary(Column strings,
-                             TokenizeVocabulary vocabulary,
+                             object vocabulary,
                              object py_delimiter,
                              size_type default_id):
-
-    cdef DeviceScalar delimiter = py_delimiter.device_value
-    cdef column_view c_strings = strings.view()
-    cdef const string_scalar* c_delimiter = <const string_scalar*>delimiter\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize_with_vocabulary(
-                c_strings,
-                vocabulary.c_obj.get()[0],
-                c_delimiter[0],
-                default_id
-            )
+    return Column.from_pylibcudf(
+        nvtext.tokenize.tokenize_with_vocabulary(
+            strings.to_pylibcudf(mode="read"),
+            vocabulary,
+            py_delimiter.device_value.c_value,
+            default_id
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx
index fffc8b7c3f6..462d5c903e8 100644
--- a/python/cudf/cudf/_lib/strings/replace_re.pyx
+++ b/python/cudf/cudf/_lib/strings/replace_re.pyx
@@ -1,26 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
+from pylibcudf.libcudf.types cimport size_type
+import pylibcudf as plc
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-from pylibcudf.libcudf.strings.replace_re cimport (
-    replace_re as cpp_replace_re,
-    replace_with_backrefs as cpp_replace_with_backrefs,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
 
 
 @acquire_spill_lock()
@@ -34,28 +19,16 @@ def replace_re(Column source_strings,
     `n` indicates the number of resplacements to be made from
     start. (-1 indicates all)
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef const string_scalar* scalar_repl = \
-        <const string_scalar*>(repl.get_raw_ptr())
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_replace_re(
-            source_view,
-            dereference(c_prog),
-            scalar_repl[0],
-            n
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.replace_re.replace_re(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        py_repl.device_value.c_value,
+        n
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -68,50 +41,29 @@ def replace_with_backrefs(
     new string with the extracted elements found using
     `pattern` regular expression in `source_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string pattern_string = <string>str(pattern).encode()
-    cdef string repl_string = <string>str(repl).encode()
-    cdef regex_flags c_flags = regex_flags.DEFAULT
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(pattern_string, c_flags))
-        c_result = move(cpp_replace_with_backrefs(
-            source_view,
-            dereference(c_prog),
-            repl_string
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.replace_re.replace_with_backrefs(
+        source_strings.to_pylibcudf(mode="read"),
+        plc.strings.regex_program.RegexProgram.create(
+            str(pattern),
+            plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        repl
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
 def replace_multi_re(Column source_strings,
-                     object patterns,
+                     list patterns,
                      Column repl_strings):
     """
     Returns a Column after replacing occurrences of multiple
     regular expressions `patterns` with their corresponding
     strings in `repl_strings` in `source_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view repl_view = repl_strings.view()
-
-    cdef int pattern_size = len(patterns)
-    cdef vector[string] patterns_vector
-    patterns_vector.reserve(pattern_size)
-
-    for pattern in patterns:
-        patterns_vector.push_back(str.encode(pattern))
-
-    with nogil:
-        c_result = move(cpp_replace_re(
-            source_view,
-            patterns_vector,
-            repl_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.strings.replace_re.replace_re(
+        source_strings.to_pylibcudf(mode="read"),
+        patterns,
+        repl_strings.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 40d0c9eac3a..1589e23f716 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -7,20 +7,11 @@ from cudf.core._internals.expressions import parse_expression
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.utils import cudautils
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-cimport pylibcudf.libcudf.transform as libcudf_transform
 from pylibcudf cimport transform as plc_transform
 from pylibcudf.expressions cimport Expression
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.expressions cimport expression
-from pylibcudf.libcudf.table.table_view cimport table_view
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport table_view_from_columns
 
 import pylibcudf as plc
 
@@ -121,13 +112,8 @@ def compute_column(list columns, tuple column_names, expr: str):
 
     # At the end, all the stack contains is the expression to evaluate.
     cdef Expression cudf_expr = visitor.expression
-    cdef table_view tbl = table_view_from_columns(columns)
-    cdef unique_ptr[column] col
-    with nogil:
-        col = move(
-            libcudf_transform.compute_column(
-                tbl,
-                <expression &> dereference(cudf_expr.c_obj.get())
-            )
-        )
-    return Column.from_unique_ptr(move(col))
+    result = plc_transform.compute_column(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        cudf_expr,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py
index 6e8ad556b08..3b13cc258ab 100644
--- a/python/cudf/cudf/_typing.py
+++ b/python/cudf/cudf/_typing.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 import sys
-from collections.abc import Callable
-from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union
+from collections.abc import Callable, Iterable
+from typing import TYPE_CHECKING, Any, TypeVar, Union
 
 import numpy as np
 from pandas import Period, Timedelta, Timestamp
@@ -42,7 +42,7 @@
 SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"]
 
 # Groupby aggregation
-AggType = Union[str, Callable]
-MultiColumnAggType = Union[
-    AggType, Iterable[AggType], Dict[Any, Iterable[AggType]]
+AggType = Union[str, Callable]  # noqa: UP007
+MultiColumnAggType = Union[  # noqa: UP007
+    AggType, Iterable[AggType], dict[Any, Iterable[AggType]]
 ]
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index caff019f575..ffa306bf93f 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -6,7 +6,7 @@
 import pickle
 import weakref
 from types import SimpleNamespace
-from typing import Any, Literal, Mapping
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy
 from typing_extensions import Self
@@ -18,6 +18,9 @@
 from cudf.core.abc import Serializable
 from cudf.utils.string import format_bytes
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
 
 def host_memory_allocation(nbytes: int) -> memoryview:
     """Allocate host memory using NumPy
diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
index 0bd8d6054b3..ecf9807cfc2 100644
--- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
+++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
@@ -2,13 +2,16 @@
 
 from __future__ import annotations
 
-from typing import Literal, Mapping
+from typing import TYPE_CHECKING, Literal
 
 from typing_extensions import Self
 
 import cudf
 from cudf.core.buffer.buffer import Buffer, BufferOwner
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
 
 class ExposureTrackedBuffer(Buffer):
     """An exposure tracked buffer.
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 6ca64a0a2be..8d38a5f2272 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -2,9 +2,10 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
+
 import cudf
 from cudf._lib.nvtext.byte_pair_encode import (
-    BPEMergePairs as cpp_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
 )
 
@@ -25,7 +26,9 @@ class BytePairEncoder:
     """
 
     def __init__(self, merges_pair: "cudf.Series"):
-        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
+        self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs(
+            merges_pair._column.to_pylibcudf(mode="read")
+        )
 
     def __call__(self, text, separator: str = " ") -> cudf.Series:
         """
diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
index 06791df7dc0..a1e87d04bc9 100644
--- a/python/cudf/cudf/core/column/__init__.py
+++ b/python/cudf/cudf/core/column/__init__.py
@@ -29,4 +29,3 @@
     Decimal128Column,
     DecimalBaseColumn,
 )
-from cudf.core.column.interval import IntervalColumn  # noqa: F401
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 864e87b5377..087d0ed65f5 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -4,7 +4,7 @@
 
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 import pandas as pd
@@ -26,6 +26,7 @@
 
 if TYPE_CHECKING:
     from collections import abc
+    from collections.abc import Mapping, Sequence
 
     import numba.cuda
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7674565e2c3..d2f9d208c77 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -4,10 +4,11 @@
 
 import pickle
 from collections import abc
+from collections.abc import MutableSequence, Sequence
 from functools import cached_property
 from itertools import chain
 from types import SimpleNamespace
-from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
 import numpy as np
@@ -579,8 +580,8 @@ def _wrap_binop_normalization(self, other):
         if cudf.utils.utils.is_na_like(other):
             return cudf.Scalar(other, dtype=self.dtype)
         if isinstance(other, np.ndarray) and other.ndim == 0:
-            # Try and maintain the dtype
-            other = other.dtype.type(other.item())
+            # Return numpy scalar
+            other = other[()]
         return self.normalize_binop_value(other)
 
     def _scatter_by_slice(
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 2c9b0baa9b6..b6dc250e64d 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -8,7 +8,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import TYPE_CHECKING, Literal, Sequence, cast
+from typing import TYPE_CHECKING, Literal, cast
 
 import numpy as np
 import pandas as pd
@@ -31,6 +31,8 @@
 from cudf.utils.utils import _all_bools_with_nulls
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from cudf._typing import (
         ColumnBinaryOperand,
         DatetimeLikeScalar,
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 8803ebd6791..8ae06f72d1e 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -3,8 +3,9 @@
 from __future__ import annotations
 
 import warnings
+from collections.abc import Sequence
 from decimal import Decimal
-from typing import TYPE_CHECKING, Sequence, cast
+from typing import TYPE_CHECKING, cast
 
 import cupy as cp
 import numpy as np
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index c6a39199e3b..6b25e568f00 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import TYPE_CHECKING, Sequence, cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -11,7 +11,6 @@
 from typing_extensions import Self
 
 import cudf
-from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import (
     concatenate_list_elements,
     concatenate_rows,
@@ -22,6 +21,7 @@
     extract_element_scalar,
     index_of_column,
     index_of_scalar,
+    segmented_gather,
     sort_lists,
 )
 from cudf._lib.strings.convert.convert_lists import format_list_column
@@ -34,6 +34,8 @@
 from cudf.core.missing import NA
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
     from cudf.core.buffer import Buffer
 
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 05a0ab2e09a..a91c080fe21 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -2,9 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Union, overload
-
-from typing_extensions import Literal
+from typing import Literal, Union, overload
 
 import cudf
 import cudf.core.column
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 78d2814ed26..620cae65374 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import functools
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 import pandas as pd
@@ -28,7 +28,7 @@
 from .numerical_base import NumericalBaseColumn
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Sequence
 
     from cudf._typing import (
         ColumnBinaryOperand,
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 45d1a8b087b..856ce0f75de 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5,7 +5,7 @@
 import re
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Sequence, cast, overload
+from typing import TYPE_CHECKING, cast, overload
 
 import numpy as np
 import pandas as pd
@@ -35,6 +35,8 @@ def str_to_boolean(column: StringColumn):
 
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     import cupy
     import numba.cuda
 
@@ -998,7 +1000,7 @@ def replace(
             return self._return_or_inplace(
                 libstrings.replace_multi_re(
                     self._column,
-                    pat,
+                    list(pat),
                     column.as_column(repl, dtype="str"),
                 )
                 if regex
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 6b6f3e517a8..087d6474e7f 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -4,7 +4,7 @@
 
 import datetime
 import functools
-from typing import TYPE_CHECKING, Sequence, cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -19,6 +19,8 @@
 from cudf.utils.utils import _all_bools_with_nulls
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype
 
 _unit_to_nanoseconds_conversion = {
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index bc093fdaa9a..496e86ed709 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -5,8 +5,9 @@
 import itertools
 import sys
 from collections import abc
+from collections.abc import Mapping
 from functools import cached_property, reduce
-from typing import TYPE_CHECKING, Any, Mapping, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 import pandas as pd
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7d4d34f5b04..bf1c39b23da 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -13,8 +13,8 @@
 import textwrap
 import warnings
 from collections import abc, defaultdict
-from collections.abc import Callable, Iterator
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
+from collections.abc import Callable, Iterator, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
 import numba
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 5250a741d3d..aa601a2b322 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -3,7 +3,7 @@
 
 import enum
 from collections import abc
-from typing import Any, Iterable, Mapping, Sequence, Tuple, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import cupy as cp
 import numpy as np
@@ -20,6 +20,9 @@
     build_column,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Mapping, Sequence
+
 # Implementation of interchange protocol classes
 # ----------------------------------------------
 
@@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum):
     _DtypeKind.BOOL,
     _DtypeKind.STRING,
 }
-ProtoDtype = Tuple[_DtypeKind, int, str, str]
+ProtoDtype = tuple[_DtypeKind, int, str, str]
 
 
 class _CuDFBuffer:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 37ad6b8fabb..205edd91d9d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import pickle
 import warnings
 from collections import abc
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal
 
 # TODO: The `numpy` import is needed for typing purposes during doc builds
 # only, need to figure out why the `np` alias is insufficient then remove.
@@ -36,6 +36,7 @@
 from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf
 
 if TYPE_CHECKING:
+    from collections.abc import MutableMapping
     from types import ModuleType
 
     from cudf._typing import Dtype, ScalarLike
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 81b20488d8d..e59b948aba9 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -8,7 +8,7 @@
 import warnings
 from collections import abc
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Iterable, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import cupy as cp
 import numpy as np
@@ -36,6 +36,8 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable
+
     from cudf._typing import (
         AggType,
         DataFrameOrSeries,
@@ -479,6 +481,14 @@ def get_group(self, name, obj=None):
                 "instead of ``gb.get_group(name, obj=df)``.",
                 FutureWarning,
             )
+        # Mirror pandas: only a length-1 list-like ``by`` requires ``name``
+        # to be a length-1 tuple. Multi-key groupbys keep their full tuple
+        # names, which the unguarded check rejected with KeyError.
+        if is_list_like(self._by) and len(self._by) == 1:
+            if isinstance(name, tuple) and len(name) == 1:
+                name = name[0]
+            else:
+                raise KeyError(name)
         return obj.iloc[self.indices[name]]
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index cd07c58c5d9..1b90e9f9df0 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -5,10 +5,10 @@
 import operator
 import pickle
 import warnings
-from collections.abc import Hashable
+from collections.abc import Hashable, MutableMapping
 from functools import cache, cached_property
 from numbers import Number
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
 import numpy as np
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 5952815deef..e031f2a4e8e 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -10,9 +10,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Literal,
-    MutableMapping,
     TypeVar,
     cast,
 )
@@ -63,6 +61,8 @@
 from cudf.utils.utils import _warn_no_dask_cudf
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, MutableMapping
+
     from cudf._typing import (
         ColumnLike,
         DataFrameOrSeries,
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index 8182e5cede2..ce6a5c960dd 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -3,9 +3,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any, List, Union
-
-from typing_extensions import TypeAlias
+from typing import Any, TypeAlias
 
 import cudf
 from cudf.api.types import _is_scalar_or_zero_d_array, is_integer
@@ -46,11 +44,11 @@ class ScalarIndexer:
     key: GatherMap
 
 
-IndexingSpec: TypeAlias = Union[
-    EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer
-]
+IndexingSpec: TypeAlias = (
+    EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer
+)
 
-ColumnLabels: TypeAlias = List[str]
+ColumnLabels: TypeAlias = list[str]
 
 
 def destructure_iloc_key(
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 92d094d9de5..bfff62f0a89 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -8,7 +8,7 @@
 import pickle
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, MutableMapping
+from typing import TYPE_CHECKING, Any
 
 import cupy as cp
 import numpy as np
@@ -36,7 +36,7 @@
 from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Hashable
+    from collections.abc import Generator, Hashable, MutableMapping
 
     from typing_extensions import Self
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 29ed18ac0ce..9b60424c924 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -9,7 +9,7 @@
 import warnings
 from collections import abc
 from shutil import get_terminal_size
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal
 
 import cupy
 import numpy as np
@@ -71,6 +71,8 @@
 from cudf.utils.performance_tracking import _performance_tracking
 
 if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
     import pyarrow as pa
 
     from cudf._typing import (
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index 9e59b134b73..dda1f199078 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -6,8 +6,9 @@
 
 import cupy as cp
 
+import pylibcudf as plc
+
 from cudf._lib.nvtext.subword_tokenize import (
-    Hashed_Vocabulary as cpp_hashed_vocabulary,
     subword_tokenize_inmem_hash as cpp_subword_tokenize,
 )
 
@@ -50,7 +51,9 @@ class SubwordTokenizer:
 
     def __init__(self, hash_file: str, do_lower_case: bool = True):
         self.do_lower_case = do_lower_case
-        self.vocab_file = cpp_hashed_vocabulary(hash_file)
+        self.vocab_file = plc.nvtext.subword_tokenize.HashedVocabulary(
+            hash_file
+        )
 
     def __call__(
         self,
diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
index 99d85c0c5c0..1e31376cce8 100644
--- a/python/cudf/cudf/core/tokenize_vocabulary.py
+++ b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -2,9 +2,10 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
+
 import cudf
 from cudf._lib.nvtext.tokenize import (
-    TokenizeVocabulary as cpp_tokenize_vocabulary,
     tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
 )
 
@@ -20,7 +21,9 @@ class TokenizeVocabulary:
     """
 
     def __init__(self, vocabulary: "cudf.Series"):
-        self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column)
+        self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
+            vocabulary._column.to_pylibcudf(mode="read")
+        )
 
     def tokenize(
         self, text, delimiter: str = "", default_id: int = -1
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 68f34fa28ff..885e7b16644 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -4,7 +4,7 @@
 import math
 import re
 import warnings
-from typing import Literal, Sequence
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
@@ -20,6 +20,9 @@
 from cudf.core import column
 from cudf.core.index import ensure_index
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
 # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
 _unit_map = {
     "year": "year",
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index c364d55e677..73afde407db 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -10,9 +10,9 @@
 import pickle
 import types
 import warnings
-from collections.abc import Callable, Iterator
+from collections.abc import Callable, Iterator, Mapping
 from enum import IntEnum
-from typing import Any, Literal, Mapping
+from typing import Any, Literal
 
 import numpy as np
 
diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py
index f82e300e83d..38103a71908 100644
--- a/python/cudf/cudf/pandas/module_accelerator.py
+++ b/python/cudf/cudf/pandas/module_accelerator.py
@@ -17,7 +17,7 @@
 from abc import abstractmethod
 from importlib._bootstrap import _ImportLockContext as ImportLock
 from types import ModuleType
-from typing import Any, ContextManager, NamedTuple
+from typing import Any, ContextManager, NamedTuple  # noqa: UP035
 
 from typing_extensions import Self
 
diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
index 8870fbc5c28..bb2fc00d9fc 100644
--- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
+++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
@@ -9,6 +9,7 @@
     python analyze-test-failures.py <path-to-test-log> <file-or-pattern>
 
 Example:
+-------
     python analyze-test-failures.py log.json frame/*
 """
 
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index d12d2697729..59966a5ff0c 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs):
 
 @pytest.fixture(scope="session", autouse=True)  # type: ignore
 def patch_testing_functions():
-    tm.assert_produces_warning = null_assert_warnings
+    tm.assert_produces_warning = null_assert_warnings  # noqa: F821
     pytest.raises = replace_kwargs({"match": None})(pytest.raises)
 
 
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index 4ea0b3b4413..a0ad872e4c7 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -5,7 +5,8 @@
 """
 Summarizes the test results per module.
 
-Examples:
+Examples
+--------
     python summarize-test-results.py log.json
     python summarize-test-results.py log.json --output json
     python summarize-test-results.py log.json --output table
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 949fa909b5b..71b6bbd688d 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -3431,3 +3431,15 @@ def test_binop_eq_ne_index_series(data1, data2):
     expected = gi.to_pandas() != gs.to_pandas()
 
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("scalar", [np.datetime64, np.timedelta64])
+def test_binop_lhs_numpy_datetimelike_scalar(scalar):
+    """A NumPy datetime-like scalar on the left of ``<`` matches pandas."""
+    millis = scalar(1, "ms")
+    nanos = scalar(1, "ns")
+    # Exercise both orderings so each resolution appears on either side.
+    for lhs, rhs in ((millis, nanos), (nanos, millis)):
+        got = lhs < cudf.Series([rhs])
+        want = lhs < pd.Series([rhs])
+        assert_eq(got, want)
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 6b222841622..e4422e204bc 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -4059,3 +4059,18 @@ def test_ndim():
     pgb = pser.groupby([0, 0, 1])
     ggb = gser.groupby(cudf.Series([0, 0, 1]))
     assert pgb.ndim == ggb.ndim
+
+
+@pytest.mark.skipif(
+    not PANDAS_GE_220, reason="pandas behavior applicable in >=2.2"
+)
+def test_get_group_list_like():
+    """``get_group`` on a length-1 list-like ``by`` takes a 1-tuple name."""
+    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    got = gdf.groupby(["a"]).get_group((1,))
+    want = gdf.to_pandas().groupby(["a"]).get_group((1,))
+    assert_eq(got, want)
+
+    # Anything other than a length-1 tuple is rejected as a missing key.
+    for bad_name in ((1, 2), [1]):
+        with pytest.raises(KeyError):
+            gdf.groupby(["a"]).get_group(bad_name)
diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
index a75a20a4681..63fd9601fc1 100644
--- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
+++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
@@ -387,7 +387,8 @@ def test_dir_bound_method(
 ):
     """This test will fail because dir for bound methods is currently
     incorrect, but we have no way to fix it without materializing the slow
-    type, which is unnecessarily expensive."""
+    type, which is unnecessarily expensive.
+    """
     Fast, FastIntermediate = fast_and_intermediate_with_doc
     Slow, SlowIntermediate = slow_and_intermediate_with_doc
 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index feab04ffadc..b6105c17b3e 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -53,6 +53,7 @@ test = [
     "cramjam",
     "fastavro>=0.22.9",
     "hypothesis",
+    "mmh3",
     "msgpack",
     "pytest-benchmark",
     "pytest-cases>=3.8.2",
@@ -63,6 +64,7 @@ test = [
     "tokenizers==0.15.2",
     "transformers==4.39.3",
     "tzdata",
+    "xxhash",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [
     "ipython",
@@ -81,50 +83,6 @@ cudf-pandas-tests = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "pylibcudf"
-]
-known_first_party = [
-    "cudf",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
-
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
@@ -174,3 +132,18 @@ wheel.packages = ["cudf"]
 provider = "scikit_build_core.metadata.regex"
 input = "cudf/VERSION"
 regex = "(?P<value>.*)"
+
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "pylibcudf"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 87e19a2bccf..667cd7b1db8 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -32,51 +32,20 @@ test = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-    "streamz",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-    "dask_cudf",
-]
-known_first_party = [
-    "cudf_kafka",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf_kafka"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda", "streamz"]
+rapids = ["rmm", "cudf", "dask_cudf"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
 
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index 2c195f6637c..08bc9d0ea3f 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -9,10 +9,11 @@
 from typing import TYPE_CHECKING, cast
 
 import pyarrow as pa
-import pylibcudf as plc
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.utils import dtypes
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
index b8b18ec5039..2af9fdaacc5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
@@ -10,6 +10,7 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -30,14 +31,13 @@
 
 
 class Agg(Expr):
-    __slots__ = ("name", "options", "op", "request", "children")
+    __slots__ = ("name", "options", "op", "request")
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr, ...]
 
     def __init__(
         self, dtype: plc.DataType, name: str, options: Any, *children: Expr
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.name = name
         self.options = options
         self.children = children
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
index 8d021b0231d..effe8cb2378 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
@@ -13,9 +13,10 @@
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
+from cudf_polars.dsl.nodebase import Node
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping, Sequence
+    from collections.abc import Mapping
 
     from cudf_polars.containers import Column, DataFrame
 
@@ -32,100 +33,16 @@ class ExecutionContext(IntEnum):
     ROLLING = enum.auto()
 
 
-class Expr:
-    """
-    An abstract expression object.
+class Expr(Node["Expr"]):
+    """An abstract expression object."""
 
-    This contains a (potentially empty) tuple of child expressions,
-    along with non-child data. For uniform reconstruction and
-    implementation of hashing and equality schemes, child classes need
-    to provide a certain amount of metadata when they are defined.
-    Specifically, the ``_non_child`` attribute must list, in-order,
-    the names of the slots that are passed to the constructor. The
-    constructor must take arguments in the order ``(*_non_child,
-    *children).``
-    """
-
-    __slots__ = ("dtype", "_hash_value", "_repr_value")
+    __slots__ = ("dtype",)
     dtype: plc.DataType
     """Data type of the expression."""
-    _hash_value: int
-    """Caching slot for the hash of the expression."""
-    _repr_value: str
-    """Caching slot for repr of the expression."""
-    children: tuple[Expr, ...] = ()
-    """Children of the expression."""
+    # This annotation is needed because of https://github.com/python/mypy/issues/17981
     _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
     """Names of non-child data (not Exprs) for reconstruction."""
 
-    # Constructor must take arguments in order (*_non_child, *children)
-    def __init__(self, dtype: plc.DataType) -> None:
-        self.dtype = dtype
-
-    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
-        return (*(getattr(self, attr) for attr in self._non_child), *children)
-
-    def get_hash(self) -> int:
-        """
-        Return the hash of this expr.
-
-        Override this in subclasses, rather than __hash__.
-
-        Returns
-        -------
-        The integer hash value.
-        """
-        return hash((type(self), self._ctor_arguments(self.children)))
-
-    def __hash__(self) -> int:
-        """Hash of an expression with caching."""
-        try:
-            return self._hash_value
-        except AttributeError:
-            self._hash_value = self.get_hash()
-            return self._hash_value
-
-    def is_equal(self, other: Any) -> bool:
-        """
-        Equality of two expressions.
-
-        Override this in subclasses, rather than __eq__.
-
-        Parameter
-        ---------
-        other
-            object to compare to
-
-        Returns
-        -------
-        True if the two expressions are equal, false otherwise.
-        """
-        if type(self) is not type(other):
-            return False  # pragma: no cover; __eq__ trips first
-        return self._ctor_arguments(self.children) == other._ctor_arguments(
-            other.children
-        )
-
-    def __eq__(self, other: Any) -> bool:
-        """Equality of expressions."""
-        if type(self) is not type(other) or hash(self) != hash(other):
-            return False
-        else:
-            return self.is_equal(other)
-
-    def __ne__(self, other: Any) -> bool:
-        """Inequality of expressions."""
-        return not self.__eq__(other)
-
-    def __repr__(self) -> str:
-        """String representation of an expression with caching."""
-        try:
-            return self._repr_value
-        except AttributeError:
-            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
-            self._repr_value = f"{type(self).__name__}({args})"
-            return self._repr_value
-
     def do_evaluate(
         self,
         df: DataFrame,
@@ -311,11 +228,11 @@ class Col(Expr):
     __slots__ = ("name",)
     _non_child = ("dtype", "name")
     name: str
-    children: tuple[()]
 
     def __init__(self, dtype: plc.DataType, name: str) -> None:
         self.dtype = dtype
         self.name = name
+        self.children = ()
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
index 19baae3611d..245bdbefe88 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
@@ -8,10 +8,10 @@
 
 from typing import TYPE_CHECKING, ClassVar
 
-import pylibcudf as plc
-
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
 
@@ -24,9 +24,8 @@
 
 
 class BinOp(Expr):
-    __slots__ = ("op", "children")
+    __slots__ = ("op",)
     _non_child = ("dtype", "op")
-    children: tuple[Expr, Expr]
 
     def __init__(
         self,
@@ -35,7 +34,7 @@ def __init__(
         left: Expr,
         right: Expr,
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         if plc.traits.is_boolean(self.dtype):
             # For boolean output types, bitand and bitor implement
             # boolean logic, so translate. bitxor also does, but the
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
index ff9973a47d5..8db8172ebd1 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
@@ -10,10 +10,11 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
-import pylibcudf as plc
 
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import (
     ExecutionContext,
@@ -31,9 +32,8 @@
 
 
 class BooleanFunction(Expr):
-    __slots__ = ("name", "options", "children")
+    __slots__ = ("name", "options")
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr, ...]
 
     def __init__(
         self,
@@ -42,7 +42,7 @@ def __init__(
         options: tuple[Any, ...],
         *children: Expr,
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.name = name
         self.children = children
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
index f752a23b628..65fa4bfa62f 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -9,10 +9,11 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
-import pylibcudf as plc
 
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
@@ -25,7 +26,7 @@
 
 
 class TemporalFunction(Expr):
-    __slots__ = ("name", "options", "children")
+    __slots__ = ("name", "options")
     _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = {
         pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
         pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
@@ -39,7 +40,6 @@ class TemporalFunction(Expr):
         pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
     }
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr, ...]
 
     def __init__(
         self,
@@ -48,7 +48,7 @@ def __init__(
         options: tuple[Any, ...],
         *children: Expr,
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.name = name
         self.children = children
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
index 562a2255033..c16313bf83c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -9,6 +9,7 @@
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -16,7 +17,7 @@
 from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from collections.abc import Hashable, Mapping
 
     import pyarrow as pa
 
@@ -31,12 +32,12 @@ class Literal(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
     value: pa.Scalar[Any]
-    children: tuple[()]
 
     def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
+        self.children = ()
 
     def do_evaluate(
         self,
@@ -58,19 +59,19 @@ class LiteralColumn(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
     value: pa.Array[Any, Any]
-    children: tuple[()]
 
     def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+        self.children = ()
 
-    def get_hash(self) -> int:
+    def get_hashable(self) -> Hashable:
-        """Compute a hash of the column."""
+        """Return a hashable key for the column."""
         # This is stricter than necessary, but we only need this hash
         # for identity in groupby replacements so it's OK. And this
         # way we avoid doing potentially expensive compute.
-        return hash((type(self), self.dtype, id(self.value)))
+        return (type(self), self.dtype, id(self.value))
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
index f7dcc3c542c..fa68bcb9426 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
@@ -17,24 +17,22 @@
 
 
 class RollingWindow(Expr):
-    __slots__ = ("options", "children")
+    __slots__ = ("options",)
     _non_child = ("dtype", "options")
-    children: tuple[Expr]
 
     def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.children = (agg,)
         raise NotImplementedError("Rolling window not implemented")
 
 
 class GroupedRollingWindow(Expr):
-    __slots__ = ("options", "children")
+    __slots__ = ("options",)
     _non_child = ("dtype", "options")
-    children: tuple[Expr, ...]
 
     def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.children = (agg, *by)
         raise NotImplementedError("Grouped rolling window not implemented")
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
index a7a3e68a28c..77d7d4c0d22 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
@@ -9,6 +9,7 @@
 from typing import TYPE_CHECKING
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -23,12 +24,11 @@
 
 
 class Gather(Expr):
-    __slots__ = ("children",)
+    __slots__ = ()
     _non_child = ("dtype",)
-    children: tuple[Expr, Expr]
 
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.children = (values, indices)
 
     def do_evaluate(
@@ -65,12 +65,11 @@ def do_evaluate(
 
 
 class Filter(Expr):
-    __slots__ = ("children",)
+    __slots__ = ()
     _non_child = ("dtype",)
-    children: tuple[Expr, Expr]
 
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
-        super().__init__(dtype)
+        self.dtype = dtype
         self.children = (values, indices)
 
     def do_evaluate(
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
index 861b73ce6a0..99512e2ef52 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
@@ -23,14 +23,13 @@
 
 
 class Sort(Expr):
-    __slots__ = ("options", "children")
+    __slots__ = ("options",)
     _non_child = ("dtype", "options")
-    children: tuple[Expr]
 
     def __init__(
         self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.children = (column,)
 
@@ -59,9 +58,8 @@ def do_evaluate(
 
 
 class SortBy(Expr):
-    __slots__ = ("options", "children")
+    __slots__ = ("options",)
     _non_child = ("dtype", "options")
-    children: tuple[Expr, ...]
 
     def __init__(
         self,
@@ -70,7 +68,7 @@ def __init__(
         column: Expr,
         *by: Expr,
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.children = (column, *by)
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
index 6669669aadc..8b66c9d4676 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
@@ -10,11 +10,12 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 
 from polars.exceptions import InvalidOperationError
 from polars.polars import _expr_nodes as pl_expr
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
@@ -28,9 +29,8 @@
 
 
 class StringFunction(Expr):
-    __slots__ = ("name", "options", "children", "_regex_program")
+    __slots__ = ("name", "options", "_regex_program")
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr, ...]
 
     def __init__(
         self,
@@ -39,7 +39,7 @@ def __init__(
         options: tuple[Any, ...],
         *children: Expr,
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.options = options
         self.name = name
         self.children = children
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
index c7d7a802ded..d2b5d6bae29 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
@@ -26,14 +26,13 @@
 
 
 class Ternary(Expr):
-    __slots__ = ("children",)
+    __slots__ = ()
     _non_child = ("dtype",)
-    children: tuple[Expr, Expr, Expr]
 
     def __init__(
         self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.children = (when, then, otherwise)
 
     def do_evaluate(
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
index 3d4d15be1ce..6f22544c050 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
@@ -8,6 +8,7 @@
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
+
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -26,12 +27,11 @@
 class Cast(Expr):
     """Class representing a cast of an expression."""
 
-    __slots__ = ("children",)
+    __slots__ = ()
     _non_child = ("dtype",)
-    children: tuple[Expr]
 
     def __init__(self, dtype: plc.DataType, value: Expr) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.children = (value,)
         if not dtypes.can_cast(value.dtype, self.dtype):
             raise NotImplementedError(
@@ -60,7 +60,9 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class Len(Expr):
     """Class representing the length of an expression."""
 
-    children: tuple[()]
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+        self.children = ()
 
     def do_evaluate(
         self,
@@ -90,9 +92,8 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class UnaryFunction(Expr):
     """Class representing unary functions of an expression."""
 
-    __slots__ = ("name", "options", "children")
+    __slots__ = ("name", "options")
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr, ...]
 
     # Note: log, and pow are handled via translation to binops
     _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = {
@@ -142,7 +143,7 @@ class UnaryFunction(Expr):
     def __init__(
         self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr
     ) -> None:
-        super().__init__(dtype)
+        self.dtype = dtype
         self.name = name
         self.options = options
         self.children = children
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index e319c363a23..04aa74024cd 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -13,24 +13,27 @@
 
 from __future__ import annotations
 
-import dataclasses
 import itertools
+import json
 from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
-import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 
+import pylibcudf as plc
+
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import Column, DataFrame
-from cudf_polars.utils import dtypes, sorting
+from cudf_polars.dsl.nodebase import Node
+from cudf_polars.dsl.to_ast import to_parquet_filter
+from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, MutableMapping
+    from collections.abc import Callable, Hashable, MutableMapping, Sequence
     from typing import Literal
 
     from cudf_polars.typing import Schema
@@ -121,16 +124,27 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column
     ]
 
 
-@dataclasses.dataclass
-class IR:
+class IR(Node["IR"]):
     """Abstract plan node, representing an unevaluated dataframe."""
 
+    __slots__ = ("schema",)
+    # This annotation is needed because of https://github.com/python/mypy/issues/17981
+    _non_child: ClassVar[tuple[str, ...]] = ("schema",)
     schema: Schema
     """Mapping from column names to their data types."""
 
-    def __post_init__(self):
-        """Validate preconditions."""
-        pass  # noqa: PIE790
+    def get_hashable(self) -> Hashable:
+        """
+        Hashable representation of the node, special-casing the schema dictionary.
+
+        Since the schema is a dictionary, even though it is morally
+        immutable, it is not hashable. We therefore convert it to
+        tuples for hashing purposes.
+        """
+        # Schema is the first constructor argument: drop it here, hash it separately
+        args = self._ctor_arguments(self.children)[1:]
+        schema_hash = tuple(self.schema.items())
+        return (type(self), schema_hash, args)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """
@@ -159,24 +173,50 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         )  # pragma: no cover
 
 
-@dataclasses.dataclass
 class PythonScan(IR):
     """Representation of input from a python function."""
 
+    __slots__ = ("options", "predicate")
+    _non_child = ("schema", "options", "predicate")
     options: Any
     """Arbitrary options."""
     predicate: expr.NamedExpr | None
     """Filter to apply to the constructed dataframe before returning it."""
 
-    def __post_init__(self):
-        """Validate preconditions."""
+    def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None):
+        self.schema = schema
+        self.options = options
+        self.predicate = predicate
+        self.children = ()
         raise NotImplementedError("PythonScan not implemented")
 
 
-@dataclasses.dataclass
 class Scan(IR):
     """Input from files."""
 
+    __slots__ = (
+        "typ",
+        "reader_options",
+        "cloud_options",
+        "paths",
+        "with_columns",
+        "skip_rows",
+        "n_rows",
+        "row_index",
+        "predicate",
+    )
+    _non_child = (
+        "schema",
+        "typ",
+        "reader_options",
+        "cloud_options",
+        "paths",
+        "with_columns",
+        "skip_rows",
+        "n_rows",
+        "row_index",
+        "predicate",
+    )
     typ: str
     """What type of file are we reading? Parquet, CSV, etc..."""
     reader_options: dict[str, Any]
@@ -185,7 +225,7 @@ class Scan(IR):
     """Cloud-related authentication options, currently ignored."""
     paths: list[str]
     """List of paths to read from."""
-    with_columns: list[str]
+    with_columns: list[str] | None
     """Projected columns to return."""
     skip_rows: int
     """Rows to skip at the start when reading."""
@@ -196,9 +236,30 @@ class Scan(IR):
     predicate: expr.NamedExpr | None
     """Mask to apply to the read dataframe."""
 
-    def __post_init__(self) -> None:
-        """Validate preconditions."""
-        super().__post_init__()
+    def __init__(
+        self,
+        schema: Schema,
+        typ: str,
+        reader_options: dict[str, Any],
+        cloud_options: dict[str, Any] | None,
+        paths: list[str],
+        with_columns: list[str] | None,
+        skip_rows: int,
+        n_rows: int,
+        row_index: tuple[str, int] | None,
+        predicate: expr.NamedExpr | None,
+    ):
+        self.schema = schema
+        self.typ = typ
+        self.reader_options = reader_options
+        self.cloud_options = cloud_options
+        self.paths = paths
+        self.with_columns = with_columns
+        self.skip_rows = skip_rows
+        self.n_rows = n_rows
+        self.row_index = row_index
+        self.predicate = predicate
+        self.children = ()
         if self.typ not in ("csv", "parquet", "ndjson"):  # pragma: no cover
             # This line is unhittable ATM since IPC/Anonymous scan raise
             # on the polars side
@@ -258,6 +319,28 @@ def __post_init__(self) -> None:
                 "Reading only parquet metadata to produce row index."
             )
 
+    def get_hashable(self) -> Hashable:
+        """
+        Hashable representation of the node.
+
+        The options dictionaries are serialised for hashing purposes
+        as json strings.
+        """
+        schema_hash = tuple(self.schema.items())
+        return (
+            type(self),
+            schema_hash,
+            self.typ,
+            json.dumps(self.reader_options),
+            json.dumps(self.cloud_options),
+            tuple(self.paths),
+            tuple(self.with_columns) if self.with_columns is not None else None,
+            self.skip_rows,
+            self.n_rows,
+            self.row_index,
+            self.predicate,
+        )
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         with_columns = self.with_columns
@@ -337,9 +420,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 colnames[0],
             )
         elif self.typ == "parquet":
+            filters = None
+            if self.predicate is not None and self.row_index is None:
+                # Can't apply filters during read if we have a row index.
+                filters = to_parquet_filter(self.predicate.value)
             tbl_w_meta = plc.io.parquet.read_parquet(
                 plc.io.SourceInfo(self.paths),
                 columns=with_columns,
+                filters=filters,
                 nrows=n_rows,
                 skip_rows=self.skip_rows,
             )
@@ -348,6 +436,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 # TODO: consider nested column names?
                 tbl_w_meta.column_names(include_children=False),
             )
+            if filters is not None:
+                # The predicate was already applied as a filter during the read.
+                return df
         elif self.typ == "ndjson":
             json_schema: list[tuple[str, str, list]] = [
                 (name, typ, []) for name, typ in self.schema.items()
@@ -401,7 +492,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return df.filter(mask)
 
 
-@dataclasses.dataclass
 class Cache(IR):
     """
     Return a cached plan node.
@@ -409,20 +499,25 @@ class Cache(IR):
     Used for CSE at the plan level.
     """
 
+    __slots__ = ("key",)
+    _non_child = ("schema", "key")
     key: int
     """The cache key."""
-    value: IR
-    """The unevaluated node to cache."""
+
+    def __init__(self, schema: Schema, key: int, value: IR):
+        self.schema = schema
+        self.key = key
+        self.children = (value,)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         try:
             return cache[self.key]
         except KeyError:
-            return cache.setdefault(self.key, self.value.evaluate(cache=cache))
+            (value,) = self.children
+            return cache.setdefault(self.key, value.evaluate(cache=cache))
 
 
-@dataclasses.dataclass
 class DataFrameScan(IR):
     """
     Input from an existing polars DataFrame.
@@ -430,13 +525,38 @@ class DataFrameScan(IR):
     This typically arises from ``q.collect().lazy()``
     """
 
+    __slots__ = ("df", "projection", "predicate")
+    _non_child = ("schema", "df", "projection", "predicate")
     df: Any
     """Polars LazyFrame object."""
-    projection: list[str]
+    projection: tuple[str, ...] | None
     """List of columns to project out."""
     predicate: expr.NamedExpr | None
     """Mask to apply."""
 
+    def __init__(
+        self,
+        schema: Schema,
+        df: Any,
+        projection: Sequence[str] | None,
+        predicate: expr.NamedExpr | None,
+    ):
+        self.schema = schema
+        self.df = df
+        self.projection = tuple(projection) if projection is not None else None
+        self.predicate = predicate
+        self.children = ()
+
+    def get_hashable(self) -> Hashable:
+        """
+        Hashable representation of the node.
+
+        The (heavy) dataframe object is hashed by its id, so the result is
+        not stable across runs, nor shared by distinct but equal dataframes.
+        """
+        schema_hash = tuple(self.schema.items())
+        return (type(self), schema_hash, id(self.df), self.projection, self.predicate)
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         pdf = pl.DataFrame._from_pydf(self.df)
@@ -454,28 +574,39 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return df
 
 
-@dataclasses.dataclass
 class Select(IR):
     """Produce a new dataframe selecting given expressions from an input."""
 
-    df: IR
-    """Input dataframe."""
-    expr: list[expr.NamedExpr]
+    __slots__ = ("exprs", "should_broadcast")
+    _non_child = ("schema", "exprs", "should_broadcast")
+    exprs: tuple[expr.NamedExpr, ...]
     """List of expressions to evaluate to form the new dataframe."""
     should_broadcast: bool
     """Should columns be broadcast?"""
 
+    def __init__(
+        self,
+        schema: Schema,
+        exprs: Sequence[expr.NamedExpr],
+        should_broadcast: bool,  # noqa: FBT001
+        df: IR,
+    ):
+        self.schema = schema
+        self.exprs = tuple(exprs)
+        self.should_broadcast = should_broadcast
+        self.children = (df,)
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         # Handle any broadcasting
-        columns = [e.evaluate(df) for e in self.expr]
+        columns = [e.evaluate(df) for e in self.exprs]
         if self.should_broadcast:
             columns = broadcast(*columns)
         return DataFrame(columns)
 
 
-@dataclasses.dataclass
 class Reduce(IR):
     """
     Produce a new dataframe selecting given expressions from an input.
@@ -483,36 +614,73 @@ class Reduce(IR):
     This is a special case of :class:`Select` where all outputs are a single row.
     """
 
-    df: IR
-    """Input dataframe."""
-    expr: list[expr.NamedExpr]
+    __slots__ = ("exprs",)
+    _non_child = ("schema", "exprs")
+    exprs: tuple[expr.NamedExpr, ...]
     """List of expressions to evaluate to form the new dataframe."""
 
+    def __init__(
+        self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR
+    ):  # pragma: no cover; polars doesn't emit this node yet
+        self.schema = schema
+        self.exprs = tuple(exprs)
+        self.children = (df,)
+
     def evaluate(
         self, *, cache: MutableMapping[int, DataFrame]
     ) -> DataFrame:  # pragma: no cover; polars doesn't emit this node yet
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
-        columns = broadcast(*(e.evaluate(df) for e in self.expr))
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
+        columns = broadcast(*(e.evaluate(df) for e in self.exprs))
         assert all(column.obj.size() == 1 for column in columns)
         return DataFrame(columns)
 
 
-@dataclasses.dataclass
 class GroupBy(IR):
     """Perform a groupby."""
 
-    df: IR
-    """Input dataframe."""
-    agg_requests: list[expr.NamedExpr]
-    """List of expressions to evaluate groupwise."""
-    keys: list[expr.NamedExpr]
-    """List of expressions forming the keys."""
+    __slots__ = (
+        "agg_requests",
+        "keys",
+        "maintain_order",
+        "options",
+        "agg_infos",
+    )
+    _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options")
+    keys: tuple[expr.NamedExpr, ...]
+    """Grouping keys."""
+    agg_requests: tuple[expr.NamedExpr, ...]
+    """Aggregation expressions."""
     maintain_order: bool
-    """Should the order of the input dataframe be maintained?"""
+    """Preserve order in groupby."""
     options: Any
-    """Options controlling style of groupby."""
-    agg_infos: list[expr.AggInfo] = dataclasses.field(init=False)
+    """Arbitrary options."""
+
+    def __init__(
+        self,
+        schema: Schema,
+        keys: Sequence[expr.NamedExpr],
+        agg_requests: Sequence[expr.NamedExpr],
+        maintain_order: bool,  # noqa: FBT001
+        options: Any,
+        df: IR,
+    ):
+        self.schema = schema
+        self.keys = tuple(keys)
+        self.agg_requests = tuple(agg_requests)
+        self.maintain_order = maintain_order
+        self.options = options
+        self.children = (df,)
+        if self.options.rolling:
+            raise NotImplementedError(
+                "rolling window/groupby"
+            )  # pragma: no cover; rollingwindow constructor has already raised
+        if self.options.dynamic:
+            raise NotImplementedError("dynamic group by")
+        if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
+            raise NotImplementedError("Nested aggregations in groupby")
+        self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
 
     @staticmethod
     def check_agg(agg: expr.Expr) -> int:
@@ -542,22 +710,10 @@ def check_agg(agg: expr.Expr) -> int:
         else:
             raise NotImplementedError(f"No handler for {agg=}")
 
-    def __post_init__(self) -> None:
-        """Check whether all the aggregations are implemented."""
-        super().__post_init__()
-        if self.options.rolling:
-            raise NotImplementedError(
-                "rolling window/groupby"
-            )  # pragma: no cover; rollingwindow constructor has already raised
-        if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
-            raise NotImplementedError("Nested aggregations in groupby")
-        self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
-        if len(self.keys) == 0:
-            raise NotImplementedError("dynamic groupby")
-
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         keys = broadcast(
             *(k.evaluate(df) for k in self.keys), target_length=df.num_rows
         )
@@ -646,23 +802,20 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(broadcasted).slice(self.options.slice)
 
 
-@dataclasses.dataclass
 class Join(IR):
     """A join of two dataframes."""
 
-    left: IR
-    """Left frame."""
-    right: IR
-    """Right frame."""
-    left_on: list[expr.NamedExpr]
+    __slots__ = ("left_on", "right_on", "options")
+    _non_child = ("schema", "left_on", "right_on", "options")
+    left_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the left frame."""
-    right_on: list[expr.NamedExpr]
+    right_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the right frame."""
     options: tuple[
-        Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"],
+        Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
         bool,
         tuple[int, int] | None,
-        str | None,
+        str,
         bool,
     ]
     """
@@ -674,9 +827,20 @@ class Join(IR):
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     """
 
-    def __post_init__(self) -> None:
-        """Validate preconditions."""
-        super().__post_init__()
+    def __init__(
+        self,
+        schema: Schema,
+        left_on: Sequence[expr.NamedExpr],
+        right_on: Sequence[expr.NamedExpr],
+        options: Any,
+        left: IR,
+        right: IR,
+    ):
+        self.schema = schema
+        self.left_on = tuple(left_on)
+        self.right_on = tuple(right_on)
+        self.options = options
+        self.children = (left, right)
         if any(
             isinstance(e.value, expr.Literal)
             for e in itertools.chain(self.left_on, self.right_on)
@@ -686,7 +850,7 @@ def __post_init__(self) -> None:
     @staticmethod
     @cache
     def _joiners(
-        how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"],
+        how: Literal["inner", "left", "right", "full", "semi", "anti"],
     ) -> tuple[
         Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None
     ]:
@@ -708,13 +872,13 @@ def _joiners(
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
             )
-        elif how == "leftsemi":
+        elif how == "semi":
             return (
                 plc.join.left_semi_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 None,
             )
-        elif how == "leftanti":
+        elif how == "anti":
             return (
                 plc.join.left_anti_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
@@ -777,10 +941,8 @@ def _reorder_maps(
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        left = self.left.evaluate(cache=cache)
-        right = self.right.evaluate(cache=cache)
+        left, right = (c.evaluate(cache=cache) for c in self.children)
         how, join_nulls, zlice, suffix, coalesce = self.options
-        suffix = "_right" if suffix is None else suffix
         if how == "cross":
             # Separate implementation, since cross_join returns the
             # result, not the gather maps
@@ -802,7 +964,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                     columns[left.num_columns :], right.column_names, strict=True
                 )
             ]
-            return DataFrame([*left_cols, *right_cols])
+            return DataFrame([*left_cols, *right_cols]).slice(zlice)
         # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
         left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on)))
         right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on)))
@@ -866,20 +1028,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return result.slice(zlice)
 
 
-@dataclasses.dataclass
 class HStack(IR):
     """Add new columns to a dataframe."""
 
-    df: IR
-    """Input dataframe."""
-    columns: list[expr.NamedExpr]
-    """List of expressions to produce new columns."""
+    __slots__ = ("columns", "should_broadcast")
+    _non_child = ("schema", "columns", "should_broadcast")
     should_broadcast: bool
-    """Should columns be broadcast?"""
+    """Should the resulting evaluated columns be broadcast to the same length?"""
+
+    def __init__(
+        self,
+        schema: Schema,
+        columns: Sequence[expr.NamedExpr],
+        should_broadcast: bool,  # noqa: FBT001
+        df: IR,
+    ):
+        self.schema = schema
+        self.columns = tuple(columns)
+        self.should_broadcast = should_broadcast
+        self.children = (df,)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         columns = [c.evaluate(df) for c in self.columns]
         if self.should_broadcast:
             columns = broadcast(*columns, target_length=df.num_rows)
@@ -895,20 +1067,36 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.with_columns(columns)
 
 
-@dataclasses.dataclass
 class Distinct(IR):
     """Produce a new dataframe with distinct rows."""
 
-    df: IR
-    """Input dataframe."""
+    __slots__ = ("keep", "subset", "zlice", "stable")
+    _non_child = ("schema", "keep", "subset", "zlice", "stable")
     keep: plc.stream_compaction.DuplicateKeepOption
-    """Which rows to keep."""
-    subset: set[str] | None
-    """Which columns to inspect when computing distinct rows."""
+    """Which distinct value to keep."""
+    subset: frozenset[str] | None
+    """Which columns should be used to define distinctness. If None,
+    then all columns are used."""
     zlice: tuple[int, int] | None
-    """Optional slice to perform after compaction."""
+    """Optional slice to apply to the result."""
     stable: bool
-    """Should order be preserved?"""
+    """Should the result maintain ordering?"""
+
+    def __init__(
+        self,
+        schema: Schema,
+        keep: plc.stream_compaction.DuplicateKeepOption,
+        subset: frozenset[str] | None,
+        zlice: tuple[int, int] | None,
+        stable: bool,  # noqa: FBT001
+        df: IR,
+    ):
+        self.schema = schema
+        self.keep = keep
+        self.subset = subset
+        self.zlice = zlice
+        self.stable = stable
+        self.children = (df,)
 
     _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = {
         "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
@@ -917,18 +1105,10 @@ class Distinct(IR):
         "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY,
     }
 
-    def __init__(self, schema: Schema, df: IR, options: Any) -> None:
-        self.schema = schema
-        self.df = df
-        (keep, subset, maintain_order, zlice) = options
-        self.keep = Distinct._KEEP_MAP[keep]
-        self.subset = set(subset) if subset is not None else None
-        self.stable = maintain_order
-        self.zlice = zlice
-
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         if self.subset is None:
             indices = list(range(df.num_columns))
             keys_sorted = all(c.is_sorted for c in df.column_map.values())
@@ -967,46 +1147,44 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return result.slice(self.zlice)
 
 
-@dataclasses.dataclass
 class Sort(IR):
     """Sort a dataframe."""
 
-    df: IR
-    """Input."""
-    by: list[expr.NamedExpr]
-    """List of expressions to produce sort keys."""
-    do_sort: Callable[..., plc.Table]
-    """pylibcudf sorting function."""
+    __slots__ = ("by", "order", "null_order", "stable", "zlice")
+    _non_child = ("schema", "by", "order", "null_order", "stable", "zlice")
+    by: tuple[expr.NamedExpr, ...]
+    """Sort keys."""
+    order: tuple[plc.types.Order, ...]
+    """Sort order for each sort key."""
+    null_order: tuple[plc.types.NullOrder, ...]
+    """Null sorting location for each sort key."""
+    stable: bool
+    """Should the sort be stable?"""
     zlice: tuple[int, int] | None
-    """Optional slice to apply after sorting."""
-    order: list[plc.types.Order]
-    """Order keys should be sorted in."""
-    null_order: list[plc.types.NullOrder]
-    """Where nulls sort to."""
+    """Optional slice to apply to the result."""
 
     def __init__(
         self,
         schema: Schema,
-        df: IR,
-        by: list[expr.NamedExpr],
-        options: Any,
+        by: Sequence[expr.NamedExpr],
+        order: Sequence[plc.types.Order],
+        null_order: Sequence[plc.types.NullOrder],
+        stable: bool,  # noqa: FBT001
         zlice: tuple[int, int] | None,
-    ) -> None:
+        df: IR,
+    ):
         self.schema = schema
-        self.df = df
-        self.by = by
+        self.by = tuple(by)
+        self.order = tuple(order)
+        self.null_order = tuple(null_order)
+        self.stable = stable
         self.zlice = zlice
-        stable, nulls_last, descending = options
-        self.order, self.null_order = sorting.sort_order(
-            descending, nulls_last=nulls_last, num_keys=len(by)
-        )
-        self.do_sort = (
-            plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
-        )
+        self.children = (df,)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         sort_keys = broadcast(
             *(k.evaluate(df) for k in self.by), target_length=df.num_rows
         )
@@ -1016,11 +1194,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             for i, k in enumerate(sort_keys)
             if k.name in df.column_map and k.obj is df.column_map[k.name].obj
         }
-        table = self.do_sort(
+        do_sort = (
+            plc.sorting.stable_sort_by_key if self.stable else plc.sorting.sort_by_key
+        )
+        table = do_sort(
             df.table,
             plc.Table([k.obj for k in sort_keys]),
-            self.order,
-            self.null_order,
+            list(self.order),
+            list(self.null_order),
         )
         columns: list[Column] = []
         for name, c in zip(df.column_map, table.columns(), strict=True):
@@ -1037,49 +1218,64 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns).slice(self.zlice)
 
 
-@dataclasses.dataclass
 class Slice(IR):
     """Slice a dataframe."""
 
-    df: IR
-    """Input."""
+    __slots__ = ("offset", "length")
+    _non_child = ("schema", "offset", "length")
     offset: int
     """Start of the slice."""
     length: int
     """Length of the slice."""
 
+    def __init__(self, schema: Schema, offset: int, length: int, df: IR):
+        self.schema = schema
+        self.offset = offset
+        self.length = length
+        self.children = (df,)
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         return df.slice((self.offset, self.length))
 
 
-@dataclasses.dataclass
 class Filter(IR):
     """Filter a dataframe with a boolean mask."""
 
-    df: IR
-    """Input."""
+    __slots__ = ("mask",)
+    _non_child = ("schema", "mask")
     mask: expr.NamedExpr
-    """Expression evaluating to a mask."""
+    """Expression to produce the filter mask."""
+
+    def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR):
+        self.schema = schema
+        self.mask = mask
+        self.children = (df,)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows)
         return df.filter(mask)
 
 
-@dataclasses.dataclass
 class Projection(IR):
     """Select a subset of columns from a dataframe."""
 
-    df: IR
-    """Input."""
+    __slots__ = ()
+    _non_child = ("schema",)
+
+    def __init__(self, schema: Schema, df: IR):
+        self.schema = schema
+        self.children = (df,)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        df = self.df.evaluate(cache=cache)
+        (child,) = self.children
+        df = child.evaluate(cache=cache)
         # This can reorder things.
         columns = broadcast(
             *(df.column_map[name] for name in self.schema), target_length=df.num_rows
@@ -1087,16 +1283,15 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns)
 
 
-@dataclasses.dataclass
 class MapFunction(IR):
     """Apply some function to a dataframe."""
 
-    df: IR
-    """Input."""
+    __slots__ = ("name", "options")
+    _non_child = ("schema", "name", "options")
     name: str
-    """Function name."""
+    """Name of the function to apply"""
     options: Any
-    """Arbitrary options, interpreted per function."""
+    """Arbitrary name-specific options"""
 
     _NAMES: ClassVar[frozenset[str]] = frozenset(
         [
@@ -1111,9 +1306,11 @@ class MapFunction(IR):
         ]
     )
 
-    def __post_init__(self) -> None:
-        """Validate preconditions."""
-        super().__post_init__()
+    def __init__(self, schema: Schema, name: str, options: Any, df: IR):
+        self.schema = schema
+        self.name = name
+        self.options = options
+        self.children = (df,)
         if self.name not in MapFunction._NAMES:
             raise NotImplementedError(f"Unhandled map function {self.name}")
         if self.name == "explode":
@@ -1127,7 +1324,7 @@ def __post_init__(self) -> None:
             old, new, _ = self.options
             # TODO: perhaps polars should validate renaming in the IR?
             if len(new) != len(set(new)) or (
-                set(new) & (set(self.df.schema.keys()) - set(old))
+                set(new) & (set(df.schema.keys()) - set(old))
             ):
                 raise NotImplementedError("Duplicate new names in rename.")
         elif self.name == "unpivot":
@@ -1136,31 +1333,31 @@ def __post_init__(self) -> None:
             variable_name = "variable" if variable_name is None else variable_name
             if len(pivotees) == 0:
                 index = frozenset(indices)
-                pivotees = [name for name in self.df.schema if name not in index]
+                pivotees = [name for name in df.schema if name not in index]
             if not all(
-                dtypes.can_cast(self.df.schema[p], self.schema[value_name])
-                for p in pivotees
+                dtypes.can_cast(df.schema[p], self.schema[value_name]) for p in pivotees
             ):
                 raise NotImplementedError(
                     "Unpivot cannot cast all input columns to "
                     f"{self.schema[value_name].id()}"
                 )
-            self.options = (indices, pivotees, variable_name, value_name)
+            self.options = (tuple(indices), tuple(pivotees), variable_name, value_name)
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
+        (child,) = self.children
         if self.name == "rechunk":
             # No-op in our data model
             # Don't think this appears in a plan tree from python
-            return self.df.evaluate(cache=cache)  # pragma: no cover
+            return child.evaluate(cache=cache)  # pragma: no cover
         elif self.name == "rename":
-            df = self.df.evaluate(cache=cache)
+            df = child.evaluate(cache=cache)
             # final tag is "swapping" which is useful for the
             # optimiser (it blocks some pushdown operations)
             old, new, _ = self.options
             return df.rename_columns(dict(zip(old, new, strict=True)))
         elif self.name == "explode":
-            df = self.df.evaluate(cache=cache)
+            df = child.evaluate(cache=cache)
             ((to_explode,),) = self.options
             index = df.column_names.index(to_explode)
             subset = df.column_names_set - {to_explode}
@@ -1170,7 +1367,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         elif self.name == "unpivot":
             indices, pivotees, variable_name, value_name = self.options
             npiv = len(pivotees)
-            df = self.df.evaluate(cache=cache)
+            df = child.evaluate(cache=cache)
             index_columns = [
                 Column(col, name=name)
                 for col, name in zip(
@@ -1209,37 +1406,40 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             raise AssertionError("Should never be reached")  # pragma: no cover
 
 
-@dataclasses.dataclass
 class Union(IR):
     """Concatenate dataframes vertically."""
 
-    dfs: list[IR]
-    """List of inputs."""
+    __slots__ = ("zlice",)
+    _non_child = ("schema", "zlice")
     zlice: tuple[int, int] | None
-    """Optional slice to apply after concatenation."""
+    """Optional slice to apply to the result."""
 
-    def __post_init__(self) -> None:
-        """Validate preconditions."""
-        super().__post_init__()
-        schema = self.dfs[0].schema
-        if not all(s.schema == schema for s in self.dfs[1:]):
+    def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR):
+        self.schema = schema
+        self.zlice = zlice
+        self.children = children
+        schema = self.children[0].schema
+        if not all(s.schema == schema for s in self.children[1:]):
             raise NotImplementedError("Schema mismatch")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         # TODO: only evaluate what we need if we have a slice
-        dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        dfs = [df.evaluate(cache=cache) for df in self.children]
         return DataFrame.from_table(
             plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names
         ).slice(self.zlice)
 
 
-@dataclasses.dataclass
 class HConcat(IR):
     """Concatenate dataframes horizontally."""
 
-    dfs: list[IR]
-    """List of inputs."""
+    __slots__ = ()
+    _non_child = ("schema",)
+
+    def __init__(self, schema: Schema, *children: IR):
+        self.schema = schema
+        self.children = children
 
     @staticmethod
     def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
@@ -1271,7 +1471,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        dfs = [df.evaluate(cache=cache) for df in self.children]
         max_rows = max(df.num_rows for df in dfs)
         # Horizontal concatenation extends shorter tables with nulls
         dfs = [
diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py
new file mode 100644
index 00000000000..228d300f467
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Base class for IR nodes, and utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable, Sequence
+
+    from typing_extensions import Self
+
+
+__all__: list[str] = ["Node"]
+
+T = TypeVar("T", bound="Node[Any]")
+
+
+class Node(Generic[T]):
+    """
+    An abstract node type.
+
+    Nodes are immutable!
+
+    This contains a (potentially empty) tuple of child nodes,
+    along with non-child data. For uniform reconstruction and
+    implementation of hashing and equality schemes, child classes need
+    to provide a certain amount of metadata when they are defined.
+    Specifically, the ``_non_child`` attribute must list, in-order,
+    the names of the slots that are passed to the constructor. The
+    constructor must take arguments in the order ``(*_non_child,
+    *children)``.
+    """
+
+    __slots__ = ("_hash_value", "_repr_value", "children")
+    _hash_value: int
+    _repr_value: str
+    children: tuple[T, ...]
+    _non_child: ClassVar[tuple[str, ...]] = ()
+
+    def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def reconstruct(
+        self, children: Sequence[T]
+    ) -> Self:  # pragma: no cover; not yet used
+        """
+        Rebuild this node with new children.
+
+        Parameters
+        ----------
+        children
+            New children
+
+        Returns
+        -------
+        New node with new children. Non-child data is shared with the input.
+        """
+        return type(self)(*self._ctor_arguments(children))
+
+    def get_hashable(self) -> Hashable:
+        """
+        Return a hashable object for the node.
+
+        Returns
+        -------
+        Hashable object.
+
+        Notes
+        -----
+        This method is used by the :meth:`__hash__` implementation
+        (which does caching). If your node type needs special-case
+        handling for some of its attributes, override this method, not
+        :meth:`__hash__`.
+        """
+        return (type(self), self._ctor_arguments(self.children))
+
+    def __hash__(self) -> int:
+        """
+        Hash of a node with caching.
+
+        See Also
+        --------
+        get_hashable
+        """
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = hash(self.get_hashable())
+            return self._hash_value
+
+    def is_equal(self, other: Self) -> bool:
+        """
+        Equality of two nodes of equal type.
+
+        Override this in subclasses, rather than :meth:`__eq__`.
+
+        Parameters
+        ----------
+        other
+            object of same type to compare to.
+
+        Notes
+        -----
+        Since nodes are immutable, this does common subexpression
+        elimination when two nodes are determined to be equal.
+
+        :meth:`__eq__` handles the case where the objects being
+        compared are not of the same type, so in this method, we only
+        need to implement equality of equal types.
+
+        Returns
+        -------
+        True if the two nodes are equal, false otherwise.
+        """
+        if self is other:
+            return True
+        result = self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+        # Eager CSE for nodes that match.
+        if result:
+            self.children = other.children
+        return result
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Equality of nodes.
+
+        See Also
+        --------
+        is_equal
+        """
+        if type(self) is not type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other: Any) -> bool:
+        """Inequality of nodes."""
+        return not self.__eq__(other)
+
+    def __repr__(self) -> str:
+        """String representation of a node with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py
new file mode 100644
index 00000000000..9a0838631cc
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py
@@ -0,0 +1,265 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Conversion of expression nodes to libcudf AST nodes."""
+
+from __future__ import annotations
+
+from functools import partial, reduce, singledispatch
+from typing import TYPE_CHECKING, TypeAlias
+
+from polars.polars import _expr_nodes as pl_expr
+
+import pylibcudf as plc
+from pylibcudf import expressions as plc_expr
+
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.traversal import CachingVisitor
+from cudf_polars.typing import GenericTransformer
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+# Can't merge these op-mapping dictionaries because scoped enum values
+# are exposed by cython with equality/hash based on their underlying
+# representation type. So in a dict they are just treated as integers.
+BINOP_TO_ASTOP = {
+    plc.binaryop.BinaryOperator.EQUAL: plc_expr.ASTOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NULL_EQUALS: plc_expr.ASTOperator.NULL_EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL: plc_expr.ASTOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS: plc_expr.ASTOperator.LESS,
+    plc.binaryop.BinaryOperator.LESS_EQUAL: plc_expr.ASTOperator.LESS_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER: plc_expr.ASTOperator.GREATER,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL: plc_expr.ASTOperator.GREATER_EQUAL,
+    plc.binaryop.BinaryOperator.ADD: plc_expr.ASTOperator.ADD,
+    plc.binaryop.BinaryOperator.SUB: plc_expr.ASTOperator.SUB,
+    plc.binaryop.BinaryOperator.MUL: plc_expr.ASTOperator.MUL,
+    plc.binaryop.BinaryOperator.DIV: plc_expr.ASTOperator.DIV,
+    plc.binaryop.BinaryOperator.TRUE_DIV: plc_expr.ASTOperator.TRUE_DIV,
+    plc.binaryop.BinaryOperator.FLOOR_DIV: plc_expr.ASTOperator.FLOOR_DIV,
+    plc.binaryop.BinaryOperator.PYMOD: plc_expr.ASTOperator.PYMOD,
+    plc.binaryop.BinaryOperator.BITWISE_AND: plc_expr.ASTOperator.BITWISE_AND,
+    plc.binaryop.BinaryOperator.BITWISE_OR: plc_expr.ASTOperator.BITWISE_OR,
+    plc.binaryop.BinaryOperator.BITWISE_XOR: plc_expr.ASTOperator.BITWISE_XOR,
+    plc.binaryop.BinaryOperator.LOGICAL_AND: plc_expr.ASTOperator.LOGICAL_AND,
+    plc.binaryop.BinaryOperator.LOGICAL_OR: plc_expr.ASTOperator.LOGICAL_OR,
+    plc.binaryop.BinaryOperator.NULL_LOGICAL_AND: plc_expr.ASTOperator.NULL_LOGICAL_AND,
+    plc.binaryop.BinaryOperator.NULL_LOGICAL_OR: plc_expr.ASTOperator.NULL_LOGICAL_OR,
+}
+
+UOP_TO_ASTOP = {
+    plc.unary.UnaryOperator.SIN: plc_expr.ASTOperator.SIN,
+    plc.unary.UnaryOperator.COS: plc_expr.ASTOperator.COS,
+    plc.unary.UnaryOperator.TAN: plc_expr.ASTOperator.TAN,
+    plc.unary.UnaryOperator.ARCSIN: plc_expr.ASTOperator.ARCSIN,
+    plc.unary.UnaryOperator.ARCCOS: plc_expr.ASTOperator.ARCCOS,
+    plc.unary.UnaryOperator.ARCTAN: plc_expr.ASTOperator.ARCTAN,
+    plc.unary.UnaryOperator.SINH: plc_expr.ASTOperator.SINH,
+    plc.unary.UnaryOperator.COSH: plc_expr.ASTOperator.COSH,
+    plc.unary.UnaryOperator.TANH: plc_expr.ASTOperator.TANH,
+    plc.unary.UnaryOperator.ARCSINH: plc_expr.ASTOperator.ARCSINH,
+    plc.unary.UnaryOperator.ARCCOSH: plc_expr.ASTOperator.ARCCOSH,
+    plc.unary.UnaryOperator.ARCTANH: plc_expr.ASTOperator.ARCTANH,
+    plc.unary.UnaryOperator.EXP: plc_expr.ASTOperator.EXP,
+    plc.unary.UnaryOperator.LOG: plc_expr.ASTOperator.LOG,
+    plc.unary.UnaryOperator.SQRT: plc_expr.ASTOperator.SQRT,
+    plc.unary.UnaryOperator.CBRT: plc_expr.ASTOperator.CBRT,
+    plc.unary.UnaryOperator.CEIL: plc_expr.ASTOperator.CEIL,
+    plc.unary.UnaryOperator.FLOOR: plc_expr.ASTOperator.FLOOR,
+    plc.unary.UnaryOperator.ABS: plc_expr.ASTOperator.ABS,
+    plc.unary.UnaryOperator.RINT: plc_expr.ASTOperator.RINT,
+    plc.unary.UnaryOperator.BIT_INVERT: plc_expr.ASTOperator.BIT_INVERT,
+    plc.unary.UnaryOperator.NOT: plc_expr.ASTOperator.NOT,
+}
+
+SUPPORTED_STATISTICS_BINOPS = {
+    plc.binaryop.BinaryOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS,
+    plc.binaryop.BinaryOperator.LESS_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL,
+}
+
+REVERSED_COMPARISON = {
+    plc.binaryop.BinaryOperator.EQUAL: plc.binaryop.BinaryOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL: plc.binaryop.BinaryOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS: plc.binaryop.BinaryOperator.GREATER,
+    plc.binaryop.BinaryOperator.LESS_EQUAL: plc.binaryop.BinaryOperator.GREATER_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER: plc.binaryop.BinaryOperator.LESS,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL: plc.binaryop.BinaryOperator.LESS_EQUAL,
+}
+
+
+Transformer: TypeAlias = GenericTransformer[expr.Expr, plc_expr.Expression]
+
+
+@singledispatch
+def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression:
+    """
+    Translate an expression to a pylibcudf Expression.
+
+    Parameters
+    ----------
+    node
+        Expression to translate.
+    self
+        Recursive transformer. The state dictionary should contain a
+        `for_parquet` key indicating if this transformation should
+        provide an expression suitable for use in parquet filters.
+
+        If `for_parquet` is `False`, the dictionary should contain a
+        `name_to_index` mapping that maps column names to their
+        integer index in the table that will be used for evaluation of
+        the expression.
+
+    Returns
+    -------
+    pylibcudf Expression.
+
+    Raises
+    ------
+    NotImplementedError or KeyError if the expression cannot be translated.
+    """
+    raise NotImplementedError(f"Unhandled expression type {type(node)}")
+
+
+@_to_ast.register
+def _(node: expr.Col, self: Transformer) -> plc_expr.Expression:
+    if self.state["for_parquet"]:
+        return plc_expr.ColumnNameReference(node.name)
+    return plc_expr.ColumnReference(self.state["name_to_index"][node.name])
+
+
+@_to_ast.register
+def _(node: expr.Literal, self: Transformer) -> plc_expr.Expression:
+    return plc_expr.Literal(plc.interop.from_arrow(node.value))
+
+
+@_to_ast.register
+def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression:
+    if node.op == plc.binaryop.BinaryOperator.NULL_NOT_EQUALS:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            self(
+                # Reconstruct and apply, rather than directly
+                # constructing the right expression so we get the
+                # handling of parquet special cases for free.
+                expr.BinOp(
+                    node.dtype, plc.binaryop.BinaryOperator.NULL_EQUALS, *node.children
+                )
+            ),
+        )
+    if self.state["for_parquet"]:
+        op1_col, op2_col = (isinstance(op, expr.Col) for op in node.children)
+        if op1_col ^ op2_col:
+            op = node.op
+            if op not in SUPPORTED_STATISTICS_BINOPS:
+                raise NotImplementedError(
+                    f"Parquet filter binop with column doesn't support {node.op!r}"
+                )
+            op1, op2 = node.children
+            if op2_col:
+                (op1, op2) = (op2, op1)
+                op = REVERSED_COMPARISON[op]
+            if not isinstance(op2, expr.Literal):
+                raise NotImplementedError(
+                    "Parquet filter binops must have form 'col binop literal'"
+                )
+            return plc_expr.Operation(BINOP_TO_ASTOP[op], self(op1), self(op2))
+        elif op1_col and op2_col:
+            raise NotImplementedError(
+                "Parquet filter binops must have one column reference not two"
+            )
+    return plc_expr.Operation(BINOP_TO_ASTOP[node.op], *map(self, node.children))
+
+
+@_to_ast.register
+def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
+    if node.name == pl_expr.BooleanFunction.IsIn:
+        needles, haystack = node.children
+        if isinstance(haystack, expr.LiteralColumn) and 0 < len(haystack.value) < 16:
+            # Non-empty so reduce() has a term; 16 is an arbitrary limit
+            needle_ref = self(needles)
+            values = [
+                plc_expr.Literal(plc.interop.from_arrow(v)) for v in haystack.value
+            ]
+            return reduce(
+                partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR),
+                (
+                    plc_expr.Operation(plc_expr.ASTOperator.EQUAL, needle_ref, value)
+                    for value in values
+                ),
+            )
+    if self.state["for_parquet"] and isinstance(node.children[0], expr.Col):
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    if node.name == pl_expr.BooleanFunction.IsNull:
+        return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0]))
+    elif node.name == pl_expr.BooleanFunction.IsNotNull:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])),
+        )
+    elif node.name == pl_expr.BooleanFunction.Not:
+        return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0]))
+    raise NotImplementedError(f"AST conversion does not support {node.name}")
+
+
+@_to_ast.register
+def _(node: expr.UnaryFunction, self: Transformer) -> plc_expr.Expression:
+    if isinstance(node.children[0], expr.Col) and self.state["for_parquet"]:
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    return plc_expr.Operation(
+        UOP_TO_ASTOP[node._OP_MAPPING[node.name]], self(node.children[0])
+    )
+
+
+def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for parquet filtering.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper = CachingVisitor(_to_ast, state={"for_parquet": True})
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
+
+
+def to_ast(
+    node: expr.Expr, *, name_to_index: Mapping[str, int]
+) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for compute_column.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+    name_to_index
+        Mapping from column names to their index in the table that
+        will be used for expression evaluation.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper = CachingVisitor(
+        _to_ast, state={"for_parquet": False, "name_to_index": name_to_index}
+    )
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index a0291037f01..5181214819e 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -5,22 +5,28 @@
 
 from __future__ import annotations
 
+import functools
 import json
 from contextlib import AbstractContextManager, nullcontext
 from functools import singledispatch
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
-import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 import polars.polars as plrs
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
+import pylibcudf as plc
+
 from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged
 from cudf_polars.typing import NodeTraverser
-from cudf_polars.utils import dtypes
+from cudf_polars.utils import dtypes, sorting
+
+if TYPE_CHECKING:
+    from cudf_polars.typing import ExprTransformer
 
 __all__ = ["translate_ir", "translate_named_expr"]
 
@@ -148,7 +154,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
-    return ir.Select(schema, inp, exprs, node.should_broadcast)
+    return ir.Select(schema, exprs, node.should_broadcast, inp)
 
 
 @_translate_ir.register
@@ -161,11 +167,11 @@ def _(
         keys = [translate_named_expr(visitor, n=e) for e in node.keys]
     return ir.GroupBy(
         schema,
-        inp,
-        aggs,
         keys,
+        aggs,
         node.maintain_order,
         node.options,
+        inp,
     )
 
 
@@ -182,7 +188,71 @@ def _(
     with set_node(visitor, node.input_right):
         inp_right = translate_ir(visitor, n=None)
         right_on = [translate_named_expr(visitor, n=e) for e in node.right_on]
-    return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options)
+    if (how := node.options[0]) in {
+        "inner",
+        "left",
+        "right",
+        "full",
+        "cross",
+        "semi",
+        "anti",
+    }:
+        return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
+    else:
+        how, op1, op2 = how
+        if how != "ie_join":
+            raise NotImplementedError(
+                f"Unsupported join type {how}"
+            )  # pragma: no cover; asof joins not yet exposed
+        # No exposure of mixed/conditional joins in pylibcudf yet, so in
+        # the first instance, implement by doing a cross join followed by
+        # a filter.
+        _, join_nulls, zlice, suffix, coalesce = node.options
+        cross = ir.Join(
+            schema,
+            [],
+            [],
+            ("cross", join_nulls, None, suffix, coalesce),
+            inp_left,
+            inp_right,
+        )
+        dtype = plc.DataType(plc.TypeId.BOOL8)
+        if op2 is None:
+            ops = [op1]
+        else:
+            ops = [op1, op2]
+        suffix = cross.options[3]
+
+        # Column references in the right table refer to the post-join
+        # names, so with suffixes.
+        def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr:
+            if isinstance(e, expr.Col) and e.name in inp_left.schema:
+                return type(e)(e.dtype, f"{e.name}{suffix}")
+            return reuse_if_unchanged(e, rec)
+
+        mapper = make_recursive(_rename)
+        right_on = [
+            expr.NamedExpr(
+                f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new
+            )
+            for new, old in zip(
+                (mapper(e.value) for e in right_on), right_on, strict=True
+            )
+        ]
+        mask = functools.reduce(
+            functools.partial(
+                expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND
+            ),
+            (
+                expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value)
+                for op, left, right in zip(ops, left_on, right_on, strict=True)
+            ),
+        )
+        filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross)
+        if zlice is not None:
+            offset, length = zlice
+            return ir.Slice(schema, offset, length, filtered)
+        return filtered
 
 
 @_translate_ir.register
@@ -192,7 +262,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.exprs]
-    return ir.HStack(schema, inp, exprs, node.should_broadcast)
+    return ir.HStack(schema, exprs, node.should_broadcast, inp)
 
 
 @_translate_ir.register
@@ -202,17 +272,23 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
-    return ir.Reduce(schema, inp, exprs)
+    return ir.Reduce(schema, exprs, inp)
 
 
 @_translate_ir.register
 def _(
     node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
+    (keep, subset, maintain_order, zlice) = node.options
+    keep = ir.Distinct._KEEP_MAP[keep]
+    subset = frozenset(subset) if subset is not None else None
     return ir.Distinct(
         schema,
+        keep,
+        subset,
+        zlice,
+        maintain_order,
         translate_ir(visitor, n=node.input),
-        node.options,
     )
 
 
@@ -223,14 +299,18 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         by = [translate_named_expr(visitor, n=e) for e in node.by_column]
-    return ir.Sort(schema, inp, by, node.sort_options, node.slice)
+    stable, nulls_last, descending = node.sort_options
+    order, null_order = sorting.sort_order(
+        descending, nulls_last=nulls_last, num_keys=len(by)
+    )
+    return ir.Sort(schema, by, order, null_order, stable, node.slice, inp)
 
 
 @_translate_ir.register
 def _(
     node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len)
+    return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input))
 
 
 @_translate_ir.register
@@ -240,7 +320,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         mask = translate_named_expr(visitor, n=node.predicate)
-    return ir.Filter(schema, inp, mask)
+    return ir.Filter(schema, mask, inp)
 
 
 @_translate_ir.register
@@ -259,10 +339,10 @@ def _(
     name, *options = node.function
     return ir.MapFunction(
         schema,
-        # TODO: merge_sorted breaks this pattern
-        translate_ir(visitor, n=node.input),
         name,
         options,
+        # TODO: merge_sorted breaks this pattern
+        translate_ir(visitor, n=node.input),
     )
 
 
@@ -271,7 +351,7 @@ def _(
     node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     return ir.Union(
-        schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options
+        schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs)
     )
 
 
@@ -279,7 +359,7 @@ def _(
 def _(
     node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs])
+    return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs))
 
 
 def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
@@ -309,8 +389,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
     # IR is versioned with major.minor, minor is bumped for backwards
     # compatible changes (e.g. adding new nodes), major is bumped for
     # incompatible changes (e.g. renaming nodes).
-    # Polars 1.7 changes definition of the CSV reader options schema name.
-    if (version := visitor.version()) >= (3, 0):
+    if (version := visitor.version()) >= (4, 0):
         raise NotImplementedError(
             f"No support for polars IR {version=}"
         )  # pragma: no cover; no such version for now.
diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py
new file mode 100644
index 00000000000..be8338cb9a9
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/traversal.py
@@ -0,0 +1,175 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Traversal and visitor utilities for nodes."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Generic
+
+from cudf_polars.typing import U_contra, V_co
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Generator, Mapping, MutableMapping
+
+    from cudf_polars.typing import GenericTransformer, NodeT
+
+
+__all__: list[str] = [
+    "traversal",
+    "reuse_if_unchanged",
+    "make_recursive",
+    "CachingVisitor",
+]
+
+
+def traversal(node: NodeT) -> Generator[NodeT, None, None]:
+    """
+    Pre-order traversal of nodes in an expression.
+
+    Parameters
+    ----------
+    node
+        Root of expression to traverse.
+
+    Yields
+    ------
+    Unique nodes in the expression, parent before child, children
+    in-order from left to right.
+    """
+    seen = {node}
+    lifo = [node]
+
+    while lifo:
+        node = lifo.pop()
+        yield node
+        for child in reversed(node.children):
+            if child not in seen:
+                seen.add(child)
+                lifo.append(child)
+
+
+def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT:
+    """
+    Recipe for transforming nodes that returns the old object if unchanged.
+
+    Parameters
+    ----------
+    node
+         Node to recurse on
+    fn
+         Function to transform children
+
+    Notes
+    -----
+    This can be used as a generic "base case" handler when
+    writing transforms that take nodes and produce new nodes.
+
+    Returns
+    -------
+    Existing `node` if transformed children are unchanged, otherwise
+    reconstructed node with new children.
+    """
+    new_children = [fn(c) for c in node.children]
+    if all(new == old for new, old in zip(new_children, node.children, strict=True)):
+        return node
+    return node.reconstruct(new_children)
+
+
+def make_recursive(
+    fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+    *,
+    state: Mapping[str, Any] | None = None,
+) -> GenericTransformer[U_contra, V_co]:
+    """
+    No-op wrapper for recursive visitors.
+
+    Facilitates using visitors that don't need caching but are written
+    in the same style.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument a callable from input to output.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Usually, prefer a :class:`CachingVisitor`, but if we know that we
+    don't need caching in a transformation, then this no-op
+    approach is slightly cheaper.
+
+    Returns
+    -------
+    Recursive function without caching.
+
+    See Also
+    --------
+    CachingVisitor
+    """
+
+    def rec(node: U_contra) -> V_co:
+        return fn(node, rec)  # type: ignore[arg-type]
+
+    rec.state = state if state is not None else {}  # type: ignore[attr-defined]
+    return rec  # type: ignore[return-value]
+
+
+class CachingVisitor(Generic[U_contra, V_co]):
+    """
+    Caching wrapper for recursive visitors.
+
+    Facilitates writing visitors where already computed results should
+    be cached and reused. The cache is managed automatically, and is
+    tied to the lifetime of the wrapper.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument the recursive cache manager.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Returns
+    -------
+    Recursive function with caching.
+    """
+
+    def __init__(
+        self,
+        fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+        *,
+        state: Mapping[str, Any] | None = None,
+    ) -> None:
+        self.fn = fn
+        self.cache: MutableMapping[U_contra, V_co] = {}
+        self.state = state if state is not None else {}
+
+    def __call__(self, value: U_contra) -> V_co:
+        """
+        Apply the function to a value.
+
+        Parameters
+        ----------
+        value
+            The value to transform.
+
+        Returns
+        -------
+        A transformed value.
+        """
+        try:
+            return self.cache[value]
+        except KeyError:
+            return self.cache.setdefault(value, self.fn(value, self))
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 7b6f3848fc4..7b45c1eaa06 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -151,7 +151,7 @@ def assert_collect_raises(
     collect_kwargs: dict[OptimizationArgs, bool] | None = None,
     polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None,
     cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None,
-):
+) -> None:
     """
     Assert that collecting the result of a query raises the expected exceptions.
 
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 05b76d76808..e01ccd05527 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -16,7 +16,7 @@
     from collections.abc import Mapping
 
 
-def pytest_addoption(parser: pytest.Parser):
+def pytest_addoption(parser: pytest.Parser) -> None:
     """Add plugin-specific options."""
     group = parser.getgroup(
         "cudf-polars", "Plugin to set GPU as default engine for polars tests"
@@ -28,7 +28,7 @@ def pytest_addoption(parser: pytest.Parser):
     )
 
 
-def pytest_configure(config: pytest.Config):
+def pytest_configure(config: pytest.Config) -> None:
     """Enable use of this module as a pytest plugin to enable GPU collection."""
     no_fallback = config.getoption("--cudf-polars-no-fallback")
     collect = polars.LazyFrame.collect
@@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match",
@@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero",
     "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
     "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
@@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information",
     "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
     "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
@@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match",
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
     "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
+    "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
     # Maybe flaky, order-dependent?
     "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
     "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
@@ -148,7 +172,7 @@ def pytest_configure(config: pytest.Config):
 
 def pytest_collection_modifyitems(
     session: pytest.Session, config: pytest.Config, items: list[pytest.Item]
-):
+) -> None:
     """Mark known failing tests."""
     if config.getoption("--cudf-polars-no-fallback"):
         # Don't xfail tests if running without fallback
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 240b11bdf59..57c5fdaa7cf 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -5,20 +5,32 @@
 
 from __future__ import annotations
 
-from collections.abc import Mapping
-from typing import TYPE_CHECKING, Literal, Protocol, Union
-
-import pylibcudf as plc
+from collections.abc import Hashable, Mapping
+from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
+import pylibcudf as plc
+
 if TYPE_CHECKING:
     from collections.abc import Callable
     from typing import TypeAlias
 
     import polars as pl
 
-IR: TypeAlias = Union[
+    from cudf_polars.dsl import expr, ir, nodebase
+
+__all__: list[str] = [
+    "PolarsIR",
+    "PolarsExpr",
+    "NodeTraverser",
+    "OptimizationArgs",
+    "GenericTransformer",
+    "ExprTransformer",
+    "IRTransformer",
+]
+
+PolarsIR: TypeAlias = Union[
     pl_ir.PythonScan,
     pl_ir.Scan,
     pl_ir.Cache,
@@ -38,7 +50,7 @@
     pl_ir.ExtContext,
 ]
 
-Expr: TypeAlias = Union[
+PolarsExpr: TypeAlias = Union[
     pl_expr.Function,
     pl_expr.Window,
     pl_expr.Literal,
@@ -68,7 +80,7 @@ def set_node(self, n: int) -> None:
         """Set the current plan node to n."""
         ...
 
-    def view_current_node(self) -> IR:
+    def view_current_node(self) -> PolarsIR:
         """Convert current plan node to python rep."""
         ...
 
@@ -80,7 +92,7 @@ def get_dtype(self, n: int) -> pl.DataType:
         """Get the datatype of the given expression id."""
         ...
 
-    def view_expression(self, n: int) -> Expr:
+    def view_expression(self, n: int) -> PolarsExpr:
         """Convert the given expression to python rep."""
         ...
 
@@ -107,3 +119,29 @@ def set_udf(
     "cluster_with_columns",
     "no_optimization",
 ]
+
+
+U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True)
+V_co = TypeVar("V_co", covariant=True)
+NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]")
+
+
+class GenericTransformer(Protocol[U_contra, V_co]):
+    """Abstract protocol for recursive visitors."""
+
+    def __call__(self, __value: U_contra) -> V_co:
+        """Apply the visitor to the node."""
+        ...
+
+    @property
+    def state(self) -> Mapping[str, Any]:
+        """Arbitrary immutable state."""
+        ...
+
+
+# Quotes to avoid circular import
+ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"]
+"""Protocol for transformation of Expr nodes."""
+
+IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"]
+"""Protocol for transformation of IR nodes."""
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 4154a404e98..1d0479802ca 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -8,11 +8,12 @@
 from functools import cache
 
 import pyarrow as pa
-import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 
+import pylibcudf as plc
+
 __all__ = ["from_polars", "downcast_arrow_lists", "can_cast"]
 
 
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index 4a7ad6b3cf2..a119cab3b74 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -12,11 +12,12 @@
 
 POLARS_VERSION = parse(__version__)
 
-POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8")
+POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11")
+POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12")
 
 
 def _ensure_polars_version():
-    if POLARS_VERSION_LT_18:
+    if POLARS_VERSION_LT_111:
         raise ImportError(
-            "cudf_polars requires py-polars v1.8 or greater."
+            "cudf_polars requires py-polars v1.11 or greater."
         )  # pragma: no cover
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 7837a275f20..74b2cd4e5de 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -11,14 +11,17 @@ You will need:
    environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment).
    The combined devcontainer works, or whatever your favourite approach is.
 
-> ![NOTE] These instructions will get simpler as we merge code in.
+:::{note}
+These instructions will get simpler as we merge code in.
+:::
 
 ## Installing polars
 
-`cudf-polars` works with polars >= 1.3, as long as the internal IR
-version doesn't get a major version bump. So `pip install polars>=1.3`
-should work. For development, if we're adding things to the polars
-side of things, we will need to build polars from source:
+The `cudf-polars` `pyproject.toml` advertises which polars versions it
+works with. So for pure `cudf-polars` development, installing as
+normal and satisfying the dependencies in the repository is
+sufficient. For development, if we're adding things to the polars side
+of things, we will need to build polars from source:
 
 ```sh
 git clone https://github.com/pola-rs/polars
@@ -36,7 +39,9 @@ pip install --upgrade uv
 uv pip install --upgrade -r py-polars/requirements-dev.txt
 ```
 
-> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster!
+:::{note}
+plain `pip install` works fine, but `uv` is _much_ faster!
+:::
 
 Now we have the necessary machinery to build polars
 ```sh
@@ -83,7 +88,7 @@ representation (IR). Second, an execution phase which executes using
 our IR.
 
 The translation phase receives the a low-level Rust `NodeTraverser`
-object which delivers Python representations of the plan nodes (and
+object that delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
 we can't execute something, we just don't modify the logical plan at
@@ -126,7 +131,6 @@ arguments, at the moment, `raise_on_fail` is also supported, which
 raises, rather than falling back, during translation:
 
 ```python
-
 result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
 
@@ -144,13 +148,73 @@ changes. We can therefore attempt to detect the IR version
 appropriately. This should be done during IR translation in
 `translate.py`.
 
-## Adding a handler for a new plan node
+# IR design
+
+As noted, we translate the polars DSL into our own IR. This is both so
+that we can smooth out minor version differences (advertised by
+`NodeTraverser` version changes) within `cudf-polars`, and so that we
+have the freedom to introduce new IR nodes and rewrite rules as might
+be appropriate for GPU execution.
+
+To that end, we provide facilities for definition of nodes as well as
+writing traversals and rewrite rules. The abstract base class `Node`
+in `dsl/nodebase.py` defines the interface for implementing new nodes,
+and provides many useful default methods. See also the docstrings of
+the `Node` class.
+
+:::{note}
+This generic implementation relies on nodes being treated as
+*immutable*. Do not implement in-place modification of nodes, bad
+things will happen.
+:::
+
+## Defining nodes
+
+A concrete node type (`cudf-polars` has expression nodes, `Expr`,
+and plan nodes, `IR`) should inherit from `Node`. Nodes have
+two types of data:
+
+1. `children`: a tuple (possibly empty) of concrete nodes;
+2. non-child: arbitrary data attached to the node that is _not_ a
+   concrete node.
+
+The base `Node` class requires that one advertise the names of the
+non-child attributes in the `_non_child` class variable. The
+constructor of the concrete node should take its arguments in the
+order `*_non_child` (ordered as the class variable does) and then
+`*children`. For example, the `Sort` node, which sorts a column
+generated by an expression, has this definition:
+
+```python
+class Expr(Node):
+    children: tuple[Expr, ...]
+
+class Sort(Expr):
+    _non_child = ("dtype", "options")
+    children: tuple[Expr]
+    def __init__(self, dtype, options, column: Expr):
+        self.dtype = dtype
+        self.options = options
+        self.children = (column,)
+```
+
+By following this pattern, we get an automatic (caching)
+implementation of `__hash__` and `__eq__`, as well as a useful
+`reconstruct` method that will rebuild the node with new children.
+
+If you want to control the behaviour of `__hash__` and `__eq__` for a
+single node, override (respectively) the `get_hashable` and `is_equal`
+methods.
+
+## Adding new translation rules from the polars IR
+
+### Plan nodes
 
-Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
-`dataclasses` that inherit from the base `IR` node. The evaluation of
-a plan node is done by implementing the `evaluate` method.
+Plan node definitions live in `cudf_polars/dsl/ir.py`; these all
+inherit from the base `IR` node. The evaluation of a plan node is done
+by implementing the `evaluate` method.
 
-To translate the plan node, add a case handler in `translate_ir` which
+To translate the plan node, add a case handler in `translate_ir` that
 lives in `cudf_polars/dsl/translate.py`.
 
 As well as child nodes that are plans, most plan nodes contain child
@@ -163,25 +227,12 @@ translating a `Join` node, the left keys (expressions) should be
 translated with the left input active (and right keys with right
 input). To facilitate this, use the `set_node` context manager.
 
-## Adding a handler for a new expression node
+### Expression nodes
 
 Adding a handle for an expression node is very similar to a plan node.
-Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit
-from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it
-is simpler for us to implement efficient hashing, repr, and equality if we
-can write that ourselves.
-
-Every expression consists of two types of data:
-1. child data (other `Expr`s)
-2. non-child data (anything other than an `Expr`)
-The generic implementations of special methods in the base `Expr` base
-class require that the subclasses advertise which arguments to the
-constructor are non-child in a `_non_child` class slot. The
-constructor should then take arguments:
-```python
-def __init__(self, *non_child_data: Any, *children: Expr):
-```
-Read the docstrings in the `Expr` class for more details.
+Expressions are defined in `cudf_polars/dsl/expressions/` and exported
+into the `dsl` namespace via `expr.py`. They inherit
+from `Expr`.
 
 Expressions are evaluated by implementing a `do_evaluate` method that
 takes a `DataFrame` as context (this provides columns) along with an
@@ -198,6 +249,124 @@ To simplify state tracking, all columns should be considered immutable
 on construction. This matches the "functional" description coming from
 the logical plan in any case, so is reasonably natural.
 
+## Traversing and transforming nodes
+
+In addition to representing and evaluating nodes, we also provide
+facilities for traversing a tree of nodes and defining transformation
+rules in `dsl/traversal.py`. The simplest is `traversal`, a
+[pre-order](https://en.wikipedia.org/wiki/Tree_traversal) visit of all
+unique nodes in an expression. Use this if you want to know some
+specific thing about an expression. For example, to determine if an
+expression contains a `Literal` node:
+
+```python
+def has_literal(node: Expr) -> bool:
+    return any(isinstance(e, Literal) for e in traversal(node))
+```
+
+It is often convenient to provide (immutable) state to a visitor, as
+well as some facility to perform DAG-aware rewrites (reusing a
+transformation for an expression if we have already seen it). We
+therefore adopt the following pattern of writing DAG-aware visitors.
+Suppose we want a rewrite rule (`rewrite`) between expressions
+(`Expr`) and some new type `T`. We define our general transformation
+function `rewrite` with type `Expr -> (Expr -> T) -> T`:
+
+```python
+from cudf_polars.typing import GenericTransformer
+
+@singledispatch
+def rewrite(e: Expr, rec: GenericTransformer[Expr, T]) -> T:
+    ...
+```
+
+Note in particular that the function to perform the recursion is
+passed as the second argument. Rather than defining methods on each
+node in turn for a particular rewrite rule, we prefer free functions
+and use `functools.singledispatch` to provide dispatching. We now, in
+the usual fashion, register handlers for different expression types.
+To use this function, we need to be able to provide both the
+expression to convert and the recursive function itself. To do this we
+must convert our `rewrite` function into something that only takes a
+single argument (the expression to rewrite), but carries around
+information about how to perform the recursion. To this end, we have
+two utilities in `traversal.py`:
+
+- `make_recursive` and
+- `CachingVisitor`.
+
+These both implement the `GenericTransformer` protocol, and can be
+wrapped around a transformation function like `rewrite` to provide a
+function `Expr -> T`. They also allow us to attach arbitrary
+*immutable* state to our visitor by passing a `state` dictionary. This
+dictionary can then be inspected by the concrete transformation
+function. `make_recursive` is very simple, and provides no caching of
+intermediate results (so any DAGs that are visited will be viewed as
+trees). `CachingVisitor` provides the same interface, but maintains a
+cache of intermediate results, and reuses them if the same expression
+is seen again.
+
+Finally, for writing transformations that take nodes and deliver new
+nodes (e.g. rewrite rules), we have a final utility
+`reuse_if_unchanged` that can be used as a base case transformation
+for node to node rewrites. It is a depth-first visit that transforms
+children but only returns a new node with new children if the rewrite
+of children returned new nodes.
+
+To see how these pieces fit together, let us consider writing a
+`rename` function that takes an expression (potentially with
+references to columns) along with a mapping defining a renaming
+between (some subset of) column names. The goal is to deliver a new
+expression with appropriate columns renamed.
+
+To start, we define the dispatch function
+```python
+from collections.abc import Mapping
+from functools import singledispatch
+from cudf_polars.dsl.traversal import (
+    CachingVisitor, make_recursive, reuse_if_unchanged
+)
+from cudf_polars.dsl.expr import Col, Expr
+from cudf_polars.typing import ExprTransformer
+
+
+@singledispatch
+def _rename(e: Expr, rec: ExprTransformer) -> Expr:
+    raise NotImplementedError(f"No handler for {type(e)}")
+```
+then we register specific handlers, first for columns:
+```python
+@_rename.register
+def _(e: Col, rec: ExprTransformer) -> Expr:
+    mapping = rec.state["mapping"] # state set on rec
+    if e.name in mapping:
+        # If we have a rename, return a new Col reference
+        # with a new name
+        return type(e)(e.dtype, mapping[e.name])
+    return e
+```
+and then for the remaining expressions
+```python
+_rename.register(Expr)(reuse_if_unchanged)
+```
+
+:::{note}
+In this case, we could have put the generic handler in the `_rename`
+function; however, then we would not get a nice error message if we
+accidentally sent in an object of the incorrect type.
+:::
+
+Finally we tie everything together with a public function:
+
+```python
+def rename(e: Expr, mapping: Mapping[str, str]) -> Expr:
+    """Rename column references in an expression."""
+    mapper = CachingVisitor(_rename, state={"mapping": mapping})
+    # or
+    # mapper = make_recursive(_rename, state={"mapping": mapping})
+    return mapper(e)
+```
+
 # Containers
 
 Containers should be constructed as relatively lightweight objects
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 5345fad41a2..a2c62ef9460 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.8,<1.9",
+    "polars>=1.11,<1.13",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -60,7 +60,7 @@ xfail_strict = true
 [tool.coverage.report]
 exclude_also = [
   "if TYPE_CHECKING:",
-  "class .*\\bProtocol\\):",
+  "class .*\\bProtocol(?:\\[[^]]+\\])?\\):",
   "assert_never\\("
 ]
 # The cudf_polars test suite doesn't exercise the plugin, so we omit
@@ -188,7 +188,7 @@ required-imports = ["from __future__ import annotations"]
 
 [tool.ruff.lint.isort.sections]
 polars = ["polars"]
-rapids = ["rmm", "cudf"]
+rapids = ["rmm", "pylibcudf"]
 
 [tool.ruff.format]
 docstring-code-format = true
diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
index 1f26ab1af9f..95541b4ecc3 100644
--- a/python/cudf_polars/tests/containers/test_column.py
+++ b/python/cudf_polars/tests/containers/test_column.py
@@ -4,9 +4,10 @@
 from __future__ import annotations
 
 import pyarrow
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 
 
diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py
index 5c68fb8f0aa..d68c8d90163 100644
--- a/python/cudf_polars/tests/containers/test_dataframe.py
+++ b/python/cudf_polars/tests/containers/test_dataframe.py
@@ -3,11 +3,12 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column, DataFrame
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py
index b7d4672daca..de8fec301fe 100644
--- a/python/cudf_polars/tests/dsl/test_expr.py
+++ b/python/cudf_polars/tests/dsl/test_expr.py
@@ -3,9 +3,10 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 from cudf_polars.dsl import expr
 
 
@@ -73,3 +74,24 @@ def test_namedexpr_repr_stable():
     b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a"))
 
     assert repr(b1) == repr(b2)
+
+
+def test_equality_cse():
+    dt = plc.DataType(plc.TypeId.INT8)
+
+    def make_expr(n1, n2):
+        a = expr.Col(plc.DataType(plc.TypeId.INT8), n1)
+        b = expr.Col(plc.DataType(plc.TypeId.INT8), n2)
+
+        return expr.BinOp(dt, plc.binaryop.BinaryOperator.ADD, a, b)
+
+    e1 = make_expr("a", "b")
+    e2 = make_expr("a", "b")  # structurally equal to e1, built separately
+    e3 = make_expr("a", "c")  # differs from e1/e2 in the second column name
+
+    assert e1.children is not e2.children  # distinct objects before comparing
+    assert e1 == e2
+    assert e1.children is e2.children  # the == above is expected to share them
+    assert e1 == e2  # equality must still hold once children are shared
+    assert e1 != e3
+    assert e2 != e3
diff --git a/python/cudf_polars/tests/dsl/test_to_ast.py b/python/cudf_polars/tests/dsl/test_to_ast.py
new file mode 100644
index 00000000000..57d794d4890
--- /dev/null
+++ b/python/cudf_polars/tests/dsl/test_to_ast.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+import pylibcudf as plc
+
+import cudf_polars.dsl.ir as ir_nodes
+from cudf_polars import translate_ir
+from cudf_polars.containers.dataframe import DataFrame, NamedColumn
+from cudf_polars.dsl.to_ast import to_ast
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.LazyFrame(
+        {
+            "c": ["a", "b", "c", "d", "e", "f"],
+            "a": [1, 2, 3, None, 4, 5],
+            "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64),
+            "d": [False, True, True, None, False, False],
+        }
+    )
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_in([0, 1]),
+        pl.col("a").is_between(0, 2),
+        (pl.col("a") < pl.col("b")).not_(),
+        pl.lit(2) > pl.col("a"),
+        pl.lit(2) >= pl.col("a"),
+        pl.lit(2) < pl.col("a"),
+        pl.lit(2) <= pl.col("a"),
+        pl.lit(0) == pl.col("a"),
+        pl.lit(1) != pl.col("a"),
+        (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()),
+        (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0),
+        pl.col("a").is_null(),
+        pl.col("a").is_not_null(),
+        pl.col("b").is_finite(),
+        pytest.param(
+            pl.col("a").sin(),
+            marks=pytest.mark.xfail(reason="Need to insert explicit casts"),
+        ),
+        pl.col("b").cos(),
+        pl.col("a").abs().is_between(0, 2),
+        pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)),
+        [pl.col("a") * 2, pl.col("b") + pl.col("a")],
+        pl.col("d").not_(),
+    ],
+)
+def test_compute_column(expr, df):
+    query = df.select(expr)
+    plan = translate_ir(query._ldf.visit())
+
+    assert isinstance(plan, ir_nodes.Select)
+    table = plan.children[0].evaluate(cache={})
+    name_to_index = {col.name: pos for pos, col in enumerate(table.columns)}
+
+    def compute_column(e):
+        ast = to_ast(e.value, name_to_index=name_to_index)
+        if ast is None:
+            # to_ast returned None: evaluate the expression directly instead.
+            return e.evaluate(table)
+        return NamedColumn(
+            plc.transform.compute_column(table.table, ast), name=e.name
+        )
+
+    got = DataFrame([compute_column(e) for e in plan.exprs]).to_polars()
+    expect = query.collect()
+
+    assert_frame_equal(expect, got)
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
new file mode 100644
index 00000000000..15c644d7978
--- /dev/null
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -0,0 +1,229 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from functools import singledispatch
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+import pylibcudf as plc
+
+from cudf_polars import translate_ir
+from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.traversal import (
+    CachingVisitor,
+    make_recursive,
+    reuse_if_unchanged,
+    traversal,
+)
+from cudf_polars.typing import ExprTransformer, IRTransformer
+
+
+def make_expr(dt, n1, n2):
+    lhs = expr.Col(dt, n1)
+    rhs = expr.Col(dt, n2)
+    # Product of the two column references, all sharing the dtype `dt`.
+    return expr.BinOp(dt, plc.binaryop.BinaryOperator.MUL, lhs, rhs)
+
+
+def test_traversal_unique():
+    dt = plc.DataType(plc.TypeId.INT8)
+
+    e1 = make_expr(dt, "a", "a")  # both operands are the same column
+    unique_exprs = list(traversal(e1))
+
+    assert len(unique_exprs) == 2  # the repeated Col is yielded only once
+    assert set(unique_exprs) == {expr.Col(dt, "a"), e1}
+    assert unique_exprs == [e1, expr.Col(dt, "a")]  # parent first, then child
+
+    e2 = make_expr(dt, "a", "b")
+    unique_exprs = list(traversal(e2))
+
+    assert len(unique_exprs) == 3
+    assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2}
+    assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")]
+
+    e3 = make_expr(dt, "b", "a")  # same columns as e2, operands swapped
+    unique_exprs = list(traversal(e3))
+
+    assert len(unique_exprs) == 3
+    assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3}
+    assert unique_exprs == [e3, expr.Col(dt, "b"), expr.Col(dt, "a")]
+
+
+def rename(e, rec):
+    renames = rec.state["mapping"]
+    if not isinstance(e, expr.Col) or e.name not in renames:
+        return reuse_if_unchanged(e, rec)
+    return type(e)(e.dtype, renames[e.name])
+
+
+def test_caching_visitor():
+    dt = plc.DataType(plc.TypeId.INT8)
+
+    e1 = make_expr(dt, "a", "b")
+
+    mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}})
+
+    renamed = mapper(e1)
+    assert renamed == make_expr(dt, "a", "c")
+    assert len(mapper.cache) == 3  # presumably the BinOp plus two distinct Cols
+
+    e2 = make_expr(dt, "a", "a")
+    mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}})
+
+    renamed = mapper(e2)
+    assert renamed == make_expr(dt, "a", "a")
+    assert len(mapper.cache) == 2  # presumably the BinOp plus one repeated Col
+    mapper = CachingVisitor(rename, state={"mapping": {"a": "c"}})
+
+    renamed = mapper(e2)
+    assert renamed == make_expr(dt, "c", "c")
+    assert len(mapper.cache) == 2  # renaming does not change the entry count
+
+
+def test_noop_visitor():
+    dt = plc.DataType(plc.TypeId.INT8)
+
+    mixed = make_expr(dt, "a", "b")
+    # Only "b" is mapped, so only the second operand should be renamed.
+    b_to_c = make_recursive(rename, state={"mapping": {"b": "c"}})
+    renamed = b_to_c(mixed)
+    assert renamed == make_expr(dt, "a", "c")
+
+    same = make_expr(dt, "a", "a")
+    # Without a "b" reference, a b -> c visitor must leave the expression as is.
+    b_to_c = make_recursive(rename, state={"mapping": {"b": "c"}})
+    renamed = b_to_c(same)
+    assert renamed == make_expr(dt, "a", "a")
+
+    # Both operands reference "a", so both should be renamed.
+    a_to_c = make_recursive(rename, state={"mapping": {"a": "c"}})
+    renamed = a_to_c(same)
+    assert renamed == make_expr(dt, "c", "c")
+
+
+def test_rewrite_ir_node():
+    df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]})
+    q = df.group_by("a").agg(pl.col("b").sum()).sort("b")
+
+    orig = translate_ir(q._ldf.visit())
+
+    # Same columns as `df` but different values, so the result shows
+    # which frame the rewritten plan actually read.
+    replacement = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]})
+
+    def swap_frame(node, rec):
+        if not isinstance(node, ir.DataFrameScan):
+            return reuse_if_unchanged(node, rec)
+        return ir.DataFrameScan(
+            node.schema, replacement._df, node.projection, node.predicate
+        )
+
+    rewritten = CachingVisitor(swap_frame)(orig)
+    result = rewritten.evaluate(cache={}).to_polars()
+
+    # Per-group sums of the replacement data, ordered by "b".
+    expect = pl.DataFrame({"a": [2, 1], "b": [-4, -3]})
+
+    assert_frame_equal(result, expect)
+
+
+def test_rewrite_scan_node(tmp_path):
+    left = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 3, 4]})
+    right = pl.DataFrame({"a": [1, 4, 2], "c": [1, 2, 3]})
+
+    # Persist `right` so that the query contains a parquet scan to replace.
+    pq_path = tmp_path / "right.pq"
+    right.write_parquet(pq_path)
+
+    q = left.join(pl.scan_parquet(pq_path), on="a", how="inner")
+
+    def swap_scan(node, rec):
+        if not isinstance(node, ir.Scan):
+            return reuse_if_unchanged(node, rec)
+        return ir.DataFrameScan(
+            node.schema, right._df, node.with_columns, node.predicate
+        )
+
+    orig = translate_ir(q._ldf.visit())
+    rewritten = CachingVisitor(swap_scan)(orig)
+
+    # The in-memory frame holds the same data as the file, so the
+    # rewritten plan must agree with the original query.
+    result = rewritten.evaluate(cache={}).to_polars()
+
+    expect = q.collect()
+
+    assert_frame_equal(result, expect, check_row_order=False)
+
+
+def test_rewrite_names_and_ops():
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]})
+
+    q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d")
+
+    # We will replace a -> d, c -> d, and addition with multiplication
+    expect = (
+        df.select(
+            (pl.col("d") - (pl.col("b") * pl.col("d") * 2)).alias("a"), pl.col("d")
+        )
+        .sort("d")
+        .collect()
+    )
+
+    qir = translate_ir(q._ldf.visit())
+
+    @singledispatch
+    def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr:
+        raise NotImplementedError("Unhandled")
+
+    @_transform.register
+    def _(e: expr.Col, fn: ExprTransformer):
+        mapping = fn.state["mapping"]
+        if e.name in mapping:
+            return type(e)(e.dtype, mapping[e.name])
+        return e
+
+    @_transform.register
+    def _(e: expr.BinOp, fn: ExprTransformer):
+        if e.op == plc.binaryop.BinaryOperator.ADD:  # rewrite ADD as MUL
+            return type(e)(
+                e.dtype, plc.binaryop.BinaryOperator.MUL, *map(fn, e.children)
+            )
+        return reuse_if_unchanged(e, fn)
+
+    _transform.register(expr.Expr)(reuse_if_unchanged)  # every other expression
+
+    @singledispatch
+    def _rewrite(node: ir.IR, fn: IRTransformer) -> ir.IR:
+        raise NotImplementedError("Unhandled")
+
+    @_rewrite.register
+    def _(node: ir.Select, fn: IRTransformer):
+        expr_mapper = fn.state["expr_mapper"]  # the nested expression visitor
+        return type(node)(
+            node.schema,
+            [expr.NamedExpr(e.name, expr_mapper(e.value)) for e in node.exprs],
+            node.should_broadcast,
+            fn(node.children[0]),
+        )
+
+    _rewrite.register(ir.IR)(reuse_if_unchanged)  # every other IR node
+
+    rewriter = CachingVisitor(
+        _rewrite,
+        state={
+            "expr_mapper": CachingVisitor(
+                _transform, state={"mapping": {"a": "d", "c": "d"}}
+            )
+        },
+    )
+
+    new_ir = rewriter(qir)
+
+    got = new_ir.evaluate(cache={}).to_polars()
+
+    assert_frame_equal(expect, got)
diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py
index ced49bdc254..52bc4a9ac71 100644
--- a/python/cudf_polars/tests/expressions/test_literal.py
+++ b/python/cudf_polars/tests/expressions/test_literal.py
@@ -2,11 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index 2a37683478b..62df8ce1498 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -4,11 +4,12 @@
 
 import itertools
 
-import pylibcudf as plc
 import pytest
 
 import polars as pl
 
+import pylibcudf as plc
+
 from cudf_polars import translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py
index 3c3986be19b..9900f598e5f 100644
--- a/python/cudf_polars/tests/test_config.py
+++ b/python/cudf_polars/tests/test_config.py
@@ -10,7 +10,7 @@
 
 import rmm
 
-from cudf_polars.dsl.ir import IR
+from cudf_polars.dsl.ir import DataFrameScan
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
@@ -18,10 +18,10 @@
 
 
 def test_polars_verbose_warns(monkeypatch):
-    def raise_unimplemented(self):
+    def raise_unimplemented(self, *args):
         raise NotImplementedError("We don't support this")
 
-    monkeypatch.setattr(IR, "__post_init__", raise_unimplemented)
+    monkeypatch.setattr(DataFrameScan, "__init__", raise_unimplemented)
     q = pl.LazyFrame({})
     # Ensure that things raise
     assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 7d9ec98db97..8ca7a7b9264 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -2,14 +2,18 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+from contextlib import nullcontext
+
 import pytest
 
 import polars as pl
+from polars.testing import assert_frame_equal
 
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_112
 
 
 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"])
@@ -22,6 +26,11 @@ def how(request):
     return request.param
 
 
+@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)])
+def zlice(request):  # arguments unpacked into `.slice()`; None means no slice
+    return request.param
+
+
 @pytest.fixture
 def left():
     return pl.LazyFrame(
@@ -37,8 +46,9 @@ def left():
 def right():
     return pl.LazyFrame(
         {
-            "a": [1, 4, 3, 7, None, None],
-            "c": [2, 3, 4, 5, 6, 7],
+            "a": [1, 4, 3, 7, None, None, 1],
+            "c": [2, 3, 4, 5, 6, 7, 8],
+            "d": [6, None, 7, 8, -1, 2, 4],
         }
     )
 
@@ -70,11 +80,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr):
     query = left.join(
         right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True
     )
-    assert_gpu_result_equal(query, check_row_order=False)
+    assert_gpu_result_equal(query, check_row_order=how == "left")
 
 
-def test_cross_join(left, right):
+def test_left_join_with_slice(left, right, join_nulls, zlice):
+    q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True)
+    ctx = nullcontext()  # default: the comparison at the end must succeed
+    if zlice is not None:
+        q_expect = q.collect().slice(*zlice)  # slice applied after collecting
+        q = q.slice(*zlice)
+        if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)):
+            # https://github.com/pola-rs/polars/issues/19403
+            # https://github.com/pola-rs/polars/issues/19405
+            ctx = pytest.raises(AssertionError)  # comparison below must fail
+            assert_frame_equal(
+                q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True))
+            )
+
+    with ctx:
+        assert_gpu_result_equal(q)
+
+
+def test_cross_join(left, right, zlice):
     q = left.join(right, how="cross")
+    if zlice is not None:
+        q = q.slice(*zlice)
 
     assert_gpu_result_equal(q)
 
@@ -86,3 +116,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on):
     q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
 
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize(
+    "conditions",
+    [
+        [pl.col("a") < pl.col("a_right")],
+        [pl.col("a_right") <= pl.col("a") * 2],
+        [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")],
+        [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")],
+        [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2],
+    ],
+)
+def test_join_where(left, right, conditions, zlice):
+    joined = left.join_where(right, *conditions)
+
+    assert_gpu_result_equal(joined, check_row_order=False)
+
+    if zlice is None:
+        return
+    # Row order of the join is not guaranteed, so the sliced rows themselves
+    # cannot be compared; only the number of rows in the slice is checked.
+    sliced_len = joined.slice(*zlice).select(pl.len())
+    assert_gpu_result_equal(sliced_len)
diff --git a/python/cudf_polars/tests/test_parquet_filters.py b/python/cudf_polars/tests/test_parquet_filters.py
new file mode 100644
index 00000000000..545a89250fc
--- /dev/null
+++ b/python/cudf_polars/tests/test_parquet_filters.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.DataFrame(
+        {
+            "c": ["a", "b", "c", "d", "e", "f"],
+            "a": [1, 2, 3, None, 4, 5],
+            "b": pl.Series([None, None, 3, float("inf"), 4, 0], dtype=pl.Float64),
+            "d": [-1, 2, -3, None, 4, -5],
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def pq_file(tmp_path_factory, df):
+    path = tmp_path_factory.mktemp("parquet_filter") / "tmp.pq"
+    df.write_parquet(path, row_group_size=3)  # 6 rows -> two row groups
+    return pl.scan_parquet(path)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_in([0, 1]),
+        pl.col("a").is_between(0, 2),
+        (pl.col("a") < 2).not_(),
+        pl.lit(2) > pl.col("a"),
+        pl.lit(2) >= pl.col("a"),
+        pl.lit(2) < pl.col("a"),
+        pl.lit(2) <= pl.col("a"),
+        pl.lit(0) == pl.col("a"),
+        pl.lit(1) != pl.col("a"),
+        pl.col("a") == pl.col("d"),
+        (pl.col("b") < pl.lit(2, dtype=pl.Float64).sqrt()),
+        (pl.col("a") >= pl.lit(2)) & (pl.col("b") > 0),
+        pl.col("b").is_finite(),
+        pl.col("a").is_null(),
+        pl.col("a").is_not_null(),
+        pl.col("a").abs().is_between(0, 2),
+        pl.col("a").ne_missing(pl.lit(None, dtype=pl.Int64)),
+    ],
+)
+@pytest.mark.parametrize("selection", [["c", "b"], ["a"], ["a", "c"], ["b"], "c"])
+def test_scan_by_hand(expr, selection, pq_file):
+    eager = pq_file.collect()
+    gpu_query = pq_file.filter(expr).select(*selection)
+    # Compared by hand rather than with assert_gpu_result_equal because of
+    # https://github.com/pola-rs/polars/issues/19238
+    got = gpu_query.collect(engine=pl.GPUEngine(raise_on_fail=True))
+    expect = eager.filter(expr).select(*selection)
+    assert_frame_equal(got, expect)
diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py
index e7770bfadac..3b3b4f0f8db 100644
--- a/python/cudf_polars/tests/utils/test_broadcast.py
+++ b/python/cudf_polars/tests/utils/test_broadcast.py
@@ -3,9 +3,10 @@
 
 from __future__ import annotations
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 from cudf_polars.containers import Column
 from cudf_polars.dsl.ir import broadcast
 
diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py
index 1cda9b71387..c5135bc6414 100644
--- a/python/custreamz/custreamz/tests/conftest.py
+++ b/python/custreamz/custreamz/tests/conftest.py
@@ -2,6 +2,7 @@
 import socket
 
 import pytest
+
 from custreamz import kafka
 
 
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index af45f49d9b4..a8ab05a3922 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -65,50 +65,20 @@ include = [
 ]
 exclude = ["*tests*"]
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-    "dask_cudf",
-]
-known_first_party = [
-    "streamz",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["streamz"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "cudf", "dask_cudf"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
 
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index 04c2ad65b99..f9df22cc436 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -7,15 +7,15 @@
 # do anything for dask==2024.2.0)
 config.set({"dataframe.query-planning-warning": False})
 
-import dask.dataframe as dd
-from dask.dataframe import from_delayed
+import dask.dataframe as dd  # noqa: E402
+from dask.dataframe import from_delayed  # noqa: E402
 
-import cudf
+import cudf  # noqa: E402
 
-from . import backends
-from ._version import __git_commit__, __version__
-from .core import concat, from_cudf, from_dask_dataframe
-from .expr import QUERY_PLANNING_ON
+from . import backends  # noqa: E402, F401
+from ._version import __git_commit__, __version__  # noqa: E402, F401
+from .core import concat, from_cudf, from_dask_dataframe  # noqa: E402
+from .expr import QUERY_PLANNING_ON  # noqa: E402
 
 
 def read_csv(*args, **kwargs):
@@ -55,9 +55,9 @@ def inner_func(*args, **kwargs):
     to_orc = raise_not_implemented_error("to_orc")
 
 else:
-    from .core import DataFrame, Index, Series
-    from .groupby import groupby_agg
-    from .io import read_text, to_orc
+    from .core import DataFrame, Index, Series  # noqa: F401
+    from .groupby import groupby_agg  # noqa: F401
+    from .io import read_text, to_orc  # noqa: F401
 
 
 __all__ = [
diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py
index a76b655ef42..6dadadd5263 100644
--- a/python/dask_cudf/dask_cudf/expr/__init__.py
+++ b/python/dask_cudf/dask_cudf/expr/__init__.py
@@ -12,8 +12,8 @@
     config.set({"dataframe.shuffle.method": "tasks"})
 
     try:
-        import dask_cudf.expr._collection
-        import dask_cudf.expr._expr
+        import dask_cudf.expr._collection  # noqa: F401
+        import dask_cudf.expr._expr  # noqa: F401
 
     except ImportError as err:
         # Dask *should* raise an error before this.
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py
index af83a01da98..c7cf66fbffd 100644
--- a/python/dask_cudf/dask_cudf/expr/_expr.py
+++ b/python/dask_cudf/dask_cudf/expr/_expr.py
@@ -6,11 +6,20 @@
 from dask_expr import new_collection
 from dask_expr._cumulative import CumulativeBlockwise
 from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns
+from dask_expr._groupby import (
+    DecomposableGroupbyAggregation,
+    GroupbyAggregation,
+)
 from dask_expr._reductions import Reduction, Var
 from dask_expr.io.io import FusedParquetIO
-from dask_expr.io.parquet import ReadParquetPyarrowFS
-
-from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty
+from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS
+
+from dask.dataframe.core import (
+    _concat,
+    is_dataframe_like,
+    make_meta,
+    meta_nonempty,
+)
 from dask.dataframe.dispatch import is_categorical_dtype
 from dask.typing import no_default
 
@@ -21,6 +30,210 @@
 ##
 
 
+def _get_spec_info(gb):
+    if isinstance(gb.arg, (dict, list)):
+        aggs = gb.arg.copy()
+    else:
+        aggs = gb.arg
+
+    if gb._slice and not isinstance(aggs, dict):
+        aggs = {gb._slice: aggs}  # key a bare spec by the slice
+
+    gb_cols = gb._by_columns
+    if isinstance(gb_cols, str):
+        gb_cols = [gb_cols]
+    columns = [c for c in gb.frame.columns if c not in gb_cols]
+    if not isinstance(aggs, dict):
+        aggs = {col: aggs for col in columns}  # same spec for every column
+
+    # Check whether our output will have a MultiIndex; this is the case if
+    # any value in the `aggs` dict is not a string (i.e. multiple/named
+    # aggregations per column)
+    str_cols_out = True
+    aggs_renames = {}
+    for col in aggs:
+        if isinstance(aggs[col], str) or callable(aggs[col]):
+            aggs[col] = [aggs[col]]  # wrap a single agg in a list
+        elif isinstance(aggs[col], dict):
+            str_cols_out = False
+            col_aggs = []
+            for k, v in aggs[col].items():
+                aggs_renames[col, v] = k  # (column, agg) -> given name
+                col_aggs.append(v)
+            aggs[col] = col_aggs
+        else:
+            str_cols_out = False
+        if col in gb_cols:
+            columns.append(col)  # grouping key that is also aggregated
+
+    return {
+        "aggs": aggs,
+        "columns": columns,
+        "str_cols_out": str_cols_out,
+        "aggs_renames": aggs_renames,
+    }
+
+
+def _get_meta(gb):
+    """Build the ``_meta`` object for a groupby-aggregation expression."""
+    info = gb.spec_info
+    by_columns = gb._by_columns
+    renames = info["aggs_renames"]
+    if info["str_cols_out"]:
+        # The user gave plain `str` aggregations, so the metadata must be
+        # computed from `str` values too: that keeps the output column
+        # names as strings rather than tuples.
+        spec = {col: funcs[0] for col, funcs in info["aggs"].items()}
+    else:
+        spec = info["aggs"].copy()
+    meta = gb.frame._meta.groupby(by_columns).agg(spec)
+    if renames:
+        # Swap each aggregation label for its registered rename, if any.
+        pairs = list(meta.columns)
+        col_array = [col for col, _ in pairs]
+        agg_array = [renames.get((col, agg), agg) for col, agg in pairs]
+        meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array])
+    return meta
+
+
+class DecomposableCudfGroupbyAgg(DecomposableGroupbyAggregation):
+    sep = "___"
+
+    @functools.cached_property
+    def spec_info(self):
+        return _get_spec_info(self)
+
+    @functools.cached_property
+    def _meta(self):
+        return _get_meta(self)
+
+    @property
+    def shuffle_by_index(self):
+        return False  # We always group by column(s)
+
+    @classmethod
+    def chunk(cls, df, *by, **kwargs):
+        from dask_cudf.groupby import _groupby_partition_agg
+
+        return _groupby_partition_agg(df, **kwargs)
+
+    @classmethod
+    def combine(cls, inputs, **kwargs):
+        from dask_cudf.groupby import _tree_node_agg
+
+        return _tree_node_agg(_concat(inputs), **kwargs)
+
+    @classmethod
+    def aggregate(cls, inputs, **kwargs):
+        from dask_cudf.groupby import _finalize_gb_agg
+
+        return _finalize_gb_agg(_concat(inputs), **kwargs)
+
+    @property
+    def chunk_kwargs(self) -> dict:
+        # dropna=None is treated the same as dropna=True.
+        return {
+            "gb_cols": self._by_columns,
+            "aggs": self.spec_info["aggs"],
+            "columns": self.spec_info["columns"],
+            "dropna": True if self.dropna is None else self.dropna,
+            "sort": self.sort,
+            "sep": self.sep,
+        }
+
+    @property
+    def combine_kwargs(self) -> dict:
+        # dropna=None is treated the same as dropna=True.
+        return {
+            "gb_cols": self._by_columns,
+            "dropna": True if self.dropna is None else self.dropna,
+            "sort": self.sort,
+            "sep": self.sep,
+        }
+
+    @property
+    def aggregate_kwargs(self) -> dict:
+        # dropna=None is treated the same as dropna=True, and a truthy
+        # `_slice` takes precedence over the metadata columns.
+        return {
+            "gb_cols": self._by_columns,
+            "aggs": self.spec_info["aggs"],
+            "columns": self.spec_info["columns"],
+            "final_columns": self._slice or self._meta.columns,
+            "as_index": True,
+            "dropna": True if self.dropna is None else self.dropna,
+            "sort": self.sort,
+            "sep": self.sep,
+            "str_cols_out": self.spec_info["str_cols_out"],
+            "aggs_renames": self.spec_info["aggs_renames"],
+        }
+
+
+class CudfGroupbyAgg(GroupbyAggregation):
+    @functools.cached_property
+    def spec_info(self):
+        return _get_spec_info(self)
+
+    @functools.cached_property
+    def _meta(self):
+        return _get_meta(self)
+
+    def _lower(self):  # lowers to the decomposable cuDF class, same operands
+        return DecomposableCudfGroupbyAgg(
+            self.frame,
+            self.arg,
+            self.observed,
+            self.dropna,
+            self.split_every,
+            self.split_out,
+            self.sort,
+            self.shuffle_method,
+            self._slice,
+            *self.by,
+        )
+
+
+def _maybe_get_custom_expr(
+    gb,
+    aggs,
+    split_every=None,
+    split_out=None,
+    shuffle_method=None,
+    **kwargs,
+):
+    """Return a ``CudfGroupbyAgg`` expression, or None if not applicable."""
+
+    from dask_cudf.groupby import (
+        OPTIMIZED_AGGS,
+        _aggs_optimized,
+        _redirect_aggs,
+    )
+
+    # Give up (return None) when unsupported keyword arguments were
+    # passed or when the data is not cuDF-backed.
+    if kwargs or not hasattr(gb.obj._meta, "to_pandas"):
+        return None
+
+    # Give up as well when one or more of the requested aggregations
+    # are unsupported.
+    redirected = _redirect_aggs(aggs)
+    if not _aggs_optimized(redirected, OPTIMIZED_AGGS):
+        return None
+
+    return CudfGroupbyAgg(
+        gb.obj.expr,
+        redirected,
+        gb.observed,
+        gb.dropna,
+        split_every,
+        split_out,
+        gb.sort,
+        shuffle_method,
+        gb._slice,
+        *gb.by,
+    )
+
+
 class CudfFusedParquetIO(FusedParquetIO):
     @staticmethod
     def _load_multiple_files(
@@ -89,16 +302,34 @@ def _dataset_info(self):
         return dataset_info
 
     @staticmethod
-    def _table_to_pandas(
-        table,
-        index_name,
-        *args,
-    ):
+    def _table_to_pandas(table, index_name):
         df = cudf.DataFrame.from_arrow(table)
         if index_name is not None:
             df = df.set_index(index_name)
         return df
 
+    def _filtered_task(self, index: int):
+        """Build the nested task tuple that loads fragment ``index``."""
+        # Look at columns/index only after confirming they are set
+        columns = None if self.columns is None else self.columns.copy()
+        index_name = None if self.index is None else self.index.name
+        schema = self._dataset_info["schema"].remove_metadata()
+        if index_name:
+            if columns is None:
+                columns = list(schema.names)
+            columns.append(index_name)
+        return (
+            self._table_to_pandas,
+            (
+                self._fragment_to_table,
+                FragmentWrapper(self.fragments[index], filesystem=self.fs),
+                self.filters,
+                columns,
+                schema,
+            ),
+            index_name,
+        )
+
     def _tune_up(self, parent):
         if self._fusion_compression_factor >= 1:
             return
diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py
index 65688115b59..8a16fe7615d 100644
--- a/python/dask_cudf/dask_cudf/expr/_groupby.py
+++ b/python/dask_cudf/dask_cudf/expr/_groupby.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from dask_expr._collection import new_collection
 from dask_expr._groupby import (
     GroupBy as DXGroupBy,
     SeriesGroupBy as DXSeriesGroupBy,
@@ -11,6 +12,8 @@
 
 from cudf.core.groupby.groupby import _deprecate_collect
 
+from dask_cudf.expr._expr import _maybe_get_custom_expr
+
 ##
 ## Custom groupby classes
 ##
@@ -54,9 +57,16 @@ def _translate_arg(arg):
         return arg
 
 
-# TODO: These classes are mostly a work-around for missing
-# `observed=False` support.
-# See: https://github.com/rapidsai/cudf/issues/15173
+# We define our own GroupBy classes in Dask cuDF for
+# the following reasons:
+#  (1) We want to use a custom `aggregate` algorithm
+#      that performs multiple aggregations on the
+#      same dataframe partition at once. The upstream
+#      algorithm breaks distinct aggregations into
+#      separate tasks.
+#  (2) We need to work around missing `observed=False`
+#      support:
+#      https://github.com/rapidsai/cudf/issues/15173
 
 
 class GroupBy(DXGroupBy):
@@ -89,8 +99,15 @@ def collect(self, **kwargs):
         _deprecate_collect()
         return self._single_agg(ListAgg, **kwargs)
 
-    def aggregate(self, arg, **kwargs):
-        return super().aggregate(_translate_arg(arg), **kwargs)
+    def aggregate(self, arg, fused=True, **kwargs):
+        # With ``fused=True``, try the custom cuDF expression first.
+        # A ``None`` result means that path cannot handle this call,
+        # so fall through to the upstream implementation below.
+        if fused:
+            expr = _maybe_get_custom_expr(self, arg, **kwargs)
+            if expr is not None:
+                return new_collection(expr)
+        return super().aggregate(_translate_arg(arg), **kwargs)
 
 
 class SeriesGroupBy(DXSeriesGroupBy):
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index 76bb2ea99b4..0421bd755f4 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-from .csv import read_csv
-from .json import read_json
-from .orc import read_orc, to_orc
-from .text import read_text
+from .csv import read_csv  # noqa: F401
+from .json import read_json  # noqa: F401
+from .orc import read_orc, to_orc  # noqa: F401
+from .text import read_text  # noqa: F401
 
 try:
-    from .parquet import read_parquet, to_parquet
+    from .parquet import read_parquet, to_parquet  # noqa: F401
 except ImportError:
     pass
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index a781b8242fe..39ac6474958 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -383,6 +383,12 @@ def write_metadata(parts, fmd, fs, path, append=False, **kwargs):
             metadata_path = fs.sep.join([path, "_metadata"])
             _meta = []
             if append and fmd is not None:
+                # Convert to bytes: <https://github.com/rapidsai/cudf/issues/17177>
+                if isinstance(fmd, pq.FileMetaData):
+                    with BytesIO() as myio:
+                        fmd.write_metadata_file(myio)
+                        myio.seek(0)
+                        fmd = np.frombuffer(myio.read(), dtype="uint8")
                 _meta = [fmd]
             _meta.extend([parts[i][0]["meta"] for i in range(len(parts))])
             _meta = (
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 896c4169f5b..a29cf9a342a 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -15,7 +15,11 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr
+from dask_cudf.tests.utils import (
+    require_dask_expr,
+    skip_dask_expr,
+    xfail_dask_expr,
+)
 
 # Check if create_metadata_file is supported by
 # the current dask.dataframe version
@@ -615,3 +619,48 @@ def test_timezone_column(tmpdir):
     got = dask_cudf.read_parquet(path)
     expect = cudf.read_parquet(path)
     dd.assert_eq(got, expect)
+
+
+@require_dask_expr()
+@pytest.mark.skipif(
+    not dask_cudf.backends.PYARROW_GE_15,
+    reason="Requires pyarrow 15",
+)
+@pytest.mark.parametrize("min_part_size", ["1B", "1GB"])
+def test_read_parquet_arrow_filesystem(tmpdir, min_part_size):
+    # Round-trip a small dataset through ``filesystem="arrow"`` and
+    # check that both the metadata and the result stay cuDF-backed
+    path = str(tmpdir)
+    data = {"x": range(1000), "y": ["a", "b", "c", "d"] * 250}
+    config = {
+        "dataframe.backend": "cudf",
+        "dataframe.parquet.minimum-partition-size": min_part_size,
+    }
+    with dask.config.set(config):
+        source = dd.from_dict(data, npartitions=10)
+        source.to_parquet(path, write_index=False)
+        expect = cudf.read_parquet(path)
+        got = dask_cudf.read_parquet(path, filesystem="arrow")
+        dd.assert_eq(expect, got, check_index=False)
+        assert isinstance(got._meta, cudf.DataFrame)
+        assert isinstance(got.compute(), cudf.DataFrame)
+
+
+@pytest.mark.parametrize("write_metadata_file", [True, False])
+def test_to_parquet_append(tmpdir, write_metadata_file):
+    # Appending the same single-partition frame twice must read back
+    # as the frame concatenated with itself
+    df = cudf.DataFrame({"a": [1, 2, 3]})
+    ddf = dask_cudf.from_cudf(df, npartitions=1)
+    write_kwargs = {
+        "append": True,
+        "write_metadata_file": write_metadata_file,
+        "write_index": False,
+    }
+    # Issue the identical append-mode write twice against ``tmpdir``
+    for _ in range(2):
+        ddf.to_parquet(tmpdir, **write_kwargs)
+
+    expect = cudf.concat([df, df])
+    got = dask_cudf.read_parquet(tmpdir)
+    dd.assert_eq(expect, got)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index cf8af82e112..90907f6fb99 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -11,6 +11,8 @@
 
 from dask.dataframe import assert_eq
 
+import cudf
+
 import dask_cudf
 from dask_cudf.tests.utils import QUERY_PLANNING_ON
 
@@ -168,6 +170,8 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem):
                 filesystem=filesystem,
             )
         assert df.b.sum().compute() == 9
+        assert isinstance(df._meta, cudf.DataFrame)
+        assert isinstance(df.compute(), cudf.DataFrame)
 
 
 def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf):
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index e30474f6b94..042e69d86f4 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -14,7 +14,11 @@
 
 import dask_cudf
 from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized
-from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr
+from dask_cudf.tests.utils import (
+    QUERY_PLANNING_ON,
+    require_dask_expr,
+    xfail_dask_expr,
+)
 
 
 def assert_cudf_groupby_layers(ddf):
@@ -556,10 +560,22 @@ def test_groupby_categorical_key():
         ),
     ],
 )
+@pytest.mark.parametrize(
+    "fused",
+    [
+        True,
+        pytest.param(
+            False,
+            marks=require_dask_expr("Not supported by legacy API"),
+        ),
+    ],
+)
 @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2])
 @pytest.mark.parametrize("split_every", [False, 4])
 @pytest.mark.parametrize("npartitions", [1, 10])
-def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
+def test_groupby_agg_params(
+    npartitions, split_every, split_out, fused, as_index
+):
     df = cudf.datasets.randomdata(
         nrows=150,
         dtypes={"name": str, "a": int, "b": int, "c": float},
@@ -574,6 +590,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
         "c": ["mean", "std", "var"],
     }
 
+    fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {}
     split_kwargs = {"split_every": split_every, "split_out": split_out}
     if split_out == "use_dask_default":
         split_kwargs.pop("split_out")
@@ -593,6 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
             ddf.groupby(["name", "a"], sort=True, **maybe_as_index)
             .aggregate(
                 agg_dict,
+                **fused_kwarg,
                 **split_kwargs,
             )
             .compute()
@@ -614,6 +632,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
     # Full check (`sort=False`)
     gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate(
         agg_dict,
+        **fused_kwarg,
         **split_kwargs,
     )
     pr = pddf.groupby(["name", "a"], sort=False).agg(
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index fbcd7ae5dfb..862e8f36eaa 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -69,50 +69,17 @@ version = {file = "dask_cudf/VERSION"}
 [tool.setuptools.packages.find]
 exclude = ["*tests*"]
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
+[tool.ruff]
+extend = "../../pyproject.toml"
 
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-]
-known_first_party = [
-    "dask_cudf",
-]
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["dask_cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
 
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-]
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "cudf"]
 
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
@@ -126,5 +93,8 @@ filterwarnings = [
     # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
     "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
     "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask",
+    # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437
+    # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False`
+    "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning",
 ]
 xfail_strict = true
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 84660cbc276..c6d9ae56467 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
-    "nvidia-nvcomp==4.0.1",
+    "nvidia-nvcomp==4.1.0.6",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index 15dd2b4c34f..b1d9656afc2 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -26,6 +26,7 @@ set(cython_sources
     filling.pyx
     gpumemoryview.pyx
     groupby.pyx
+    hashing.pyx
     interop.pyx
     join.pyx
     json.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index aa67b4b1149..aa2ce957173 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -13,6 +13,8 @@ from . cimport (
     expressions,
     filling,
     groupby,
+    hashing,
+    interop,
     join,
     json,
     labeling,
@@ -62,6 +64,8 @@ __all__ = [
     "filling",
     "gpumemoryview",
     "groupby",
+    "hashing",
+    "interop",
     "join",
     "json",
     "lists",
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 4033062b7e2..62a2170f83e 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -22,6 +22,7 @@
     expressions,
     filling,
     groupby,
+    hashing,
     interop,
     io,
     join,
@@ -73,6 +74,7 @@
     "filling",
     "gpumemoryview",
     "groupby",
+    "hashing",
     "interop",
     "io",
     "join",
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index 51b2b4cfaa3..eef73bf4e9d 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -100,6 +100,7 @@ cpdef bool is_supported_operation(
         The right hand side data type.
     op : BinaryOperator
         The operation to check.
+
     Returns
     -------
     bool
diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd
index 72ce680ba7a..335ef435f9b 100644
--- a/python/pylibcudf/pylibcudf/datetime.pxd
+++ b/python/pylibcudf/pylibcudf/datetime.pxd
@@ -1,15 +1,56 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from pylibcudf.libcudf.datetime cimport datetime_component
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.datetime cimport datetime_component, rounding_frequency
+from pylibcudf.scalar cimport Scalar
 
-from .column cimport Column
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
+cpdef Column extract_millisecond_fraction(
+    Column input
+)
+
+cpdef Column extract_microsecond_fraction(
+    Column input
+)
 
-cpdef Column extract_year(
-    Column col
+cpdef Column extract_nanosecond_fraction(
+    Column input
 )
 
 cpdef Column extract_datetime_component(
-    Column col,
+    Column input,
     datetime_component component
 )
+
+cpdef Column ceil_datetimes(
+    Column input,
+    rounding_frequency freq
+)
+
+cpdef Column floor_datetimes(
+    Column input,
+    rounding_frequency freq
+)
+
+cpdef Column round_datetimes(
+    Column input,
+    rounding_frequency freq
+)
+
+cpdef Column add_calendrical_months(
+    Column input,
+    ColumnOrScalar months,
+)
+
+cpdef Column day_of_year(Column input)
+
+cpdef Column is_leap_year(Column input)
+
+cpdef Column last_day_of_month(Column input)
+
+cpdef Column extract_quarter(Column input)
+
+cpdef Column days_in_month(Column input)
diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx
index ac4335cca56..9e5e709d81d 100644
--- a/python/pylibcudf/pylibcudf/datetime.pyx
+++ b/python/pylibcudf/pylibcudf/datetime.pyx
@@ -3,41 +3,106 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.datetime cimport (
+    add_calendrical_months as cpp_add_calendrical_months,
+    ceil_datetimes as cpp_ceil_datetimes,
     datetime_component,
+    day_of_year as cpp_day_of_year,
+    days_in_month as cpp_days_in_month,
     extract_datetime_component as cpp_extract_datetime_component,
-    extract_year as cpp_extract_year,
+    extract_microsecond_fraction as cpp_extract_microsecond_fraction,
+    extract_millisecond_fraction as cpp_extract_millisecond_fraction,
+    extract_nanosecond_fraction as cpp_extract_nanosecond_fraction,
+    extract_quarter as cpp_extract_quarter,
+    floor_datetimes as cpp_floor_datetimes,
+    is_leap_year as cpp_is_leap_year,
+    last_day_of_month as cpp_last_day_of_month,
+    round_datetimes as cpp_round_datetimes,
+    rounding_frequency,
 )
 
 from pylibcudf.libcudf.datetime import \
     datetime_component as DatetimeComponent  # no-cython-lint
+from pylibcudf.libcudf.datetime import \
+    rounding_frequency as RoundingFrequency  # no-cython-lint
+
+from cython.operator cimport dereference
 
 from .column cimport Column
 
+cpdef Column extract_millisecond_fraction(
+    Column input
+):
+    """
+    Extract the millisecond fraction from a datetime column.
+
+    For details, see :cpp:func:`extract_millisecond_fraction`.
+
+    Parameters
+    ----------
+    input : Column
+        The column to extract the millisecond fraction from.
+
+    Returns
+    -------
+    Column
+        Column with the extracted millisecond fractions.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_extract_millisecond_fraction(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column extract_microsecond_fraction(
+    Column input
+):
+    """
+    Extract the microsecond fraction from a datetime column.
+
+    For details, see :cpp:func:`extract_microsecond_fraction`.
+
+    Parameters
+    ----------
+    input : Column
+        The column to extract the microsecond fraction from.
+
+    Returns
+    -------
+    Column
+        Column with the extracted microsecond fractions.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_extract_microsecond_fraction(input.view())
+    return Column.from_libcudf(move(result))
 
-cpdef Column extract_year(
-    Column values
+cpdef Column extract_nanosecond_fraction(
+    Column input
 ):
     """
-    Extract the year from a datetime column.
+    Extract the nanosecond fraction from a datetime column.
+
+    For details, see :cpp:func:`extract_nanosecond_fraction`.
 
     Parameters
     ----------
-    values : Column
-        The column to extract the year from.
+    input : Column
+        The column to extract the nanosecond fraction from.
 
     Returns
     -------
     Column
-        Column with the extracted years.
+        Column with the extracted nanosecond fractions.
     """
     cdef unique_ptr[column] result
 
     with nogil:
-        result = cpp_extract_year(values.view())
+        result = cpp_extract_nanosecond_fraction(input.view())
     return Column.from_libcudf(move(result))
 
 cpdef Column extract_datetime_component(
-    Column values,
+    Column input,
     datetime_component component
 ):
     """
@@ -47,7 +112,7 @@ cpdef Column extract_datetime_component(
 
     Parameters
     ----------
-    values : Column
+    input : Column
         The column to extract the component from.
     component : DatetimeComponent
         The datetime component to extract.
@@ -60,5 +125,237 @@ cpdef Column extract_datetime_component(
     cdef unique_ptr[column] result
 
     with nogil:
-        result = cpp_extract_datetime_component(values.view(), component)
+        result = cpp_extract_datetime_component(input.view(), component)
+    return Column.from_libcudf(move(result))
+
+cpdef Column ceil_datetimes(
+    Column input,
+    rounding_frequency freq
+):
+    """
+    Round datetimes up to the nearest multiple of the given frequency.
+
+    For details, see :cpp:func:`ceil_datetimes`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+    freq : RoundingFrequency
+        The frequency to round up to.
+
+    Returns
+    -------
+    Column
+        Column of the same datetime resolution as the input column.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_ceil_datetimes(input.view(), freq)
+    return Column.from_libcudf(move(result))
+
+cpdef Column floor_datetimes(
+    Column input,
+    rounding_frequency freq
+):
+    """
+    Round datetimes down to the nearest multiple of the given frequency.
+
+    For details, see :cpp:func:`floor_datetimes`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+    freq : RoundingFrequency
+        The frequency to round down to.
+
+    Returns
+    -------
+    Column
+        Column of the same datetime resolution as the input column.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_floor_datetimes(input.view(), freq)
+    return Column.from_libcudf(move(result))
+
+cpdef Column round_datetimes(
+    Column input,
+    rounding_frequency freq
+):
+    """
+    Round datetimes to the nearest multiple of the given frequency.
+
+    For details, see :cpp:func:`round_datetimes`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+    freq : RoundingFrequency
+        The frequency to round to.
+
+    Returns
+    -------
+    Column
+        Column of the same datetime resolution as the input column.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_round_datetimes(input.view(), freq)
+    return Column.from_libcudf(move(result))
+
+cpdef Column add_calendrical_months(
+    Column input,
+    ColumnOrScalar months,
+):
+    """
+    Adds or subtracts a number of months from each input
+    timestamp and returns a timestamp column that is of the
+    same type as the input timestamps column.
+
+    For details, see :cpp:func:`add_calendrical_months`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input timestamp values.
+    months : ColumnOrScalar
+        The number of months to add (a Column or a Scalar).
+
+    Returns
+    -------
+    Column
+        Column of computed timestamps.
+    """
+    if not isinstance(months, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_add_calendrical_months(
+            input.view(),
+            months.view() if ColumnOrScalar is Column else
+            dereference(months.get())
+        )
+    return Column.from_libcudf(move(result))
+
+cpdef Column day_of_year(Column input):
+    """
+    Computes the day number since the start of
+    the year from the datetime. The value is in
+    [1, 365], or [1, 366] for leap years.
+
+    For details, see :cpp:func:`day_of_year`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of day numbers.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_day_of_year(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column is_leap_year(Column input):
+    """
+    Check if the year of the given date is a leap year.
+
+    For details, see :cpp:func:`is_leap_year`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of bools indicating whether the given year
+        is a leap year.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_is_leap_year(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column last_day_of_month(Column input):
+    """
+    Computes the last day of the month.
+
+    For details, see :cpp:func:`last_day_of_month`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of ``TIMESTAMP_DAYS`` representing the last day
+        of the month.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_last_day_of_month(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column extract_quarter(Column input):
+    """
+    Returns the quarter (i.e. a value from {1, 2, 3, 4})
+    that the date is in.
+
+    For details, see :cpp:func:`extract_quarter`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column indicating which quarter the date is in.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_extract_quarter(input.view())
+    return Column.from_libcudf(move(result))
+
+cpdef Column days_in_month(Column input):
+    """
+    Extract the number of days in the month.
+
+    For details, see :cpp:func:`days_in_month`.
+
+    Parameters
+    ----------
+    input : Column
+        The column of input datetime values.
+
+    Returns
+    -------
+    Column
+        Column of the number of days in the given month.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_days_in_month(input.view())
     return Column.from_libcudf(move(result))
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
index a44c9e25987..1535f68366b 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -5,7 +5,17 @@ from pylibcudf.libcudf.expressions import \
     table_reference as TableReference  # no-cython-lint
 
 from cython.operator cimport dereference
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport (
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+from libcpp cimport bool
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -18,12 +28,14 @@ from pylibcudf.libcudf.scalar.scalar cimport (
 )
 from pylibcudf.libcudf.types cimport size_type, type_id
 from pylibcudf.libcudf.wrappers.durations cimport (
+    duration_D,
     duration_ms,
     duration_ns,
     duration_s,
     duration_us,
 )
 from pylibcudf.libcudf.wrappers.timestamps cimport (
+    timestamp_D,
     timestamp_ms,
     timestamp_ns,
     timestamp_s,
@@ -78,6 +90,34 @@ cdef class Literal(Expression):
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <numeric_scalar[int32_t] &>dereference(self.scalar.c_obj)
             ))
+        elif tid == type_id.INT16:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int16_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.INT8:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int8_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT64:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint64_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT32:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint32_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT16:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint16_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.UINT8:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[uint8_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.BOOL8:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[bool] &>dereference(self.scalar.c_obj)
+            ))
         elif tid == type_id.FLOAT64:
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <numeric_scalar[double] &>dereference(self.scalar.c_obj)
@@ -110,6 +150,10 @@ cdef class Literal(Expression):
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <timestamp_scalar[timestamp_s] &>dereference(self.scalar.c_obj)
             ))
+        elif tid == type_id.TIMESTAMP_DAYS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_D] &>dereference(self.scalar.c_obj)
+            ))
         elif tid == type_id.DURATION_NANOSECONDS:
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <duration_scalar[duration_ns] &>dereference(self.scalar.c_obj)
@@ -130,6 +174,10 @@ cdef class Literal(Expression):
             self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
                 <duration_scalar[duration_s] &>dereference(self.scalar.c_obj)
             ))
+        elif tid == type_id.DURATION_DAYS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_D] &>dereference(self.scalar.c_obj)
+            ))
         else:
             raise NotImplementedError(
                 f"Don't know how to make literal with type id {tid}"
diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
index 0372e1132cc..a47004a1e42 100644
--- a/python/pylibcudf/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -77,6 +77,10 @@ cpdef void fill_in_place(
         The index at which to stop filling.
     value : Scalar
         The value to fill with.
+
+    Returns
+    -------
+    None
     """
 
     with nogil:
@@ -101,6 +105,7 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step):
         The initial value of the sequence
     step : Scalar
         The step of the sequence
+
     Returns
     -------
     pylibcudf.Column
diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd
new file mode 100644
index 00000000000..2d070ddda69
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/hashing.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Column murmurhash3_x86_32(
+    Table input,
+    uint32_t seed=*
+)
+
+cpdef Table murmurhash3_x64_128(
+    Table input,
+    uint64_t seed=*
+)
+
+
+cpdef Column xxhash_64(
+    Table input,
+    uint64_t seed=*
+)
+
+cpdef Column md5(Table input)
+cpdef Column sha1(Table input)
+cpdef Column sha224(Table input)
+cpdef Column sha256(Table input)
+cpdef Column sha384(Table input)
+cpdef Column sha512(Table input)
diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx
new file mode 100644
index 00000000000..9ea3d4d1bda
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/hashing.pyx
@@ -0,0 +1,240 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.hash cimport (
+    DEFAULT_HASH_SEED,
+    md5 as cpp_md5,
+    murmurhash3_x64_128 as cpp_murmurhash3_x64_128,
+    murmurhash3_x86_32 as cpp_murmurhash3_x86_32,
+    sha1 as cpp_sha1,
+    sha224 as cpp_sha224,
+    sha256 as cpp_sha256,
+    sha384 as cpp_sha384,
+    sha512 as cpp_sha512,
+    xxhash_64 as cpp_xxhash_64,
+)
+from pylibcudf.libcudf.table.table cimport table
+
+from .column cimport Column
+from .table cimport Table
+
+LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED
+
+cpdef Column murmurhash3_x86_32(
+    Table input,
+    uint32_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the MurmurHash3 32-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`murmurhash3_x86_32`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint32_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_murmurhash3_x86_32(
+            input.view(),
+            seed
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table murmurhash3_x64_128(
+    Table input,
+    uint64_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the MurmurHash3 64-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`murmurhash3_x64_128`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint64_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Table
+        A table of two UINT64 columns
+    """
+    cdef unique_ptr[table] c_result
+    with nogil:
+        c_result = cpp_murmurhash3_x64_128(
+            input.view(),
+            seed
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column xxhash_64(
+    Table input,
+    uint64_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the xxHash 64-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`xxhash_64`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint64_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+
+    cdef unique_ptr[column] c_result
+    with  nogil:
+        c_result = cpp_xxhash_64(
+            input.view(),
+            seed
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column md5(Table input):
+    """Computes the MD5 hash value of each row in the given table.
+
+    For details, see :cpp:func:`md5`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the md5 hash of a row from the input
+
+    """
+
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_md5(input.view())
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column sha1(Table input):
+    """Computes the SHA-1 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha1`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha1(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha224(Table input):
+    """Computes the SHA-224 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha224`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha224(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha256(Table input):
+    """Computes the SHA-256 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha256`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha256(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha384(Table input):
+    """Computes the SHA-384 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha384`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha384(input.view())
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sha512(Table input):
+    """Computes the SHA-512 hash value of each row in the given table.
+
+    For details, see :cpp:func:`sha512`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sha512(input.view())
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd
new file mode 100644
index 00000000000..2a0a8c15fdd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/interop.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table cimport Table
+
+
+cpdef Table from_dlpack(object managed_tensor)
+
+cpdef object to_dlpack(Table input)
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
index 642516a1b90..61e812353b7 100644
--- a/python/pylibcudf/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -1,6 +1,11 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
-from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New
+from cpython.pycapsule cimport (
+    PyCapsule_GetPointer,
+    PyCapsule_IsValid,
+    PyCapsule_New,
+    PyCapsule_SetName,
+)
 from libc.stdlib cimport free
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport (
     ArrowArray,
     ArrowArrayStream,
     ArrowSchema,
+    DLManagedTensor,
     column_metadata,
     from_arrow_column as cpp_from_arrow_column,
     from_arrow_stream as cpp_from_arrow_stream,
+    from_dlpack as cpp_from_dlpack,
     to_arrow_host_raw,
     to_arrow_schema_raw,
+    to_dlpack as cpp_to_dlpack,
 )
 from pylibcudf.libcudf.table.table cimport table
 
@@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None):
     # Note that metadata for scalars is primarily important for preserving
     # information on nested types since names are otherwise irrelevant.
     return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0]
+
+
+cpdef Table from_dlpack(object managed_tensor):
+    """
+    Convert a DLPack DLTensor into a cudf table.
+
+    For details, see :cpp:func:`cudf::from_dlpack`
+
+    Parameters
+    ----------
+    managed_tensor : PyCapsule
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    Table
+        Table with a copy of the tensor data.
+    """
+    if not PyCapsule_IsValid(managed_tensor, "dltensor"):
+        raise ValueError("Invalid PyCapsule object")
+    cdef unique_ptr[table] c_result
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        managed_tensor, "dltensor"
+    )
+    if dlpack_tensor is NULL:
+        raise ValueError("PyCapsule object contained a NULL pointer")
+    PyCapsule_SetName(managed_tensor, "used_dltensor")
+
+    # Note: A copy is always performed when converting the dlpack
+    # data to a libcudf table, and libcudf's from_dlpack never deletes the
+    # tensor. Free it in `finally` so a failed conversion does not leak it.
+    # TODO: https://github.com/rapidsai/cudf/issues/10874
+    # TODO: https://github.com/rapidsai/cudf/issues/10849
+    try:
+        with nogil:
+            c_result = cpp_from_dlpack(dlpack_tensor)
+    finally:
+        dlpack_tensor.deleter(dlpack_tensor)
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef object to_dlpack(Table input):
+    """
+    Convert a cudf table into a DLPack DLTensor.
+
+    For details, see :cpp:func:`cudf::to_dlpack`
+
+    Parameters
+    ----------
+    input : Table
+        Table to convert; its columns must not contain nulls.
+
+    Returns
+    -------
+    PyCapsule
+        1D or 2D DLPack tensor with a copy of the table data, or nullptr.
+    """
+    for col in input._columns:
+        if col.null_count():
+            raise ValueError(
+                "Cannot create a DLPack tensor with null values. "
+                "Input is required to have null count as zero."
+            )
+    cdef DLManagedTensor *dlpack_tensor
+
+    with nogil:
+        dlpack_tensor = cpp_to_dlpack(input.view())
+
+    return PyCapsule_New(
+        dlpack_tensor,
+        "dltensor",
+        dlmanaged_tensor_pycapsule_deleter
+    )
+
+
+cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
+    if PyCapsule_IsValid(pycap_obj, "used_dltensor"):
+        # we do not call a used capsule's deleter
+        return
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        pycap_obj, "dltensor"
+    )
+    if dlpack_tensor is not NULL:
+        dlpack_tensor.deleter(dlpack_tensor)
diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
index 73cdfb96af5..8bbc120cff8 100644
--- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int32_t, uint8_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
@@ -41,14 +41,14 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
         datetime_component component
     ) except +
 
-    ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency":
-        DAY "cudf::datetime::rounding_frequency::DAY"
-        HOUR "cudf::datetime::rounding_frequency::HOUR"
-        MINUTE "cudf::datetime::rounding_frequency::MINUTE"
-        SECOND "cudf::datetime::rounding_frequency::SECOND"
-        MILLISECOND "cudf::datetime::rounding_frequency::MILLISECOND"
-        MICROSECOND "cudf::datetime::rounding_frequency::MICROSECOND"
-        NANOSECOND "cudf::datetime::rounding_frequency::NANOSECOND"
+    cpdef enum class rounding_frequency(int32_t):
+        DAY
+        HOUR
+        MINUTE
+        SECOND
+        MILLISECOND
+        MICROSECOND
+        NANOSECOND
 
     cdef unique_ptr[column] ceil_datetimes(
         const column_view& column, rounding_frequency freq
@@ -64,6 +64,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
         const column_view& timestamps,
         const column_view& months
     ) except +
+    cdef unique_ptr[column] add_calendrical_months(
+        const column_view& timestamps,
+        const scalar& months
+    ) except +
     cdef unique_ptr[column] day_of_year(const column_view& column) except +
     cdef unique_ptr[column] is_leap_year(const column_view& column) except +
     cdef unique_ptr[column] last_day_of_month(
diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
index 51678ba69d8..c4222bc9dc5 100644
--- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
@@ -3,6 +3,7 @@
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
+from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.table.table_view cimport table_view
@@ -10,36 +11,44 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 
 cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:
 
-    cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" (
+    cdef unique_ptr[column] murmurhash3_x86_32(
         const table_view& input,
         const uint32_t seed
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] md5 "cudf::hashing::md5" (
+    cdef unique_ptr[table] murmurhash3_x64_128(
+        const table_view& input,
+        const uint64_t seed
+    ) except +libcudf_exception_handler
+
+    cdef unique_ptr[column] md5(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha1 "cudf::hashing::sha1" (
+    cdef unique_ptr[column] sha1(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha224 "cudf::hashing::sha224" (
+    cdef unique_ptr[column] sha224(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha256 "cudf::hashing::sha256" (
+    cdef unique_ptr[column] sha256(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha384 "cudf::hashing::sha384" (
+    cdef unique_ptr[column] sha384(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] sha512 "cudf::hashing::sha512" (
+    cdef unique_ptr[column] sha512(
         const table_view& input
-    ) except +
+    ) except +libcudf_exception_handler
 
-    cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" (
+    cdef unique_ptr[column] xxhash_64(
         const table_view& input,
         const uint64_t seed
-    ) except +
+    ) except +libcudf_exception_handler
+
+cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil:
+    cdef uint32_t DEFAULT_HASH_SEED
diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pyx b/python/pylibcudf/pylibcudf/libcudf/hash.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
index 30b97fdec34..b75e9ca7001 100644
--- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil:
 
 cdef extern from "cudf/interop.hpp" namespace "cudf" \
         nogil:
-    cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor
-                                       ) except +
+    cdef unique_ptr[table] from_dlpack(
+        const DLManagedTensor* managed_tensor
+    ) except +
 
-    DLManagedTensor* to_dlpack(table_view input_table
-                               ) except +
+    DLManagedTensor* to_dlpack(
+        const table_view& input
+    ) except +
 
     cdef cppclass column_metadata:
         column_metadata() except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
index 673bffa28ae..be3a2d75718 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
@@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
-    ctypedef enum letter_type:
-        CONSONANT 'nvtext::letter_type::CONSONANT'
-        VOWEL 'nvtext::letter_type::VOWEL'
+    cpdef enum class letter_type:
+        CONSONANT
+        VOWEL
 
     cdef unique_ptr[column] porter_stemmer_measure(
         const column_view & strings
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
index aabac0a617b..8dac86d688d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
@@ -9,14 +9,14 @@ from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
-    cdef cppclass tokenizer_result "nvtext::tokenizer_result":
+    cdef cppclass tokenizer_result:
         uint32_t nrows_tensor
         uint32_t sequence_length
         unique_ptr[column] tensor_token_ids
         unique_ptr[column] tensor_attention_mask
         unique_ptr[column] tensor_metadata
 
-    cdef struct hashed_vocabulary "nvtext::hashed_vocabulary":
+    cdef cppclass hashed_vocabulary:
         uint16_t first_token_id
         uint16_t separator_token_id
         uint16_t unknown_token_id
@@ -26,6 +26,8 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
         unique_ptr[column] table
         unique_ptr[column] bin_coefficients
         unique_ptr[column] bin_offsets
+        unique_ptr[column] cp_metadata
+        unique_ptr[column] aux_cp_table
 
     cdef unique_ptr[hashed_vocabulary] load_vocabulary_file(
         const string &filename_hashed_vocabulary
@@ -33,7 +35,7 @@ cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
 
     cdef tokenizer_result subword_tokenize(
         const column_view & strings,
-        hashed_vocabulary & hashed_vocablary_obj,
+        hashed_vocabulary & hashed_vocabulary_obj,
         uint32_t max_sequence_length,
         uint32_t stride,
         bool do_lower,
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
index 40f0e2fa50c..6b0c90d0acc 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
@@ -6,6 +6,7 @@ from libcpp.vector cimport vector
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
 from pylibcudf.libcudf.strings.regex_program cimport regex_program
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
@@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type
 cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil:
 
     cdef unique_ptr[column] replace_re(
-        column_view source_strings,
-        regex_program,
-        string_scalar repl,
-        size_type maxrepl) except +
-
-    cdef unique_ptr[column] replace_with_backrefs(
-        column_view source_strings,
-        regex_program,
-        string repl) except +
+        column_view input,
+        regex_program prog,
+        string_scalar replacement,
+        size_type max_replace_count) except +
 
     cdef unique_ptr[column] replace_re(
-        column_view source_strings,
+        column_view input,
         vector[string] patterns,
-        column_view repls) except +
+        column_view replacements,
+        regex_flags flags) except +
+
+    cdef unique_ptr[column] replace_with_backrefs(
+        column_view input,
+        regex_program prog,
+        string replacement) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
index d21510bd731..47d79083b66 100644
--- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
@@ -27,6 +27,11 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         column_view input
     ) except +
 
+    cdef unique_ptr[column] compute_column(
+        table_view table,
+        expression expr
+    ) except +
+
     cdef unique_ptr[column] transform(
         column_view input,
         string unary_udf,
diff --git a/python/pylibcudf/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd
index eabae68bc90..60e293e5cdb 100644
--- a/python/pylibcudf/pylibcudf/libcudf/types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/types.pxd
@@ -70,18 +70,19 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
         TIMESTAMP_MILLISECONDS
         TIMESTAMP_MICROSECONDS
         TIMESTAMP_NANOSECONDS
-        DICTIONARY32
-        STRING
-        LIST
-        STRUCT
-        NUM_TYPE_IDS
+        DURATION_DAYS
         DURATION_SECONDS
         DURATION_MILLISECONDS
         DURATION_MICROSECONDS
         DURATION_NANOSECONDS
+        DICTIONARY32
+        STRING
+        LIST
         DECIMAL32
         DECIMAL64
         DECIMAL128
+        STRUCT
+        NUM_TYPE_IDS
 
     cdef cppclass data_type:
         data_type() except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
index 7c648425eb5..c9c960d0a79 100644
--- a/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
@@ -1,9 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport int64_t
+from libc.stdint cimport int32_t, int64_t
 
 
 cdef extern from "cudf/wrappers/durations.hpp" namespace "cudf" nogil:
+    ctypedef int32_t duration_D
     ctypedef int64_t duration_s
     ctypedef int64_t duration_ms
     ctypedef int64_t duration_us
diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
index 50d37fd0a68..5dcd144529d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
@@ -1,9 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport int64_t
+from libc.stdint cimport int32_t, int64_t
 
 
 cdef extern from "cudf/wrappers/timestamps.hpp" namespace "cudf" nogil:
+    ctypedef int32_t timestamp_D
     ctypedef int64_t timestamp_s
     ctypedef int64_t timestamp_ms
     ctypedef int64_t timestamp_us
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index 94df9bbbebb..93e3fb15259 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,8 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-                   ngrams_tokenize.pyx
+set(cython_sources
+    edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
+    replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index b6659827688..ef837167eb9 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,11 +1,17 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from . cimport (
+    byte_pair_encode,
     edit_distance,
     generate_ngrams,
     jaccard,
     minhash,
     ngrams_tokenize,
+    normalize,
+    replace,
+    stemmer,
+    subword_tokenize,
+    tokenize,
 )
 
 __all__ = [
@@ -13,5 +19,11 @@ __all__ = [
     "generate_ngrams",
     "jaccard",
     "minhash",
-    "ngrams_tokenize"
+    "byte_pair_encode",
+    "ngrams_tokenize",
+    "normalize",
+    "replace",
+    "stemmer",
+    "subword_tokenize",
+    "tokenize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index f74633a3521..4f125d3a733 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,11 +1,29 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize
+from . import (
+    byte_pair_encode,
+    edit_distance,
+    generate_ngrams,
+    jaccard,
+    minhash,
+    ngrams_tokenize,
+    normalize,
+    replace,
+    stemmer,
+    subword_tokenize,
+    tokenize,
+)
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
     "minhash",
+    "byte_pair_encode",
     "ngrams_tokenize",
+    "normalize",
+    "replace",
+    "stemmer",
+    "subword_tokenize",
+    "tokenize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd
new file mode 100644
index 00000000000..e4b93e96b9d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.byte_pair_encode cimport bpe_merge_pairs
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class BPEMergePairs:
+    cdef unique_ptr[bpe_merge_pairs] c_obj
+
+cpdef Column byte_pair_encoding(
+    Column input,
+    BPEMergePairs merge_pairs,
+    Scalar separator=*
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
new file mode 100644
index 00000000000..76caad276d4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs as cpp_load_merge_pairs,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.scalar cimport Scalar
+
+
+cdef class BPEMergePairs:
+    """The table of merge pairs for the BPE encoder.
+
+    For details, see :cpp:class:`cudf::nvtext::bpe_merge_pairs`.
+    """
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+
+cpdef Column byte_pair_encoding(
+    Column input,
+    BPEMergePairs merge_pairs,
+    Scalar separator=None
+):
+    """
+    Byte pair encode the input strings.
+
+    For details, see :cpp:func:`cudf::nvtext::byte_pair_encoding`
+
+    Parameters
+    ----------
+    input : Column
+        Strings to encode.
+    merge_pairs : BPEMergePairs
+        Substrings to rebuild each string on.
+    separator : Scalar
+        String used to build the output after encoding. Default is a space.
+
+    Returns
+    -------
+    Column
+        An encoded column of strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    if separator is None:
+        separator = Scalar.from_libcudf(
+            cpp_make_string_scalar(" ".encode())
+        )
+
+    with nogil:
+        c_result = move(
+            cpp_byte_pair_encoding(
+                input.view(),
+                dereference(merge_pairs.c_obj.get()),
+                dereference(<const string_scalar*>separator.c_obj.get()),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
new file mode 100644
index 00000000000..90676145afa
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from pylibcudf.column cimport Column
+
+
+cpdef Column normalize_spaces(Column input)
+
+cpdef Column normalize_characters(Column input, bool do_lower_case)
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
new file mode 100644
index 00000000000..637d900b659
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.normalize cimport (
+    normalize_characters as cpp_normalize_characters,
+    normalize_spaces as cpp_normalize_spaces,
+)
+
+
+cpdef Column normalize_spaces(Column input):
+    """
+    Returns a new strings column by normalizing the whitespace in
+    each string in the input column.
+
+    For details, see :cpp:func:`normalize_spaces`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+
+    Returns
+    -------
+    Column
+        New strings column of normalized strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_normalize_spaces(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column normalize_characters(Column input, bool do_lower_case):
+    """
+    Normalizes strings characters for tokenizing.
+
+    For details, see :cpp:func:`normalize_characters`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    do_lower_case : bool
+        If true, upper-case characters are converted to lower-case
+        and accents are stripped from those characters. If false,
+        accented and upper-case characters are not transformed.
+
+    Returns
+    -------
+    Column
+        Normalized strings column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_normalize_characters(input.view(), do_lower_case)
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd
new file mode 100644
index 00000000000..624f90e7486
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column replace_tokens(
+    Column input,
+    Column targets,
+    Column replacements,
+    Scalar delimiter=*,
+)
+
+cpdef Column filter_tokens(
+    Column input,
+    size_type min_token_length,
+    Scalar replacement=*,
+    Scalar delimiter=*
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx
new file mode 100644
index 00000000000..b65348ce14d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx
@@ -0,0 +1,109 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.replace cimport (
+    filter_tokens as cpp_filter_tokens,
+    replace_tokens as cpp_replace_tokens,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column replace_tokens(
+    Column input,
+    Column targets,
+    Column replacements,
+    Scalar delimiter=None,
+):
+    """
+    Replaces specified tokens with corresponding replacement strings.
+
+    For details, see :cpp:func:`replace_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column whose tokens are replaced
+    targets : Column
+        Strings to compare against tokens found in ``input``
+    replacements : Column
+        Replacement strings for each string in ``targets``
+    delimiter : Scalar, optional
+        Characters used to separate each string into tokens.
+        The default of empty string will identify tokens using whitespace.
+
+    Returns
+    -------
+    Column
+        New strings column with replaced strings
+    """
+    cdef unique_ptr[column] c_result
+    if delimiter is None:  # no delimiter given: use an empty string scalar
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    with nogil:
+        c_result = cpp_replace_tokens(
+            input.view(),
+            targets.view(),
+            replacements.view(),
+            dereference(<const string_scalar*>delimiter.get()),
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column filter_tokens(
+    Column input,
+    size_type min_token_length,
+    Scalar replacement=None,
+    Scalar delimiter=None
+):
+    """
+    Removes tokens whose lengths are less than a specified number of characters.
+
+    For details, see :cpp:func:`filter_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to filter
+    min_token_length : size_type
+        The minimum number of characters to retain a token in the output string
+    replacement : Scalar, optional
+        Replacement string used in place of removed tokens; default is empty
+    delimiter : Scalar, optional
+        Characters used to separate each string into tokens.
+        The default of empty string will identify tokens using whitespace.
+
+    Returns
+    -------
+    Column
+        New strings column of filtered strings
+    """
+    cdef unique_ptr[column] c_result
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    if replacement is None:
+        replacement = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_filter_tokens(
+            input.view(),
+            min_token_length,
+            dereference(<const string_scalar*>replacement.get()),
+            dereference(<const string_scalar*>delimiter.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
new file mode 100644
index 00000000000..48762efc01f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.stemmer cimport letter_type
+from pylibcudf.libcudf.types cimport size_type
+
+ctypedef fused ColumnOrSize:
+    Column
+    size_type
+
+cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices)
+
+cpdef Column porter_stemmer_measure(Column input)
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
new file mode 100644
index 00000000000..854d1053624
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.stemmer cimport (
+    is_letter as cpp_is_letter,
+    letter_type,
+    porter_stemmer_measure as cpp_porter_stemmer_measure,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column is_letter(
+    Column input,
+    bool check_vowels,
+    ColumnOrSize indices
+):
+    """
+    Returns boolean column indicating if the character
+    or characters at the provided character index or
+    indices (respectively) are consonants or vowels.
+
+    For details, see :cpp:func:`is_letter`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    check_vowels : bool
+        If true, the check is for vowels. Otherwise the check is
+        for consonants.
+    indices : Column or size_type
+        The character position(s) to check in each string
+
+    Returns
+    -------
+    Column
+        New boolean column.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:  # size_type indices pass through; a Column is sent as a view
+        c_result = cpp_is_letter(
+            input.view(),
+            letter_type.VOWEL if check_vowels else letter_type.CONSONANT,
+            indices if ColumnOrSize is size_type else indices.view()
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column porter_stemmer_measure(Column input):
+    """
+    Returns the Porter Stemmer measurements of a strings column.
+
+    For details, see :cpp:func:`porter_stemmer_measure`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column of words to measure
+
+    Returns
+    -------
+    Column
+        New column of Porter Stemmer measure values
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:  # release the GIL for the duration of the libcudf call
+        c_result = cpp_porter_stemmer_measure(input.view())
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd
new file mode 100644
index 00000000000..091c7b897ac
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary
+
+
+cdef class HashedVocabulary:
+    cdef unique_ptr[hashed_vocabulary] c_obj
+
+cpdef tuple[Column, Column, Column] subword_tokenize(
+    Column input,
+    HashedVocabulary vocabulary_table,
+    uint32_t max_sequence_length,
+    uint32_t stride,
+    bool do_lower_case,
+    bool do_truncate,
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
new file mode 100644
index 00000000000..04643d3bd84
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx
@@ -0,0 +1,84 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libc.stdint cimport uint32_t
+from libcpp cimport bool
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
+    load_vocabulary_file as cpp_load_vocabulary_file,
+    move as tr_move,
+    subword_tokenize as cpp_subword_tokenize,
+    tokenizer_result as cpp_tokenizer_result,
+)
+
+
+cdef class HashedVocabulary:
+    """Vocabulary data loaded from ``hash_file`` for use with subword_tokenize.
+
+    For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`.
+    """
+    def __cinit__(self, hash_file):
+        cdef string c_hash_file = <string>str(hash_file).encode()
+        with nogil:  # load the vocabulary file with the GIL released
+            self.c_obj = move(cpp_load_vocabulary_file(c_hash_file))
+
+cpdef tuple[Column, Column, Column] subword_tokenize(
+    Column input,
+    HashedVocabulary vocabulary_table,
+    uint32_t max_sequence_length,
+    uint32_t stride,
+    bool do_lower_case,
+    bool do_truncate,
+):
+    """
+    Creates a tokenizer that cleans the text, splits it into
+    tokens and returns token-ids from an input vocabulary.
+
+    For details, see :cpp:func:`subword_tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        The input strings to tokenize.
+    vocabulary_table : HashedVocabulary
+        The vocabulary table pre-loaded into this object.
+    max_sequence_length : uint32_t
+        Limit of the number of token-ids per row in final tensor for each string.
+    stride : uint32_t
+        Each row in the output token-ids will replicate
+        ``max_sequence_length`` - ``stride`` of the token-ids
+        from the previous row, unless it is the first string.
+    do_lower_case : bool
+        If true, the tokenizer will convert uppercase characters in the
+        input stream to lower-case and strip accents from those characters.
+        If false, accented and uppercase characters are not transformed.
+    do_truncate : bool
+        If true, the tokenizer will discard all the token-ids after
+        ``max_sequence_length`` for each input string. If false, it
+        will use a new row in the output token-ids to continue
+        generating the output.
+
+    Returns
+    -------
+    tuple[Column, Column, Column]
+        A tuple of three columns containing the
+        tokens, masks, and metadata.
+    """
+    cdef cpp_tokenizer_result c_result
+    with nogil:
+        c_result = tr_move(
+            cpp_subword_tokenize(
+                input.view(),
+                dereference(vocabulary_table.c_obj.get()),
+                max_sequence_length,
+                stride,
+                do_lower_case,
+                do_truncate,
+            )
+        )
+    cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids))
+    cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask))
+    cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata))
+    return tokens, masks, metadata
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
new file mode 100644
index 00000000000..0aed9702d61
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+cdef class TokenizeVocabulary:
+    cdef unique_ptr[tokenize_vocabulary] c_obj
+
+cpdef Column tokenize_scalar(Column input, Scalar delimiter=*)
+
+cpdef Column tokenize_column(Column input, Column delimiters)
+
+cpdef Column count_tokens_scalar(Column input, Scalar delimiter=*)
+
+cpdef Column count_tokens_column(Column input, Column delimiters)
+
+cpdef Column character_tokenize(Column input)
+
+cpdef Column detokenize(Column input, Column row_indices, Scalar separator=*)
+
+cpdef Column tokenize_with_vocabulary(
+    Column input,
+    TokenizeVocabulary vocabulary,
+    Scalar delimiter,
+    size_type default_id=*
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
new file mode 100644
index 00000000000..ec02e8ebf4e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx
@@ -0,0 +1,262 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.tokenize cimport (
+    character_tokenize as cpp_character_tokenize,
+    count_tokens as cpp_count_tokens,
+    detokenize as cpp_detokenize,
+    load_vocabulary as cpp_load_vocabulary,
+    tokenize as cpp_tokenize,
+    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cdef class TokenizeVocabulary:
+    """The vocabulary object to be used with ``tokenize_with_vocabulary``.
+
+    For details, see :cpp:class:`cudf::nvtext::tokenize_vocabulary`.
+    """
+    def __cinit__(self, Column vocab):
+        cdef column_view c_vocab = vocab.view()
+        with nogil:  # build the libcudf vocabulary from the column view
+            self.c_obj = move(cpp_load_vocabulary(c_vocab))
+
+cpdef Column tokenize_scalar(Column input, Scalar delimiter=None):
+    """
+    Returns a single column of strings by tokenizing the input
+    strings column using the provided characters as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    delimiter : Scalar, optional
+        String scalar used to separate strings into tokens; default is empty
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_tokenize(
+            input.view(),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column tokenize_column(Column input, Column delimiters):
+    """
+    Returns a single column of strings by tokenizing the input
+    strings column using multiple strings as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    delimiters : Column
+        Strings column used to separate individual strings into tokens
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_tokenize(
+            input.view(),
+            delimiters.view(),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None):
+    """
+    Returns the number of tokens in each string of a strings column
+    using the provided characters as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::count_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to count tokens
+    delimiter : Scalar, optional
+        String scalar used to separate each string into tokens; default is empty
+
+    Returns
+    -------
+    Column
+        New column of token counts
+    """
+    cdef unique_ptr[column] c_result
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    with nogil:
+        c_result = cpp_count_tokens(
+            input.view(),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column count_tokens_column(Column input, Column delimiters):
+    """
+    Returns the number of tokens in each string of a strings column
+    using multiple strings as delimiters.
+
+    For details, see :cpp:func:`cudf::nvtext::count_tokens`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column whose tokens are counted
+    delimiters : Column
+        Strings column used to separate each string into tokens
+
+    Returns
+    -------
+    Column
+        New column of token counts
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_count_tokens(
+            input.view(),
+            delimiters.view(),
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column character_tokenize(Column input):
+    """
+    Returns a single column of strings by converting
+    each character to a string.
+
+    For details, see :cpp:func:`cudf::nvtext::character_tokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_character_tokenize(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column detokenize(
+    Column input,
+    Column row_indices,
+    Scalar separator=None
+):
+    """
+    Creates a strings column from a strings column of tokens
+    and an associated column of row ids.
+
+    For details, see :cpp:func:`cudf::nvtext::detokenize`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to detokenize
+    row_indices : Column
+        The relative output row index assigned for each token in the input column
+    separator : Scalar, optional
+        String to append after each token in its row; default is a single space
+
+    Returns
+    -------
+    Column
+        New strings column built from the tokens
+    """
+    cdef unique_ptr[column] c_result
+
+    if separator is None:
+        separator = Scalar.from_libcudf(
+            cpp_make_string_scalar(" ".encode())
+        )
+
+    with nogil:
+        c_result = cpp_detokenize(
+            input.view(),
+            row_indices.view(),
+            dereference(<const string_scalar*>separator.c_obj.get())
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column tokenize_with_vocabulary(
+    Column input,
+    TokenizeVocabulary vocabulary,
+    Scalar delimiter,
+    size_type default_id=-1
+):
+    """
+    Returns the token ids for the input string by looking
+    up each delimited token in the given vocabulary.
+
+    For details, see :cpp:func:`cudf::nvtext::tokenize_with_vocabulary`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to tokenize
+    vocabulary : TokenizeVocabulary
+        Used to lookup tokens within ``input``
+    delimiter : Scalar
+        Used to identify tokens within ``input`` (required; there is no default)
+    default_id : size_type, optional
+        The token id to be used for tokens not found in the vocabulary; default is -1
+
+    Returns
+    -------
+    Column
+        Lists column of token ids
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_tokenize_with_vocabulary(
+            input.view(),
+            dereference(vocabulary.c_obj.get()),
+            dereference(<const string_scalar*>delimiter.c_obj.get()),
+            default_id
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
index 04dd131cd75..5d7fbd24b91 100644
--- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
@@ -28,6 +28,7 @@ set(cython_sources
     regex_program.pyx
     repeat.pyx
     replace.pyx
+    replace_re.pyx
     side_type.pyx
     slice.pyx
     strip.pyx
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index 93c61f3f72c..da1c1c576c0 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -17,6 +17,7 @@ from . cimport (
     regex_program,
     repeat,
     replace,
+    replace_re,
     side_type,
     slice,
     split,
@@ -42,6 +43,7 @@ __all__ = [
     "regex_program",
     "repeat",
     "replace",
+    "replace_re",
     "slice",
     "strip",
     "split",
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index d52b0405f1e..fa7294c7dbd 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -17,6 +17,7 @@
     regex_program,
     repeat,
     replace,
+    replace_re,
     side_type,
     slice,
     split,
@@ -31,6 +32,7 @@
     "capitalize",
     "case",
     "char_types",
+    "combine",
     "contains",
     "convert",
     "extract",
@@ -42,6 +44,7 @@
     "regex_program",
     "repeat",
     "replace",
+    "replace_re",
     "slice",
     "strip",
     "split",
diff --git a/python/pylibcudf/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
index f426b6888ae..91f585cd637 100644
--- a/python/pylibcudf/pylibcudf/strings/regex_program.pyx
+++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
@@ -37,6 +37,10 @@ cdef class RegexProgram:
         flags : Uniont[int, RegexFlags]
             Regex flags for interpreting special characters in the pattern
 
+        Returns
+        -------
+        RegexProgram
+            A new RegexProgram
         """
         cdef unique_ptr[regex_program] c_prog
         cdef regex_flags c_flags
diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx
index 6db7f04fcbb..2b94f5e3fee 100644
--- a/python/pylibcudf/pylibcudf/strings/replace.pyx
+++ b/python/pylibcudf/pylibcudf/strings/replace.pyx
@@ -136,6 +136,7 @@ cpdef Column replace_slice(
         Start position where repl will be added.
     stop : size_type, default -1
         End position (exclusive) to use for replacement.
+
     Returns
     -------
     pylibcudf.Column
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd
new file mode 100644
index 00000000000..e27ccd55f7d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.strings.regex_program cimport RegexProgram
+
+ctypedef fused Replacement:
+    Column
+    Scalar
+
+ctypedef fused Patterns:
+    RegexProgram
+    list
+
+
+cpdef Column replace_re(
+    Column input,
+    Patterns patterns,
+    Replacement replacement=*,
+    size_type max_replace_count=*,
+    regex_flags flags=*
+)
+
+cpdef Column replace_with_backrefs(
+    Column input,
+    RegexProgram prog,
+    str replacement
+)
diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx
new file mode 100644
index 00000000000..ccc33fd4425
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx
@@ -0,0 +1,134 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings cimport replace_re as cpp_replace_re
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.strings.regex_program cimport RegexProgram
+
+
+cpdef Column replace_re(
+    Column input,
+    Patterns patterns,
+    Replacement replacement=None,
+    size_type max_replace_count=-1,
+    regex_flags flags=regex_flags.DEFAULT,
+):
+    """
+    For each string, replaces any character sequence matching the given patterns
+    with the provided replacement.
+
+    For details, see :cpp:func:`cudf::strings::replace_re`
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+    patterns : RegexProgram or list[str]
+        If RegexProgram, the regex to match to each string.
+        If list[str], a list of regex strings to search within each string.
+    replacement : Scalar or Column
+        If Scalar, the string used to replace the matched sequence in each string.
+        ``patterns`` must be a RegexProgram.
+        If Column, the strings used for replacement.
+        ``patterns`` must be a list[str].
+    max_replace_count : int
+        The maximum number of times to replace the matched pattern
+        within each string. ``patterns`` must be a RegexProgram.
+        Default replaces every substring that is matched.
+    flags : RegexFlags
+        Regex flags for interpreting special characters in the patterns.
+        ``patterns`` must be a list[str]
+
+    Returns
+    -------
+    Column
+        New strings column
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[string] c_patterns
+
+    if Patterns is RegexProgram and Replacement is Scalar:
+        if replacement is None:  # default to an empty replacement string
+            replacement = Scalar.from_libcudf(
+                cpp_make_string_scalar("".encode())
+            )
+        with nogil:
+            c_result = move(
+                cpp_replace_re.replace_re(
+                    input.view(),
+                    patterns.c_obj.get()[0],
+                    dereference(<string_scalar*>(replacement.get())),
+                    max_replace_count
+                )
+            )
+
+        return Column.from_libcudf(move(c_result))
+    elif Patterns is list and Replacement is Column:
+        c_patterns.reserve(len(patterns))
+        for pattern in patterns:  # encode each Python str for the C++ vector
+            c_patterns.push_back(pattern.encode())
+
+        with nogil:
+            c_result = move(
+                cpp_replace_re.replace_re(
+                    input.view(),
+                    c_patterns,
+                    replacement.view(),
+                    flags,
+                )
+            )
+
+        return Column.from_libcudf(move(c_result))
+    else:  # any other Patterns/Replacement pairing is unsupported
+        raise TypeError("Must pass a RegexProgram and a Scalar, or a list and a Column")
+
+
+cpdef Column replace_with_backrefs(
+    Column input,
+    RegexProgram prog,
+    str replacement
+):
+    """
+    For each string, replaces any character sequence matching the given regex
+    using the replacement template for back-references.
+
+    For details, see :cpp:func:`cudf::strings::replace_with_backrefs`
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation.
+    prog : RegexProgram
+        Regex program instance holding the pattern to match
+        against each string.
+    replacement : str
+        The replacement template for creating the output string.
+        It is encoded with ``str.encode()`` before being passed to libcudf.
+
+    Returns
+    -------
+    Column
+        New strings column.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string c_replacement = replacement.encode()
+
+    with nogil:
+        c_result = cpp_replace_re.replace_with_backrefs(
+            input.view(),
+            prog.c_obj.get()[0],
+            c_replacement,
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
index 9f389fa42c4..d95849ef371 100644
--- a/python/pylibcudf/pylibcudf/tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -7,10 +7,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from pyarrow.orc import write_table as orc_write_table
 from pyarrow.parquet import write_table as pq_write_table
+
+import pylibcudf as plc
 from pylibcudf.io.types import CompressionType
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py
index fdce6f353ca..5265e411c7f 100644
--- a/python/pylibcudf/pylibcudf/tests/conftest.py
+++ b/python/pylibcudf/pylibcudf/tests/conftest.py
@@ -8,8 +8,9 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
+
+import pylibcudf as plc
 from pylibcudf.io.types import CompressionType
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
@@ -17,13 +18,23 @@
 from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES
 
 
-# This fixture defines the standard set of types that all tests should default to
+def _type_to_str(typ):
+    if isinstance(typ, pa.ListType):
+        return f"list[{_type_to_str(typ.value_type)}]"
+    elif isinstance(typ, pa.StructType):
+        return f"struct[{', '.join(_type_to_str(typ.field(i).type) for i in range(typ.num_fields))}]"
+    else:
+        return str(typ)
+
+
+# This fixture defines the standard set of types that all tests should default to
 # running on. If there is a need for some tests to run on a different set of types, that
 # type list fixture should also be defined below here if it is likely to be reused
 # across modules. Otherwise it may be defined on a per-module basis.
 @pytest.fixture(
     scope="session",
     params=DEFAULT_PA_TYPES,
+    ids=_type_to_str,
 )
 def pa_type(request):
     return request.param
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
index 0cd5064a697..3d9d99ffa61 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
@@ -5,10 +5,11 @@
 
 import fastavro
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_and_meta_eq
 
+import pylibcudf as plc
+
 avro_dtype_pairs = [
     ("boolean", pa.bool_()),
     ("int", pa.int32()),
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
index ab26f23418d..22c83acc47c 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
@@ -5,9 +5,7 @@
 
 import pandas as pd
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.io.types import CompressionType
 from utils import (
     _convert_types,
     assert_table_and_meta_eq,
@@ -15,6 +13,9 @@
     write_source_str,
 )
 
+import pylibcudf as plc
+from pylibcudf.io.types import CompressionType
+
 # Shared kwargs to pass to make_source
 _COMMON_CSV_SOURCE_KWARGS = {
     "format": "csv",
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py
index 9d976fedf00..453e5ce32a8 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py
@@ -3,9 +3,7 @@
 
 import pandas as pd
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.io.types import CompressionType
 from utils import (
     assert_table_and_meta_eq,
     make_source,
@@ -13,6 +11,9 @@
     write_source_str,
 )
 
+import pylibcudf as plc
+from pylibcudf.io.types import CompressionType
+
 # Shared kwargs to pass to make_source
 _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"}
 
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 42b14b1feff..5ed660ba6cf 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import _convert_types, assert_table_and_meta_eq, make_source
 
+import pylibcudf as plc
+
 # Shared kwargs to pass to make_source
 _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"}
 
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
index f6e843ccf66..41298601539 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
@@ -1,9 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from pyarrow.parquet import read_table
+from utils import assert_table_and_meta_eq, make_source
+
+import pylibcudf as plc
 from pylibcudf.expressions import (
     ASTOperator,
     ColumnNameReference,
@@ -11,7 +13,6 @@
     Literal,
     Operation,
 )
-from utils import assert_table_and_meta_eq, make_source
 
 # Shared kwargs to pass to make_source
 _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
index 747f58ec8cf..0c43c363e55 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
@@ -2,9 +2,10 @@
 
 import io
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
 def io_class(request):
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
index 76b0424b2af..b3555013927 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import zoneinfo
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 def test_make_timezone_transition_table():
     if len(zoneinfo.TZPATH) == 0:
diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
index f784cb3c191..bbb08e8b95a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py
+++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
@@ -4,10 +4,11 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def idfn(param):
     ltype, rtype, outtype, plc_op, _ = param
diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
index 8cedbc6d42f..e317362a76b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq
 
+import pylibcudf as plc
+
 EMPTY_COL_SIZE = 3
 
 NUMERIC_TYPES = [
diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
index 0e129fdf0ef..24cd6b9e35f 100644
--- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
 import rmm
 
+import pylibcudf as plc
+
 VALID_TYPES = [
     pa.int8(),
     pa.int16(),
diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
index 7a5c1664eed..6d8b5993964 100644
--- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
+++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 param_pyarrow_tables = [
     pa.table([]),
     pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py
index 628682d0a66..c0a41b96b1a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_copying.py
+++ b/python/pylibcudf/pylibcudf/tests/test_copying.py
@@ -2,7 +2,6 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import (
     DEFAULT_STRUCT_TESTING_TYPE,
@@ -16,6 +15,8 @@
     metadata_from_arrow_type,
 )
 
+import pylibcudf as plc
+
 
 # TODO: consider moving this to conftest and "pairing"
 # it with pa_type, so that they don't get out of sync
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index 75930d59058..f5f24ef28e2 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -1,13 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import calendar
 import datetime
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"])
 def datetime_column(has_nulls, request):
@@ -45,6 +47,21 @@ def component(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        ("day", plc.datetime.RoundingFrequency.DAY),
+        ("hour", plc.datetime.RoundingFrequency.HOUR),
+        ("minute", plc.datetime.RoundingFrequency.MINUTE),
+        ("second", plc.datetime.RoundingFrequency.SECOND),
+        ("millisecond", plc.datetime.RoundingFrequency.MILLISECOND),
+        ("microsecond", plc.datetime.RoundingFrequency.MICROSECOND),
+        ("nanosecond", plc.datetime.RoundingFrequency.NANOSECOND),
+    ]
+)
+def rounding_frequency(request):
+    return request.param
+
+
 def test_extract_datetime_component(datetime_column, component):
     attr, component = component
     kwargs = {}
@@ -58,3 +75,139 @@ def test_extract_datetime_component(datetime_column, component):
     ).cast(pa.int16())
 
     assert_column_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "datetime_func",
+    [
+        "extract_millisecond_fraction",
+        "extract_microsecond_fraction",
+        "extract_nanosecond_fraction",
+    ],
+)
+def test_datetime_extracting_functions(datetime_column, datetime_func):
+    """Compare sub-second ``extract_*_fraction`` results with pyarrow."""
+    pa_col = plc.interop.to_arrow(datetime_column)
+    got = getattr(plc.datetime, datetime_func)(datetime_column)
+    # "extract_<unit>_fraction" -> "<unit>", which is also the name of the
+    # matching pyarrow.compute function (millisecond/microsecond/nanosecond).
+    pc_func = datetime_func.split("_")[1]
+    # The pyarrow result is cast to int16 before the comparison.
+    expect = getattr(pc, pc_func)(pa_col).cast(pa.int16())
+    assert_column_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        ("ceil_temporal", "ceil_datetimes"),
+        ("floor_temporal", "floor_datetimes"),
+        ("round_temporal", "round_datetimes"),
+    ],
+)
+def test_rounding_operations(datetime_column, op, rounding_frequency):
+    got = getattr(plc.datetime, op[1])(datetime_column, rounding_frequency[1])
+    pa_col = plc.interop.to_arrow(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    expect = getattr(pc, op[0])(
+        pa_col,
+        unit=rounding_frequency[0],
+    ).cast(pa_got.type)
+    assert_column_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "months",
+    [
+        pa.scalar(-3, pa.int32()),
+        pa.scalar(1, pa.int16()),
+        pa.array([1, -3, 2, 4, -1, 5], pa.int32()),
+    ],
+)
+def test_calendrical_months(datetime_column, months):
+    def add_calendrical_months(timestamps, months):
+        # Pure-Python reference; an out-of-range day is clamped to the last
+        # day of the target month instead of raising ValueError in replace().
+        if isinstance(months, pa.Array):
+            offsets = months.to_pylist()
+        else:
+            offsets = [months.as_py()] * len(timestamps)
+        result = []
+        for ts, offset in zip(timestamps.to_pylist(), offsets):
+            if ts is None:
+                result.append(None)
+                continue
+            # Zero-based month index makes the year carry a plain divmod.
+            carry, month0 = divmod(ts.month - 1 + offset, 12)
+            year, month = ts.year + carry, month0 + 1
+            day = min(ts.day, calendar.monthrange(year, month)[1])
+            result.append(ts.replace(year=year, month=month, day=day))
+        return pa.array(result)
+
+    pa_col = plc.interop.to_arrow(datetime_column)
+    got = plc.datetime.add_calendrical_months(
+        datetime_column, plc.interop.from_arrow(months)
+    )
+    # Cast the reference to the dtype pylibcudf produced before comparing.
+    pa_got = plc.interop.to_arrow(got)
+    expect = add_calendrical_months(pa_col, months).cast(pa_got.type)
+    assert_column_eq(expect, got)
+
+
+def test_day_of_year(datetime_column):
+    got = plc.datetime.day_of_year(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    expect = pa.array(
+        [
+            d.as_py().timetuple().tm_yday if d.as_py() is not None else None
+            for d in pa_col
+        ],
+        type=pa_got.type,
+    )
+    assert_column_eq(expect, got)
+
+
+def test_is_leap_year(datetime_column):
+    got = plc.datetime.is_leap_year(datetime_column)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    expect = pc.is_leap_year(pa_col)
+    assert_column_eq(expect, got)
+
+
+def test_last_day_of_month(datetime_column):
+    def last_day_of_month(dates):
+        return [
+            d.replace(day=calendar.monthrange(d.year, d.month)[1])
+            if d is not None
+            else d
+            for d in dates.to_pylist()
+        ]
+
+    got = plc.datetime.last_day_of_month(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    expect = pa.array(last_day_of_month(pa_col), type=pa_got.type)
+    assert_column_eq(expect, got)
+
+
+def test_extract_quarter(datetime_column):
+    got = plc.datetime.extract_quarter(datetime_column)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    expect = pc.quarter(pa_col).cast(pa_got.type)
+    assert_column_eq(expect, got)
+
+
+def test_days_in_month(datetime_column):
+    def days_in_month(dates):
+        return [
+            calendar.monthrange(d.year, d.month)[1] if d is not None else None
+            for d in dates.to_pylist()
+        ]
+
+    got = plc.datetime.days_in_month(datetime_column)
+    pa_col = plc.interop.to_arrow(datetime_column)
+    pa_got = plc.interop.to_arrow(got)
+    expect = pa.array(days_in_month(pa_col), type=pa_got.type)
+    assert_column_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py
index 5894ef4624c..52c81c49b9d 100644
--- a/python/pylibcudf/pylibcudf/tests/test_expressions.py
+++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py
@@ -1,10 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
+import pyarrow.compute as pc
 import pytest
+from utils import assert_column_eq
 
-# We can't really evaluate these expressions, so just make sure
-# construction works properly
+import pylibcudf as plc
 
 
 def test_literal_construction_invalid():
@@ -22,7 +22,7 @@ def test_literal_construction_invalid():
     ],
 )
 def test_columnref_construction(tableref):
-    plc.expressions.ColumnReference(1.0, tableref)
+    plc.expressions.ColumnReference(1, tableref)
 
 
 def test_columnnameref_construction():
@@ -47,3 +47,35 @@ def test_columnnameref_construction():
 )
 def test_astoperation_construction(kwargs):
     plc.expressions.Operation(**kwargs)
+
+
+def test_evaluation():
+    table_h = pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    lit = pa.scalar(42, type=pa.int64())
+    table = plc.interop.from_arrow(table_h)
+    # expr = abs(b * c - (a + 42))
+    expr = plc.expressions.Operation(
+        plc.expressions.ASTOperator.ABS,
+        plc.expressions.Operation(
+            plc.expressions.ASTOperator.SUB,
+            plc.expressions.Operation(
+                plc.expressions.ASTOperator.MUL,
+                plc.expressions.ColumnReference(1),
+                plc.expressions.ColumnReference(2),
+            ),
+            plc.expressions.Operation(
+                plc.expressions.ASTOperator.ADD,
+                plc.expressions.ColumnReference(0),
+                plc.expressions.Literal(plc.interop.from_arrow(lit)),
+            ),
+        ),
+    )
+
+    expect = pc.abs(
+        pc.subtract(
+            pc.multiply(table_h["b"], table_h["c"]), pc.add(table_h["a"], lit)
+        )
+    )
+    got = plc.transform.compute_column(table, expr)
+
+    assert_column_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py
new file mode 100644
index 00000000000..83fb50fa4ef
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import hashlib
+import struct
+
+import mmh3
+import numpy as np
+import pyarrow as pa
+import pytest
+import xxhash
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+SEED = 0
+METHODS = ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"]
+
+
+def scalar_to_binary(x):
+    if isinstance(x, str):
+        return x.encode()
+    elif isinstance(x, float):
+        return struct.pack("<d", x)
+    elif isinstance(x, bool):
+        return x.to_bytes(1, byteorder="little", signed=True)
+    elif isinstance(x, int):
+        return x.to_bytes(8, byteorder="little", signed=True)
+    else:
+        raise NotImplementedError
+
+
+def hash_single_uint32(val, seed=0):
+    return mmh3.hash(np.uint32(val).tobytes(), seed=seed, signed=False)
+
+
+def hash_combine_32(lhs, rhs):
+    return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))
+
+
+def uint_hash_combine_32(lhs, rhs):
+    return hash_combine_32(np.uint32(lhs), np.uint32(rhs))
+
+
+def libcudf_mmh3_x86_32(binary):
+    seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    hashval = mmh3.hash(binary, seed)
+    return hash_combine_32(seed, hashval)
+
+
+@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()])
+def scalar_type(request):
+    return request.param
+
+
+@pytest.fixture
+def pa_scalar_input_column(scalar_type):
+    if pa.types.is_integer(scalar_type) or pa.types.is_floating(scalar_type):
+        return pa.array([1, 2, 3], type=scalar_type)
+    elif pa.types.is_string(scalar_type):
+        return pa.array(["a", "b", "c"], type=scalar_type)
+    elif pa.types.is_boolean(scalar_type):
+        return pa.array([True, True, False], type=scalar_type)
+
+
+@pytest.fixture
+def plc_scalar_input_tbl(pa_scalar_input_column):
+    return plc.interop.from_arrow(
+        pa.Table.from_arrays([pa_scalar_input_column], names=["data"])
+    )
+
+
+@pytest.fixture(scope="module")
+def list_struct_table():
+    data = pa.Table.from_pydict(
+        {
+            "list": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            "struct": [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}],
+        }
+    )
+    return data
+
+
+def python_hash_value(x, method):
+    if method == "murmurhash3_x86_32":
+        return libcudf_mmh3_x86_32(x)
+    elif method == "murmurhash3_x64_128":
+        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
+        hasher.update(x)
+        # libcudf returns a tuple of two 64-bit integers
+        return hasher.utupledigest()
+    elif method == "xxhash_64":
+        return xxhash.xxh64(
+            x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
+    else:
+        return getattr(hashlib, method)(x).hexdigest()
+
+
+@pytest.mark.parametrize(
+    "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"]
+)
+def test_hash_column_sha_md5(
+    pa_scalar_input_column, plc_scalar_input_tbl, method
+):
+    plc_hasher = getattr(plc.hashing, method)
+
+    def py_hasher(val):
+        return getattr(hashlib, method)(scalar_to_binary(val)).hexdigest()
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.string(),
+    )
+    got = plc_hasher(plc_scalar_input_tbl)
+    assert_column_eq(got, expect)
+
+
+def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        return xxhash.xxh64(
+            scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.uint64(),
+    )
+    got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0)
+
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.parametrize(
+    "method", ["sha1", "sha224", "sha256", "sha384", "sha512"]
+)
+@pytest.mark.parametrize("dtype", ["list", "struct"])
+def test_sha_list_struct_err(list_struct_table, dtype, method):
+    err_types = list_struct_table.select([dtype])
+    plc_tbl = plc.interop.from_arrow(err_types)
+    plc_hasher = getattr(plc.hashing, method)
+
+    with pytest.raises(TypeError):
+        plc_hasher(plc_tbl)
+
+
+def test_md5_struct_err(list_struct_table):
+    err_types = list_struct_table.select(["struct"])
+    plc_tbl = plc.interop.from_arrow(err_types)
+
+    with pytest.raises(TypeError):
+        plc.hashing.md5(plc_tbl)
+
+
+def test_murmurhash3_x86_32(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        # Python reference: mmh3 over the value's byte encoding.
+        return libcudf_mmh3_x86_32(scalar_to_binary(val))
+
+    expect = pa.array(
+        [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
+        type=pa.uint32(),
+    )
+    got = plc.hashing.murmurhash3_x86_32(plc_scalar_input_tbl, 0)
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+def test_murmurhash3_x86_32_list():
+    pa_tbl = pa.Table.from_pydict(
+        {
+            "list": pa.array(
+                [[1, 2, 3], [4, 5, 6], [7, 8, 9]], type=pa.list_(pa.uint32())
+            )
+        }
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+
+    def hash_list(list_):
+        hash_value = uint_hash_combine_32(0, hash_single_uint32(len(list_)))
+
+        for element in list_:
+            hash_value = uint_hash_combine_32(
+                hash_value,
+                hash_single_uint32(
+                    element, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+                ),
+            )
+
+        final = uint_hash_combine_32(
+            plc.hashing.LIBCUDF_DEFAULT_HASH_SEED, hash_value
+        )
+        return final
+
+    expect = pa.array(
+        [hash_list(val) for val in pa_tbl["list"].to_pylist()],
+        type=pa.uint32(),
+    )
+    got = plc.hashing.murmurhash3_x86_32(
+        plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
+    assert_column_eq(got, expect)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+def test_murmurhash3_x86_32_struct():
+    """Hash a three-field uint32 struct column and compare to a reference."""
+    pa_tbl = pa.table(
+        {
+            "struct": pa.array(
+                [
+                    {"a": 1, "b": 2, "c": 3},
+                    {"a": 4, "b": 5, "c": 6},
+                    {"a": 7, "b": 8, "c": 9},
+                ],
+                # Declare one field per key used in the rows above.
+                type=pa.struct(
+                    [pa.field(k, pa.uint32()) for k in ("a", "b", "c")]
+                ),
+            )
+        }
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+
+    def hash_struct(s):
+        # Reference: fold the per-field hashes left to right.
+        seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        keys = list(s.keys())
+
+        combined_hash = hash_single_uint32(s[keys[0]], seed=seed)
+        combined_hash = uint_hash_combine_32(0, combined_hash)
+        combined_hash = uint_hash_combine_32(seed, combined_hash)
+
+        for key in keys[1:]:
+            current_hash = hash_single_uint32(s[key], seed=seed)
+            combined_hash = uint_hash_combine_32(combined_hash, current_hash)
+
+        return combined_hash
+
+    got = plc.hashing.murmurhash3_x86_32(
+        plc_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
+
+    expect = pa.array(
+        [hash_struct(val) for val in pa_tbl["struct"].to_pylist()],
+        type=pa.uint32(),
+    )
+    assert_column_eq(got, expect)
+
+
+def test_murmurhash3_x64_128(pa_scalar_input_column, plc_scalar_input_tbl):
+    def py_hasher(val):
+        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
+        hasher.update(val)
+        return hasher.utupledigest()
+
+    tuples = [
+        py_hasher(scalar_to_binary(val))
+        for val in pa_scalar_input_column.to_pylist()
+    ]
+    expect = pa.Table.from_arrays(
+        [
+            pa.array([np.uint64(t[0]) for t in tuples]),
+            pa.array([np.uint64(t[1]) for t in tuples]),
+        ],
+        names=["0", "1"],
+    )
+    got = plc.hashing.murmurhash3_x64_128(plc_scalar_input_tbl, 0)
+
+    assert_table_eq(expect, got)
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index 01c998f16d4..af80b6e5978 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,8 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import cupy as cp
+import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
+from utils import assert_table_eq
+
+import pylibcudf as plc
 
 
 def test_list_dtype_roundtrip():
@@ -66,3 +70,31 @@ def test_decimal_other(data_type):
 
     arrow_type = plc.interop.to_arrow(data_type, precision=precision)
     assert arrow_type == pa.decimal128(precision, 0)
+
+
+def test_round_trip_dlpack_plc_table():
+    expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]})
+    plc_table = plc.interop.from_arrow(expected)
+    result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+    assert_table_eq(expected, result)
+
+
+@pytest.mark.parametrize("array", [np.array, cp.array])
+def test_round_trip_dlpack_array(array):
+    arr = array([1, 2, 3])
+    result = plc.interop.from_dlpack(arr.__dlpack__())
+    expected = pa.table({"a": [1, 2, 3]})
+    assert_table_eq(expected, result)
+
+
+def test_to_dlpack_error():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": [1, None, 3], "b": [5, 6, 7]})
+    )
+    with pytest.raises(ValueError, match="Cannot create a DLPack tensor"):
+        plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+
+
+def test_from_dlpack_error():
+    with pytest.raises(ValueError, match="Invalid PyCapsule object"):
+        plc.interop.from_dlpack(1)
diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py
index 61e02f4d28d..f43a56046a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_join.py
+++ b/python/pylibcudf/pylibcudf/tests/test_join.py
@@ -2,9 +2,10 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 def test_cross_join():
     left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py
index 3d2955211f8..486a9524e92 100644
--- a/python/pylibcudf/pylibcudf/tests/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/test_json.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def plc_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index f7fb7463b50..beacfc63ce5 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("left_inclusive", [True, False])
 @pytest.mark.parametrize("right_inclusive", [True, False])
diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py
index 2353a6ff8f9..f3ef555f11d 100644
--- a/python/pylibcudf/pylibcudf/tests/test_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_lists.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def test_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
index 3edcae59edc..cd3da856de2 100644
--- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py
+++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.null_mask import MaskState
 
 import rmm
 
+import pylibcudf as plc
+from pylibcudf.null_mask import MaskState
+
 
 @pytest.fixture(params=[False, True])
 def nullable(request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py
new file mode 100644
index 00000000000..7d6718a959b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_byte_pair_encode.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    return pa.array(
+        [
+            "e n",
+            "i t",
+            "i s",
+            "e s",
+            "en t",
+            "c e",
+            "es t",
+            "en ce",
+            "t est",
+            "s ent",
+        ]
+    )
+
+
+@pytest.mark.parametrize(
+    "separator", [None, plc.interop.from_arrow(pa.scalar("e"))]
+)
+def test_byte_pair_encoding(input_col, separator):
+    plc_col = plc.interop.from_arrow(
+        pa.array(["test sentence", "thisis test"])
+    )
+    result = plc.nvtext.byte_pair_encode.byte_pair_encoding(
+        plc_col,
+        plc.nvtext.byte_pair_encode.BPEMergePairs(
+            plc.interop.from_arrow(input_col)
+        ),
+        separator,
+    )
+    if separator is None:
+        expected = pa.array(["test   sent ence", "t h is is   test"])
+    else:
+        expected = pa.array(["teste esenteence", "teheiseise etest"])
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
index 7d93c471cc4..8b14e0db576 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def edit_distance_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
index 5cf9874d595..fae4685f81b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
index d5a168426b1..05fe7b53c16 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index 4e389a63f90..ead9ee094af 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
 def minhash_input_data(request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
index 283a009288d..84748b5597e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
new file mode 100644
index 00000000000..25b6d1389ec
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def norm_spaces_input_data():
+    arr = ["a b", "  c  d\n", "e \t f "]
+    return pa.array(arr)
+
+
+@pytest.fixture(scope="module")
+def norm_chars_input_data():
+    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+    return pa.array(arr)
+
+
+def test_normalize_spaces(norm_spaces_input_data):
+    result = plc.nvtext.normalize.normalize_spaces(
+        plc.interop.from_arrow(norm_spaces_input_data)
+    )
+    expected = pa.array(["a b", "c d", "e f"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalize_characters(norm_chars_input_data, do_lower):
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        do_lower,
+    )
+    expected = pa.array(
+        ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+    )
+    if not do_lower:
+        expected = pa.array(
+            ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+        )
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
new file mode 100644
index 00000000000..65687f31c85
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"]
+    return pa.array(arr)
+
+
+@pytest.fixture(scope="module")
+def targets():
+    arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"]
+    return pa.array(arr)
+
+
+@pytest.mark.parametrize("delim", ["*", None])
+def test_replace_tokens(input_col, targets, delim):
+    replacements = pa.array(["slow", "cat", "looked", "rat"])
+    result = plc.nvtext.replace.replace_tokens(
+        plc.interop.from_arrow(input_col),
+        plc.interop.from_arrow(targets),
+        plc.interop.from_arrow(replacements),
+        plc.interop.from_arrow(pa.scalar(delim)) if delim else None,
+    )
+    expected = pa.array(["slow", "cat", "jumps*over the", "rat"])
+    if not delim:
+        expected = pa.array(
+            ["the quick", "brown fox", "jumps*over the", "lazy dog"]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("min_token_length", [4, 5])
+@pytest.mark.parametrize("replace", ["---", None])
+@pytest.mark.parametrize("delim", ["*", None])
+def test_filter_tokens(input_col, min_token_length, replace, delim):
+    result = plc.nvtext.replace.filter_tokens(
+        plc.interop.from_arrow(input_col),
+        min_token_length,
+        plc.interop.from_arrow(pa.scalar(replace)) if replace else None,
+        plc.interop.from_arrow(pa.scalar(delim)) if delim else None,
+    )
+    expected = pa.array(
+        ["the quick", "brown fox", "jumps*over the", "lazy dog"]
+    )
+    if not delim and not replace and min_token_length == 4:
+        expected = pa.array([" quick", "brown ", "jumps*over ", "lazy "])
+    if not delim and not replace and min_token_length == 5:
+        expected = pa.array([" quick", "brown ", "jumps*over ", " "])
+    if not delim and replace == "---" and min_token_length == 4:
+        expected = pa.array(
+            ["--- quick", "brown ---", "jumps*over ---", "lazy ---"]
+        )
+    if not delim and replace == "---" and min_token_length == 5:
+        expected = pa.array(
+            ["--- quick", "brown ---", "jumps*over ---", "--- ---"]
+        )
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
new file mode 100644
index 00000000000..e7f4a971f08
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    arr = ["trouble", "toy", "syzygy"]
+    return pa.array(arr)
+
+
+@pytest.mark.parametrize("check_vowels", [True, False])
+@pytest.mark.parametrize("indices", [[3, 1, 4], 1])
+def test_is_letter(input_col, check_vowels, indices):
+    def is_letter(s, i, check):
+        vowels = "aeiouy"
+        return (s[i] in vowels) == check
+
+    result = plc.nvtext.stemmer.is_letter(
+        plc.interop.from_arrow(input_col),
+        check_vowels,
+        plc.interop.from_arrow(pa.array(indices))
+        if isinstance(indices, list)
+        else indices,
+    )
+    expected = pa.array(
+        [
+            is_letter(
+                s,
+                indices[i] if isinstance(indices, list) else indices,
+                check_vowels,
+            )
+            for i, s in enumerate(input_col.to_pylist())
+        ]
+    )
+    assert_column_eq(result, expected)
+
+
+def test_porter_stemmer_measure(input_col):
+    result = plc.nvtext.stemmer.porter_stemmer_measure(
+        plc.interop.from_arrow(input_col),
+    )
+    expected = pa.array([1, 1, 2], type=pa.int32())
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py
new file mode 100644
index 00000000000..516d0f7f78d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture
+def vocab_file(tmpdir):
+    hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt")
+    content = "1\n0\n10\n"
+    coefficients = [65559] * 10
+    for c in coefficients:
+        content = content + str(c) + " 0\n"
+    table = [0] * 10
+    table[0] = 3015668
+    content = content + "10\n"
+    for v in table:
+        content = content + str(v) + "\n"
+    content = content + "100\n101\n102\n\n"
+    hash_file.write(content)
+    return str(hash_file)
+
+
+@pytest.fixture
+def column_input():
+    return pa.array(["This is a test"])  # one row holding four words
+
+
+@pytest.mark.parametrize("max_sequence_length", [64, 128])
+@pytest.mark.parametrize("stride", [32, 64])
+@pytest.mark.parametrize("do_lower_case", [True, False])
+@pytest.mark.parametrize("do_truncate", [True, False])
+def test_subword_tokenize(
+    vocab_file,
+    column_input,
+    max_sequence_length,
+    stride,
+    do_lower_case,
+    do_truncate,
+):
+    vocab = plc.nvtext.subword_tokenize.HashedVocabulary(vocab_file)
+    tokens, masks, metadata = plc.nvtext.subword_tokenize.subword_tokenize(
+        plc.interop.from_arrow(column_input),
+        vocab,
+        max_sequence_length,
+        stride,
+        do_lower_case,
+        do_truncate,
+    )
+    expected_tokens = pa.array(
+        [100] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
+    )
+    expected_masks = pa.array(
+        [1] * 4 + [0] * (max_sequence_length - 4), type=pa.uint32()
+    )
+    expected_metadata = pa.array([0, 0, 3], type=pa.uint32())
+
+    assert_column_eq(tokens, expected_tokens)
+    assert_column_eq(masks, expected_masks)
+    assert_column_eq(metadata, expected_metadata)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
new file mode 100644
index 00000000000..f1b4a5637e1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_tokenize.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    return pa.array(["a", "b c", "d.e:f;"])  # space, ".", ":" and ";" appear
+
+
+@pytest.mark.parametrize(
+    "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))]
+)
+def test_tokenize_scalar(input_col, delimiter):
+    result = plc.nvtext.tokenize.tokenize_scalar(
+        plc.interop.from_arrow(input_col), delimiter
+    )
+    if delimiter is None:
+        expected = pa.array(["a", "b", "c", "d.e:f;"])
+    else:
+        expected = pa.array(["a", "b c", "d", "e:f;"])
+    assert_column_eq(result, expected)
+
+
+def test_tokenize_column(input_col):
+    delimiters = pa.array([" ", ".", ":", ";"])
+    result = plc.nvtext.tokenize.tokenize_column(
+        plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters)
+    )
+    expected = pa.array(["a", "b", "c", "d", "e", "f"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))]
+)
+def test_count_tokens_scalar(input_col, delimiter):
+    result = plc.nvtext.tokenize.count_tokens_scalar(
+        plc.interop.from_arrow(input_col), delimiter
+    )
+    if delimiter is None:
+        expected = pa.array([1, 2, 1], type=pa.int32())
+    else:
+        expected = pa.array([1, 1, 2], type=pa.int32())
+    assert_column_eq(result, expected)
+
+
+def test_count_tokens_column(input_col):
+    delimiters = pa.array([" ", ".", ":", ";"])
+    result = plc.nvtext.tokenize.count_tokens_column(
+        plc.interop.from_arrow(input_col), plc.interop.from_arrow(delimiters)
+    )
+    expected = pa.array([1, 2, 3], type=pa.int32())
+    assert_column_eq(result, expected)
+
+
+def test_character_tokenize(input_col):
+    result = plc.nvtext.tokenize.character_tokenize(
+        plc.interop.from_arrow(input_col)
+    )
+    expected = pa.array(["a", "b", " ", "c", "d", ".", "e", ":", "f", ";"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "delimiter", [None, plc.interop.from_arrow(pa.scalar("."))]
+)
+def test_detokenize(input_col, delimiter):
+    row_indices = pa.array([0, 0, 1])
+    result = plc.nvtext.tokenize.detokenize(
+        plc.interop.from_arrow(input_col), plc.interop.from_arrow(row_indices)
+    )
+    expected = pa.array(["a b c", "d.e:f;"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("default_id", [-1, 0])
+def test_tokenize_with_vocabulary(input_col, default_id):
+    result = plc.nvtext.tokenize.tokenize_with_vocabulary(
+        plc.interop.from_arrow(input_col),
+        plc.nvtext.tokenize.TokenizeVocabulary(
+            plc.interop.from_arrow(input_col)
+        ),
+        plc.interop.from_arrow(pa.scalar(" ")),
+        default_id,
+    )
+    pa_result = plc.interop.to_arrow(result)
+    if default_id == -1:
+        expected = pa.array([[0], [-1, -1], [2]], type=pa_result.type)
+    else:
+        expected = pa.array([[0], [0, 0], [2]], type=pa_result.type)
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
index 444d0089d2c..c55e54cebc6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py
+++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def partitioning_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
index bac56691306..e4a24fb1c98 100644
--- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py
+++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
+import pylibcudf as plc
+
 # Map pylibcudf interpolation options to pyarrow options
 interp_mapping = {
     plc.types.Interpolation.LINEAR: "linear",
diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
index 777315df538..52598f2c462 100644
--- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py
+++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("pat", ["(", "*", "\\"])
 def test_regex_program_invalid(pat):
diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py
index 01115bc363a..ef23e23766a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_reshape.py
+++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def reshape_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py
index 0b30316b9a0..2526580bc13 100644
--- a/python/pylibcudf/pylibcudf/tests/test_round.py
+++ b/python/pylibcudf/pylibcudf/tests/test_round.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(params=["float32", "float64"])
 def column(request, has_nulls):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
index a1820def0b1..f461657281a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture()
 def str_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
index 176ccc55b96..3e31c75c38a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def str_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py
index 233cc253b14..08ac371fd96 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_case.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def string_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
index bcd030c019e..06b44210d74 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
@@ -2,9 +2,10 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_all_characters_of_type():
     pa_array = pa.array(["1", "A"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
index 4a7007a0d6b..eea3ac68e84 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_concatenate_scalar_seperator():
     plc_table = plc.interop.from_arrow(
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4e4dd7cbb00..ba9a4a7d3b8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def target_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
index 69f7a0fdd33..3f3f452c4f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(
     scope="module",
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
index 117c59ff1b8..b391d2b290e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_booleans():
     pa_array = pa.array(["true", None, "True"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
index f3e84286a36..c9368d858a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
@@ -3,10 +3,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def fmt():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
index 6d704309bfd..2d3578e4e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
@@ -3,10 +3,11 @@
 from datetime import datetime, timedelta
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(
     params=[
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
index b1c4d729604..012e722038e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
@@ -2,9 +2,10 @@
 import decimal
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_fixed_point():
     typ = pa.decimal128(38, 2)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
index e9918fab559..8ee2b5075af 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_floats():
     typ = pa.float32()
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
index 6d1d565af30..01192c2d1f8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_integers():
     typ = pa.int8()
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
index 4dc3e512624..b533809f106 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_ipv4_to_integers():
     arr = pa.array(["123.45.67.890", None])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
index 8591732b39e..737036a4f0f 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("na_rep", [None, pa.scalar("")])
 @pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
index fee8c3fb8f6..528736798c7 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
@@ -2,9 +2,10 @@
 import urllib
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_url_encode():
     data = ["/home/nfs", None]
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
index 788b86423c4..e70edf4fb33 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
@@ -2,6 +2,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
+
 import pylibcudf as plc
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py
index db3b13a5aae..82ec18832a9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_find.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
index d6b37a388f0..fa9eee3594b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_find_multiple():
     arr = pa.array(["abc", "def"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
index debfad92d00..b73d812c898 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
@@ -2,9 +2,10 @@
 import re
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_findall():
     arr = pa.array(["bunny", "rabbit", "hare", "dog"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py
index 2ba775d17ae..79498132097 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_padding.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py
@@ -2,6 +2,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
+
 import pylibcudf as plc
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
index 18b5d8bf4d0..c06c06be7c6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
@@ -2,9 +2,10 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2])
 def test_repeat_strings(repeats):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
index 5a9c2007b73..2c7d25133de 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
new file mode 100644
index 00000000000..511f826441a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.mark.parametrize("max_replace_count", [-1, 1])
+def test_replace_re_regex_program_scalar(max_replace_count):
+    arr = pa.array(["foo", "fuz", None])
+    pat = "f."
+    repl = "ba"
+    result = plc.strings.replace_re.replace_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pat, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        plc.interop.from_arrow(pa.scalar(repl)),
+        max_replace_count=max_replace_count,
+    )
+    expected = pc.replace_substring_regex(
+        arr,
+        pat,
+        repl,
+        max_replacements=max_replace_count
+        if max_replace_count != -1
+        else None,
+    )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "flags",
+    [
+        plc.strings.regex_flags.RegexFlags.DEFAULT,
+        plc.strings.regex_flags.RegexFlags.DOTALL,
+    ],
+)
+def test_replace_re_list_str_columns(flags):
+    arr = pa.array(["foo", "fuz", None])
+    pats = ["oo", "uz"]
+    repls = ["a", "b"]
+    result = plc.strings.replace_re.replace_re(
+        plc.interop.from_arrow(arr),
+        pats,
+        plc.interop.from_arrow(pa.array(repls)),
+        flags=flags,
+    )
+    expected = arr
+    for pat, repl in zip(pats, repls):
+        expected = pc.replace_substring_regex(
+            expected,
+            pat,
+            repl,
+        )
+    assert_column_eq(result, expected)
+
+
+def test_replace_with_backrefs():
+    arr = pa.array(["Z756", None])
+    result = plc.strings.replace_re.replace_with_backrefs(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            "(\\d)(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        "V\\2\\1",
+    )
+    expected = pa.array(["ZV576", None])
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
index d9ce5591b98..1759f739e31 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def pa_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
index 80cae8d1c6b..4e80f19b814 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
index 2aeffac8209..450b336ce65 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
index 005e5e4a405..5869e5f4920 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 data_strings = [
     "AbC",
     "123abc",
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
index 2ae893e69fb..84fd3354ac6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
index a1c820cd586..00442d866e9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
@@ -2,9 +2,10 @@
 import textwrap
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_wrap():
     width = 12
diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py
index e822d6a97a8..ac39ef4c5c9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_table.py
+++ b/python/pylibcudf/pylibcudf/tests/test_table.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize(
     "arrow_tbl",
diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py
index d5c618f07e4..49802fe64ac 100644
--- a/python/pylibcudf/pylibcudf/tests/test_transform.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transform.py
@@ -3,9 +3,10 @@
 import math
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_nans_to_nulls(has_nans):
     if has_nans:
diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py
index ac11123f680..b0c0bc72ead 100644
--- a/python/pylibcudf/pylibcudf/tests/test_transpose.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from packaging.version import parse
 
+import pylibcudf as plc
+
 
 @pytest.mark.skipif(
     parse(pa.__version__) < parse("16.0.0"),
diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd
index b530f433c97..4fb623158f0 100644
--- a/python/pylibcudf/pylibcudf/transform.pxd
+++ b/python/pylibcudf/pylibcudf/transform.pxd
@@ -3,6 +3,7 @@ from libcpp cimport bool
 from pylibcudf.libcudf.types cimport bitmask_type, data_type
 
 from .column cimport Column
+from .expressions cimport Expression
 from .gpumemoryview cimport gpumemoryview
 from .table cimport Table
 from .types cimport DataType
@@ -10,6 +11,8 @@ from .types cimport DataType
 
 cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input)
 
+cpdef Column compute_column(Table input, Expression expr)
+
 cpdef tuple[gpumemoryview, int] bools_to_mask(Column input)
 
 cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit)
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index bce9702752a..e8d95cadb0c 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move, pair
@@ -43,6 +44,32 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
     )
 
 
+cpdef Column compute_column(Table input, Expression expr):
+    """Create a column by evaluating an expression on a table.
+
+    For details, see :cpp:func:`compute_column`.
+
+    Parameters
+    ----------
+    input : Table
+        Table used for expression evaluation.
+    expr : Expression
+        Expression to evaluate.
+
+    Returns
+    -------
+    Column of the evaluated expression
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:  # release the GIL around the C++ call
+        c_result = cpp_transform.compute_column(
+            input.view(), dereference(expr.c_obj.get())
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
 cpdef tuple[gpumemoryview, int] bools_to_mask(Column input):
     """Create a bitmask from a column of boolean elements
 
diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx
index 58c7d97e9bc..a0c31f994a3 100644
--- a/python/pylibcudf/pylibcudf/types.pyx
+++ b/python/pylibcudf/pylibcudf/types.pyx
@@ -79,6 +79,16 @@ cpdef size_type size_of(DataType t):
     Only fixed-width types are supported.
 
     For details, see :cpp:func:`size_of`.
+
+    Parameters
+    ----------
+    t : DataType
+        The DataType to get the size of.
+
+    Returns
+    -------
+    int
+        Size in bytes of an element of the specified type.
     """
     with nogil:
         return cpp_size_of(t.c_obj)
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index ea5b3065896..a80c85a1fa8 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -53,48 +53,20 @@ test = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-]
-known_first_party = [
-    "cudf",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"  # inherit the repository-level ruff settings
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]  # E402: import not at top; F401: unused import
 
 [tool.pytest.ini_options]
 # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module